In [1]:
import os
import re
from collections import Counter
import pandas as pd


In [2]:
# Directory (relative to the notebook's working directory) that is passed to
# get_character_counts_from_directory() below; only its *.txt files are read.
RAW_HINDI_DATA_DIR = 'data/raw_hindi_data'


In [3]:
# def read_text_files(directory):
#     text = ""
#     for filename in os.listdir(directory):
#         if filename.endswith(".txt"):
#             with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
#                 text += file.read() + " "
#     return text

# def tokenize(text):
#     # Using regex to find words, which are sequences of word characters (alphanumeric + underscore)
#     words = re.findall(r'\b\w+\b', text)
#     return words

# def get_vocabulary(directory):
#     text = read_text_files(directory)
#     words = tokenize(text)
#     vocabulary = set(words)
#     return vocabulary

# Example usage:
# directory_path = 'data/raw_hindi_data'
# vocabulary = get_vocabulary(directory_path)

# # Print the vocabulary
# print("Vocabulary size:", len(vocabulary))
# print("Vocabulary:", vocabulary)


In [68]:
def read_text_files(directory):
    """Return the concatenated contents of every ``.txt`` file in ``directory``.

    Files are decoded as UTF-8 and joined without any separator. They are read
    in sorted filename order so the result is reproducible across runs and
    platforms (``os.listdir`` returns entries in arbitrary order).

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A single string with the contents of all matching files, or ``""`` when
        the directory contains no ``.txt`` files.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    contents = []
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip non-.txt entries and sub-directories that merely end in ".txt".
        if not filename.endswith(".txt") or not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            contents.append(file.read())
    # Join once instead of growing a string with += inside the loop.
    return "".join(contents)

# Characters removed by filter_text(), grouped by category for readability.
# Several groups overlap; duplicates are harmless because the groups are only
# used to build a deletion table.
# NOTE(review): the left single quotation mark (U+2018) is not listed in any
# group, while U+2019/U+201C/U+201D are -- confirm whether that is intended.
_FILTER_SPECIAL_CHARS = {
    'punctuation': '!"\'()*,./:;?[]«»‹›""„‒–—―',  # Basic punctuation and quotation marks
    'math_logical': '!%&*+-/<=>^|~',  # Mathematical and logical operators
    'currency': '$£€¥¢₹',  # Currency symbols
    'other_symbols': '#&@_¦¬\\‰',  # Miscellaneous symbols (backslash written as a valid escape)
    'diacritical': '`ˆç',  # Diacritical marks and grave accent
    'special': '¡¿',  # Inverted punctuation (used in Spanish)
    'invisible': '\u200C\u200D',  # Zero Width Non-Joiner and Zero Width Joiner
    'whitespace': ' \t\n\r\f\v',  # Various whitespace characters
    'additional': '§©®°±µ¶·¹²³†‡',  # Additional symbols, superscript numbers, and special characters
    'brackets': '()[]{}⟨⟩',  # Various types of brackets
    'arrows': '←→↑↓↔↕↖↗↘↙',  # Arrow symbols
    'math_symbols': '∀∂∃∅∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨∩∪∫≅≈≠≡≤≥',  # Advanced mathematical symbols
    'greek_letters': 'αβγδεζηθικλμνξοπρστυφχψω',  # Lowercase Greek letters
    'format_control': '\u200E\u200F\u202A\u202B\u202C\u202D\u202E',  # Bidirectional formatting characters
    'quotes': '\'\"\'\"’“”',  # Various types of quotation marks
}

# ASCII letters and digits are removed as well (equivalent to [a-zA-Z0-9]).
_FILTER_ASCII_ALNUM = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
)

# Built once: maps every unwanted character to None so str.translate drops it.
_FILTER_DELETE_TABLE = str.maketrans(
    '', '', _FILTER_ASCII_ALNUM + ''.join(_FILTER_SPECIAL_CHARS.values())
)


def filter_text(text):
    """Strip ASCII alphanumerics, whitespace and listed symbols from ``text``.

    Every character that is an ASCII letter, an ASCII digit, or a member of one
    of the ``_FILTER_SPECIAL_CHARS`` groups is deleted; all other characters
    (for example Devanagari letters, Devanagari digits and the danda) are kept
    in their original order.

    Args:
        text: The string to clean.

    Returns:
        A new string with the unwanted characters removed. An empty input
        yields an empty string.
    """
    # One pass over the text; newlines are covered by the whitespace group.
    return text.translate(_FILTER_DELETE_TABLE)

def count_characters(text):
    """Tally how many times each character occurs in ``text``.

    Returns a ``collections.Counter`` keyed by single characters.
    """
    tally = Counter()
    tally.update(text)
    return tally

def get_character_counts_from_directory(directory):
    """Read, filter and count the characters of the ``.txt`` files in ``directory``.

    Pipeline: ``read_text_files`` -> ``filter_text`` -> ``count_characters``.
    Returns whatever ``count_characters`` produces for the filtered text.
    """
    return count_characters(filter_text(read_text_files(directory)))

# Example usage:
# Build the per-character frequency table from the .txt files in the raw data directory.
character_counts = get_character_counts_from_directory(RAW_HINDI_DATA_DIR)
# len() of the result is the number of distinct characters that survived
# filtering, not the total number of character occurrences.
print("Character counts:", len(character_counts))


Character counts: 77


The Unicode range for the Devanagari script, used for writing Hindi, spans from U+0900 to U+097F. Here is a breakdown of the range:

- U+0900 to U+0903: Various signs: candrabindu, anusvara, visarga (e.g., ँ, ं, ः)
- U+0904 to U+0939: Independent vowels and consonants (e.g., अ, आ, इ, ई, क, ख, ग)
- U+093A to U+094F: Dependent vowel signs (matras) plus the nukta, avagraha and virama (e.g., ़, ऽ, ा, ि, ्)
- U+0950 to U+0954: Om and Vedic accent marks (e.g., ॐ, ॑, ॒)
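- U+0955 to U+0963: Additional dependent vowel signs (U+0955–U+0957), additional consonants with nukta (U+0958–U+095F), and vocalic vowels with their signs (U+0960–U+0963, e.g., ॠ)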
- U+0964 to U+096F: Punctuation marks and numerals (e.g., ।, ॥, ०, १, २)
- U+0970 to U+097F: Abbreviation sign (॰) and additional letters for other languages written in Devanagari (e.g., Kashmiri, Sindhi, Marwari)

The Devanagari Extended block spans from U+A8E0 to U+A8FF, adding more characters for specific linguistic purposes.

In [39]:
# Keep only the counted characters whose code point lies in the Devanagari
# block (U+0900..U+097F, bounds inclusive).
start, end = '\u0900', '\u097F'
devanagari_character_counts = {
    ch: n
    for ch, n in character_counts.items()
    if not (ch < start or ch > end)
}
print("Devanagari character counts:", len(devanagari_character_counts))


Devanagari character counts: 77


In [51]:
# Enumerate every code point of the Devanagari block (U+0900..U+097F inclusive),
# whether or not it appears in the data.
start, end = '\u0900', '\u097F'
devanagari_characters = list(map(chr, range(ord(start), ord(end) + 1)))
print("Total Devanagari characters:", len(devanagari_characters))


Total Devanagari characters: 128


In [53]:
# Keep only the counted characters whose code point lies in the Devanagari
# Extended block (U+A8E0..U+A8FF, bounds inclusive).
start, end = '\uA8E0', '\uA8FF'
devanagari_extended_character_counts = {
    ch: n
    for ch, n in character_counts.items()
    if not (ch < start or ch > end)
}
print("Devanagari Extended character counts:", len(devanagari_extended_character_counts))


Devanagari Extended character counts: 0


In [50]:
# Enumerate every code point of the Devanagari Extended block
# (U+A8E0..U+A8FF inclusive), whether or not it appears in the data.
start, end = '\uA8E0', '\uA8FF'
extended_devanagari_characters = list(map(chr, range(ord(start), ord(end) + 1)))
print("Total Extended Devanagari characters:", len(extended_devanagari_characters))


Total Extended Devanagari characters: 32


In [69]:
# Convert the character counts to a pandas DataFrame with one row per distinct
# character, ordered from most to least frequent.
# columns=['Character', 'Unicode', 'Is Devanagari', 'Count']
#
# 'Is Devanagari' is computed directly from the code point (Devanagari block,
# U+0900..U+097F inclusive) instead of looking the character up in
# `devanagari_character_counts`. That dict is built in a different cell, so it
# can be stale or missing when `character_counts` is recomputed or the cells
# are run out of order.
character_counts_df = (
    pd.DataFrame(
        [
            (char, ord(char), 0x0900 <= ord(char) <= 0x097F, count)
            for char, count in character_counts.items()
        ],
        columns=['Character', 'Unicode', 'Is Devanagari', 'Count'],
    )
    .sort_values(by='Count', ascending=False)
    .reset_index(drop=True)
)
character_counts_df


Unnamed: 0,Character,Unicode,Is Devanagari,Count
0,्,2381,True,1067132
1,ा,2366,True,864738
2,त,2340,True,668939
3,र,2352,True,590762
4,क,2325,True,507052
...,...,...,...,...
72,ऋ,2315,True,2586
73,ऊ,2314,True,1344
74,ॉ,2377,True,979
75,ॐ,2384,True,866


In [70]:
# Rows for characters outside the Devanagari block, ordered by code point.
# `~` negates the boolean column; this replaces the non-idiomatic `== False`.
character_counts_df.loc[~character_counts_df['Is Devanagari']].sort_values(by='Unicode')


Unnamed: 0,Character,Unicode,Is Devanagari,Count


In [72]:
# vocab given by mayank
# Devanagari -> Latin transliteration table, grouped as: independent vowels,
# consonants, conjuncts / dotted consonants, dependent vowel signs (matras),
# and finally anusvara, visarga and virama.
# NOTE(review): keys such as 'क्ष', 'त्र' and 'ज्ञ' are sequences of several code
# points, so they can never match when this dict is applied to the
# single-character 'Character' column via Series.map. The 'ड़' and 'ढ़' keys may
# also be base + nukta sequences -- verify their encoding.
hindi_to_english = {
    'अ': 'a', 'आ': 'aa', 'इ': 'i', 'ई': 'ee', 'उ': 'u', 'ऊ': 'oo',
    'ए': 'e', 'ऐ': 'ai', 'ओ': 'o', 'औ': 'au',
    'क': 'k', 'ख': 'kh', 'ग': 'g', 'घ': 'gh', 'ङ': 'ng',
    'च': 'ch', 'छ': 'chh', 'ज': 'j', 'झ': 'jh', 'ञ': 'n',
    'ट': 't', 'ठ': 'th', 'ड': 'd', 'ढ': 'dh', 'ण': 'n',
    'त': 't', 'थ': 'th', 'द': 'd', 'ध': 'dh', 'न': 'n',
    'प': 'p', 'फ': 'ph', 'ब': 'b', 'भ': 'bh', 'म': 'm',
    'य': 'y', 'र': 'r', 'ल': 'l', 'व': 'v',
    'श': 'sh', 'ष': 'sh', 'स': 's', 'ह': 'h',
    'क्ष': 'ksh', 'त्र': 'tr', 'ज्ञ': 'gy',
    'ड़': 'r', 'ढ़': 'rh',
    'ा': 'a', 'ि': 'i', 'ी': 'ee', 'ु': 'u', 'ू': 'oo',
    'े': 'e', 'ै': 'ai', 'ो': 'o', 'ौ': 'au',
    # The virama (halant) maps to the empty string.
    'ं': 'n', 'ः': 'h', '्': '',
}


In [82]:
# Attach the transliteration of each character; characters that have no entry
# in `hindi_to_english` get NaN.
character_counts_df['English'] = character_counts_df['Character'].map(hindi_to_english)
# List the characters that are still missing a transliteration.
character_counts_df.loc[character_counts_df['English'].isna(), 'Character'].tolist()


['।',
 '१',
 'ृ',
 '॥',
 '०',
 '२',
 '५',
 '४',
 '८',
 '३',
 'ऽ',
 '६',
 '७',
 '९',
 'ँ',
 '॑',
 '॒',
 '़',
 'ऋ',
 'ॉ',
 'ॐ',
 'ऑ']

In [71]:
# len() on a str counts Unicode code points, so the vowel sign, the anusvara
# and the danda are each counted separately (this cell prints 5).
print(len("करें।"))


5
