# Corpus Analysis of an Indian Language

In [30]:
import re
from collections import Counter
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import nltk
from nltk.probability import FreqDist

In [31]:
def normalize_text(text, lang='hi'):
    """Return *text* after running it through an Indic NLP normalizer.

    Args:
        text: The raw string to normalise.
        lang: Language code handed to ``IndicNormalizerFactory.get_normalizer``;
            defaults to ``'hi'``.

    Returns:
        Whatever the selected normalizer's ``normalize`` method returns for
        *text*.
    """
    # A fresh factory is created on every call, as before.
    return IndicNormalizerFactory().get_normalizer(lang).normalize(text)

In [32]:

def type_token_analysis(text):
    """Compute the type-token ratio of *text*.

    The text is tokenised with ``indic_tokenize.trivial_tokenize``; the
    "types" are the distinct tokens.

    Args:
        text: The string to analyse.

    Returns:
        A tuple ``(type_token_ratio, tokens, types)`` where ``tokens`` is the
        tokeniser output, ``types`` is the set of distinct tokens and
        ``type_token_ratio`` is ``len(types) / len(tokens)``. When the text
        yields no tokens the ratio is ``0.0``.
    """
    tokens = indic_tokenize.trivial_tokenize(text)
    types = set(tokens)
    # An empty or whitespace-only corpus produces no tokens; report a ratio
    # of 0.0 rather than failing with ZeroDivisionError.
    type_token_ratio = len(types) / len(tokens) if tokens else 0.0
    return type_token_ratio, tokens, types

In [33]:
def syllable_analysis(text):
    """Rank the Devanagari consonant runs of *text* overall and by position.

    Every token from ``indic_tokenize.trivial_tokenize`` is split into
    maximal runs of characters in the range ``क``-``ह``. The code treats each
    run as one "syllable"; characters outside that range are not part of any
    run.

    NOTE: a later cell in this file defines ``syllable_analysis`` again, and
    that later definition is the one in effect when the cells run in order.

    Args:
        text: The string to analyse.

    Returns:
        A 4-tuple of ``most_common(10)`` lists of ``(run, count)`` pairs:
        all runs, the first run of each word, the runs strictly between the
        first and last run of a word, and the last run of each word. A word
        with a single run is counted as initial only.
    """
    # One list of runs per token, in corpus order.
    runs_per_word = [
        re.findall(r'[क-ह]+', token)
        for token in indic_tokenize.trivial_tokenize(text)
    ]

    overall = FreqDist(run for runs in runs_per_word for run in runs)

    starts, middles, ends = Counter(), Counter(), Counter()
    for runs in runs_per_word:
        if not runs:
            continue
        starts[runs[0]] += 1
        if len(runs) > 1:
            middles.update(runs[1:-1])
            ends[runs[-1]] += 1

    return (
        overall.most_common(10),
        starts.most_common(10),
        middles.most_common(10),
        ends.most_common(10),
    )

# Analysis of the Hindi Language Using NLTK

In [34]:
def syllable_analysis(text, top_n=10):
    """Rank the "syllables" of *text* overall and by position in the word.

    The text is tokenised with ``indic_tokenize.trivial_tokenize`` and each
    token is split with the pattern ``[क-ह]+``, i.e. maximal runs of
    characters between U+0915 and U+0939. Vowel signs and other marks fall
    outside that range, so the units counted here are consonant runs rather
    than complete syllables.

    Args:
        text: The string to analyse.
        top_n: How many entries to keep in each ranking. Defaults to 10,
            the value that was previously hard-coded.

    Returns:
        A 4-tuple of lists of ``(unit, count)`` pairs, each produced by
        ``most_common(top_n)``:

        1. all units in the text,
        2. the first unit of every word,
        3. the units strictly between the first and last unit of a word,
        4. the last unit of every word that has more than one unit.

        A word with exactly one unit contributes to the initial ranking only.
    """
    # Compile once; the pattern is applied to every token.
    unit_pattern = re.compile(r'[क-ह]+')

    all_syllables = FreqDist()
    initial_syllables = Counter()
    medial_syllables = Counter()
    final_syllables = Counter()

    # Single pass: each token is scanned once and feeds all four tallies.
    for word in indic_tokenize.trivial_tokenize(text):
        word_syllables = unit_pattern.findall(word)
        if not word_syllables:
            continue
        all_syllables.update(word_syllables)
        initial_syllables[word_syllables[0]] += 1
        if len(word_syllables) > 1:
            # The slice is empty for two-unit words, so nothing is added then.
            medial_syllables.update(word_syllables[1:-1])
            final_syllables[word_syllables[-1]] += 1

    return (
        all_syllables.most_common(top_n),
        initial_syllables.most_common(top_n),
        medial_syllables.most_common(top_n),
        final_syllables.most_common(top_n),
    )

# Output of the Following Analysis


In [39]:
def main():
    """Run the corpus analysis on ``demotext.txt`` and print the results.

    The corpus is loaded with ``load_corpus``, normalised with
    ``normalize_text``, and then passed to ``type_token_analysis`` and
    ``syllable_analysis``. The type-token ratio and the four syllable
    rankings are written to standard output.
    """
    # Load and normalise in one step; the raw text is not needed afterwards.
    text = normalize_text(load_corpus('demotext.txt'))

    # Only the ratio is reported; the token list and type set are unused here.
    type_token_ratio, _tokens, _types = type_token_analysis(text)
    print(f'Type-Token Ratio: {type_token_ratio}')

    (
        most_common_syllables,
        initial_syllables,
        medial_syllables,
        final_syllables,
    ) = syllable_analysis(text)
    print(f'Most Frequent Syllables:\n{most_common_syllables}\n')
    print(f'Most Frequent Initial Syllables:\n{initial_syllables}\n')
    print(f'Most Frequent Medial Syllables:\n{medial_syllables}\n')
    print(f'Most Frequent Final Syllables:\n{final_syllables}\n')


# Run the analysis only when this file is executed directly.
if __name__ == '__main__':
    main()

Type-Token Ratio: 0.1927453027139875
Most Frequent Syllables:
[('ह', 3180), ('क', 2459), ('र', 2406), ('त', 2400), ('न', 1685), ('स', 1581), ('ग', 1315), ('द', 1260), ('म', 1253), ('ज', 984)]

Most Frequent Initial Syllables:
[('ह', 2860), ('क', 1818), ('त', 943), ('स', 886), ('म', 877), ('ज', 779), ('नह', 739), ('द', 603), ('ग', 594), ('न', 554)]

Most Frequent Medial Syllables:
[('र', 568), ('द', 480), ('ग', 398), ('क', 397), ('प', 273), ('त', 258), ('श', 249), ('य', 231), ('न', 230), ('स', 222)]

Most Frequent Final Syllables:
[('र', 1417), ('त', 1199), ('न', 901), ('स', 473), ('य', 428), ('ल', 373), ('ग', 323), ('क', 244), ('म', 233), ('छ', 208)]

