In [1]:
from datasets import load_dataset
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("ai4bharat/IndicCorpV2", "indiccorp_v2", split="guj_Gujr", streaming=True)

In [3]:
def sentence_tokenizer(paragraph):
    paragraph = re.sub(r'\s+', ' ', paragraph.strip())
    sentences = re.split(r'(?<=[.!?])\s+', paragraph)
    return [sentence.strip() for sentence in sentences if sentence.strip()]

In [4]:
def word_tokenizer(sentence):
    patterns = [
        r'https?://\S+',
        r'\b[\w\.-]+?@\w+\.\w+\b',
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
        r'\d+\.\d+|\d+',
        r'[^\w\s\u0A80-\u0AFF]',
        r'[\u0A80-\u0AFF]+',
        r'[A-Za-z]+'
    ]
    return re.compile('|'.join(patterns)).findall(sentence)

In [5]:
no_of_sentences = 0
with open("Q1_tokenized_sentences.txt", "w", encoding="utf-8") as f:
    for i, data in enumerate(dataset):
        sentences = sentence_tokenizer(data['text'])
        for sentence in sentences:
            f.write(sentence + '\n')
            no_of_sentences += 1
        if i >= 1000:
            break
print("Tokenized sentences stored in 'Q1_tokenized_sentences.txt'")

Tokenized sentences stored in 'Q1_tokenized_sentences.txt'


In [6]:
no_of_words = []
no_of_chars = []
with open("Q1_tokenized_sentences.txt", "r", encoding="utf-8") as f:
    sentences = f.readlines()
with open("Q1_tokenized_sentences_from_words.txt", "w", encoding="utf-8") as f:
    for sentence in sentences:
        words = word_tokenizer(sentence)
        for word in words:
            f.write(word + ' ')
            no_of_chars.append(len(word))
        f.write('\n')
        no_of_words.append(len(words))
print("Tokenized sentences from words stored in 'Q1_tokenized_sentences_from_words.txt'")

Tokenized sentences from words stored in 'Q1_tokenized_sentences_from_words.txt'


In [7]:
print("Corpus Statistics:-")
print(f"Total number of sentences: {no_of_sentences}")
print(f"Total number of words: {sum(no_of_words)}")
print(f"Total number of characters: {sum(no_of_chars)}")
print(f"Average sentence length: {sum(no_of_words) / no_of_sentences if no_of_sentences > 0 else 0}")
print(f"Average word length: {sum(no_of_chars) / sum(no_of_words) if sum(no_of_words) > 0 else 0}")
print(f"Type/Token Ratio (TTR): {len(set(no_of_words)) / sum(no_of_words) if sum(no_of_words) > 0 else 0}")

Corpus Statistics:-
Total number of sentences: 1447
Total number of words: 25130
Total number of characters: 110673
Average sentence length: 17.36696613683483
Average word length: 4.404019100676482
Type/Token Ratio (TTR): 0.0029048945483485873
