In [None]:
from datasets import load_dataset

# Open the "mar_Deva" split of IndicCorp v2 in streaming mode and show
# the resulting dataset object.
dataset = load_dataset(
    "ai4bharat/IndicCorpV2",
    split="mar_Deva",
    streaming=True,
)
display(dataset)

In [None]:
import os
import re
from collections import Counter
from itertools import islice

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Sample values: print the text of the first five examples.
# islice stops after five items, so no more of the streaming dataset is
# consumed than is needed for the preview.
for example in islice(dataset, 5):
    print(example['text'])

In [None]:
# Sentence tokenizer
def sentence_tokenizer(text):
    """Split ``text`` into a list of sentences.

    Leading and trailing whitespace is stripped first. A sentence boundary
    is any run of whitespace that immediately follows one of the characters
    ``।``, ``!``, ``?`` or ``|``; the terminating character stays attached
    to the sentence it ends and the whitespace run is discarded.
    """
    boundary = re.compile(r'(?<=[।!?|])\s+')
    trimmed = text.strip()
    return boundary.split(trimmed)

# Word tokenizer
# Compiled once at definition time. Alternatives are tried left to right, so
# the more specific shapes (URL, email, date, decimal) must stay ahead of the
# generic number/word branches. All groups are non-capturing so that
# findall() returns plain strings.
_WORD_TOKEN_RE = re.compile(r'''(?x)
    https?://[^\s]+                                     # URLs
  | \w+(?:[.+-]\w+)*@\w+(?:-\w+)*(?:\.\w+(?:-\w+)*)+    # Emails
  | \d{1,2}[-/]\d{1,2}[-/]\d{2,4}                       # Dates
  | \d+\.\d+                                            # Decimals
  | \d+                                                 # Whole numbers
  | [\u0900-\u0963\u0966-\u097F]+                       # Devanagari words, dandas excluded
  | [a-zA-Z]+                                           # English words
  | [।॥.,!?;:"'\-—()]                                   # Punctuation
''')


def word_tokenizer(sentence):
    """Split ``sentence`` into a flat list of token strings.

    Recognised token types, in priority order: URLs, e-mail addresses,
    numeric dates (``dd/mm/yyyy`` or ``dd-mm-yyyy`` shapes), decimals, whole
    numbers, Devanagari words, ASCII-letter words and single punctuation
    marks. Characters that match none of these (whitespace, ``|``, other
    symbols) are dropped.

    The Devanagari word class deliberately skips U+0964 (danda) and U+0965
    (double danda). Both lie inside the Devanagari block, so a plain
    ``\\u0900-\\u097F`` range would glue them onto the preceding word
    instead of emitting them as punctuation.

    Args:
        sentence: Text to tokenize.

    Returns:
        list[str]: Tokens in order of appearance; empty if nothing matched.
    """
    return _WORD_TOKEN_RE.findall(sentence)



In [None]:
# This is not used by the pipeline (kept only for understanding).
# It has its own name so that running this cell does not replace the
# word_tokenizer defined earlier.
def word_tokenizer_naive(sentence):
    """Tokenize with a generic ``\\w+`` word rule (reference only).

    Returns a list of matched strings: URLs, e-mails, dates, decimals, whole
    numbers, ``\\w+`` runs, and single non-word, non-space characters.
    """
    pattern = r'''(?x)
        (?:https?://[^\s]+)                   # URLs
        | (?:\w+@\w+\.\w+)                    # Emails
        | (?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4})   # Dates
        | (?:\d+\.\d+)                        # Decimals
        | (?:\d+)                             # Whole numbers
        | (?:\w+|[^\w\s])                     # Words and punctuation
    '''
    return re.findall(pattern, sentence)
# Why this does not tokenize Marathi (Devanagari) words properly:
# for str patterns \w matches Devanagari letters, but it does NOT match the
# combining vowel signs (matras) or the virama, because they are not
# alphanumeric characters. Every matra therefore ends the current \w+ run
# and is emitted on its own by [^\w\s], which splits valid Marathi words
# into letters and matras. An explicit Devanagari range avoids this.

In [None]:
# Demo: the text is split wherever whitespace follows a sentence-ending
# punctuation mark (।, !, ?, or |).
sample_text = 'मजकुराचा पैलू  मागील कॅमेरा ?, ड्युअल टोन!! एल. ई. डी. फ्लॅश आणि फिंगरप्रिंट स्कॅनरची सेटअप वैशिष्ट्ये रेडमी नोट 3 सारखीच आहेत|'
sentence = sentence_tokenizer(sample_text)
print(sentence)

In [None]:
# Tokenize the last sentence from the demo split with the Unicode-aware
# word_tokenizer and print the resulting token list.
words = word_tokenizer(sentence[-1])
print(words)

In [None]:
from itertools import islice

max_examples = 5000  # Upper bound on examples read from the stream
output_csv = "marathi_tokenized_sentences.csv"

tokenized_sentences = []  # one space-joined token string per non-empty sentence
total_words = 0
total_chars = 0
unique_tokens = set()

# The dataset is opened in streaming mode, so islice is used to stop after
# max_examples items instead of iterating the whole stream.
for example in islice(dataset, max_examples):
    text = example["text"]
    sentences = sentence_tokenizer(text)
    # The loop variable is deliberately not called `sentence`: that name holds
    # the demo sentence list used by an earlier cell and must not be clobbered.
    for raw_sentence in sentences:
        tokens = word_tokenizer(raw_sentence)
        if not tokens:
            continue  # fragment produced no tokens; nothing to record
        tokenized_sentences.append(" ".join(tokens))
        total_words += len(tokens)
        total_chars += sum(len(t) for t in tokens)
        unique_tokens.update(tokens)

# Save the tokenized sentences to a CSV file
df = pd.DataFrame({'sentence': tokenized_sentences})
df.to_csv(output_csv, index=False, encoding="utf-8")

# Corpus statistics
num_sentences = len(tokenized_sentences)
token_count = len(unique_tokens)
# Guard every ratio so an empty sample reports 0 instead of raising
# ZeroDivisionError.
avg_sentence_length = total_words / num_sentences if num_sentences else 0.0
avg_word_length = total_chars / total_words if total_words else 0.0
ttr = token_count / total_words if total_words else 0.0
print("Corpus Statistics:")
print(f"Total number of sentences: {num_sentences}")
print(f"Total number of words: {total_words}")
print(f"Unique_tokens:{token_count}")
print(f"Total number of characters: {total_chars}")
print(f"Average sentence length: {avg_sentence_length:.2f}")
print(f"Average word length: {avg_word_length:.2f}")
print(f"Type/Token Ratio (TTR): {ttr:.4f}")


In [None]:
# (rows, columns) of the tokenized-sentence DataFrame
df.shape

In [None]:
# Read the saved CSV back in and preview its first rows
df1 = pd.read_csv("marathi_tokenized_sentences.csv")
df1.head()

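 ✅ Opening the CSV in Excel

The CSV is saved as UTF-8. If Excel opens it with a different encoding, the Marathi text appears garbled, so import it with the encoding set explicitly: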

Don't double-click to open!

Instead:

Open Excel.

Go to File > Open > Browse.

In the Open dialog, select "All Files" and choose your CSV.

The Text Import Wizard should open (if Excel loads the file directly instead, use Data > From Text/CSV):

Select "65001: Unicode (UTF-8)" as the file origin (encoding).

Use comma (,) as delimiter.

Finish.

This will render Marathi text correctly.

In [None]:
# Needs the pyarrow package (install it first if missing: pip install pyarrow).
# Write df, the DataFrame of tokenized sentences, to a Snappy-compressed
# Parquet file without the index.
df.to_parquet(
    "marathi_tokenized_sentences.parquet",
    engine="pyarrow",
    compression="snappy",
    index=False,
)

In [None]:
# Split each space-joined sentence back into tokens once and derive the
# per-word and per-sentence measures from that single pass.
sentence_tokens = [sentence_text.split() for sentence_text in tokenized_sentences]
all_words = [word for tokens in sentence_tokens for word in tokens]
word_lengths = [len(word) for word in all_words]
sentence_lengths = [len(tokens) for tokens in sentence_tokens]
word_freq = Counter(all_words).most_common(20)

# Plot 1: Word Length Distribution
# Bin edges run from 1 to max + 1 so that every integer length k gets its own
# bin [k, k + 1). Stopping the edges at max would make the last bin
# [max - 1, max], closed on both sides, and merge the two longest lengths.
word_length_bins = range(1, max(word_lengths) + 2)
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(word_lengths, bins=word_length_bins, kde=False, ax=ax)
ax.set_title("Distribution of Word Lengths")
ax.set_xlabel("Word Length")
ax.set_ylabel("Frequency")
fig.tight_layout()
plt.show()

In [None]:
# Plot 2: Sentence Length Distribution
# Bin edges run from 1 to max + 1 so that every integer length k gets its own
# bin [k, k + 1). Stopping the edges at max would make the last bin
# [max - 1, max], closed on both sides, and merge the two longest lengths.
sentence_length_bins = range(1, max(sentence_lengths) + 2)
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(sentence_lengths, bins=sentence_length_bins, kde=False, ax=ax)
ax.set_title("Distribution of Sentence Lengths")
ax.set_xlabel("Number of Words per Sentence")
ax.set_ylabel("Frequency")
fig.tight_layout()
plt.show()