In [None]:
from datasets import load_dataset

# Open the Hindi IndicCorpV2 shard as a stream (streaming=True) so records
# are pulled while iterating instead of downloading the whole file first.
hindi_dataset = load_dataset(
    "text",
    split="train",
    streaming=True,
    data_files="https://huggingface.co/datasets/ai4bharat/IndicCorpV2/resolve/main/data/hi-1.txt",
)

# Preview: print the first five records. Pairing the stream with range(5)
# ends the loop after the fifth record without requesting a sixth one.
for i, item in zip(range(5), hindi_dataset):
    print(item["text"])

लोगों को बिलों संबंधी सुविधा देना ही उनका काम

इनेलो 1987 में उस वक्त ऐसे ही दोराहे पर खड़ी थी, जब पूर्व उपप्रधानमंत्री देवीलाल ने अपने पुत्र ओमप्रकाश चौटाला को अपना राजनीतिक उत्तराधिकारी घोषित किया था। हालांकि तब पार्टी पर देवीलाल की मजबूत पकड़ के चलते पार्टी टूटने से बच गई थी। 1989 में देवीलाल केन्द्र की राजनीति में सक्रिय हो गए थे और उनके उपप्रधानमंत्री बनने के पश्चात् उनके तीन बेट…

In [None]:
import unicodedata

def is_devanagari_char(ch):
    """Return True when the Unicode name of *ch* contains 'DEVANAGARI'.

    Code points that have no Unicode name are reported as non-Devanagari.
    """
    # The empty default stands in for the ValueError raised on unnamed
    # code points, so the membership test simply fails for them.
    return 'DEVANAGARI' in unicodedata.name(ch, '')

def is_digit(ch):
    """Return True when *ch* is in Unicode category 'Nd' (decimal number)."""
    category = unicodedata.category(ch)
    return category == 'Nd'

def is_punctuation(ch):
    """Return True when the Unicode category of *ch* is one of the 'P*' classes."""
    # The first letter of the two-letter category is the major class.
    major_class = unicodedata.category(ch)[0]
    return major_class == 'P'

def is_latin_char(ch):
    """Return True when the Unicode name of *ch* contains 'LATIN'.

    Code points that have no Unicode name are reported as non-Latin.
    """
    # An empty default name replaces the ValueError for unnamed code points.
    char_name = unicodedata.name(ch, '')
    return 'LATIN' in char_name


In [None]:
def tokenize_word(word):
    """Split one whitespace-free chunk into runs of same-type characters.

    Each character is classified as 'digit', 'punct', 'devanagari', 'latin'
    or 'other'; every maximal run of a single type becomes one token.

    Digits and punctuation are tested before the script checks. The Unicode
    names of the danda (U+0964) and of the Devanagari digits contain
    "DEVANAGARI", so testing the script first would classify them as word
    characters and glue them to the neighbouring word.

    Args:
        word: Text to split; expected to contain no whitespace.

    Returns:
        The tokens in their original order; an empty list for an empty string.
    """
    from itertools import groupby  # local import: the cell needs no new global

    def char_type(ch):
        # Category-based checks first, script-name checks second (see above).
        if is_digit(ch):
            return 'digit'
        if is_punctuation(ch):
            return 'punct'
        if is_devanagari_char(ch):
            return 'devanagari'
        if is_latin_char(ch):
            return 'latin'
        return 'other'

    # groupby yields one group per run of consecutive equal keys.
    return ["".join(run) for _, run in groupby(word, key=char_type)]

def sentence_tokenize(text):
    """Split *text* into sentences at terminal punctuation.

    A danda (U+0964) or double danda (U+0965) always ends a sentence.
    '.', '!' and '?' end a sentence only when followed by whitespace or the
    end of the text, so URLs ("www.example.com") and decimals ("3.14") stay
    in one piece for the word tokenizer.

    The dandas are written as escapes so the literals survive a mis-decoded
    source file; a garbled multi-character literal can never equal a single
    character and would silently disable splitting on the danda.

    Args:
        text: Paragraph to split.

    Returns:
        Stripped, non-empty sentences in order, each including its
        terminator; an empty list for empty or whitespace-only input.
    """
    dandas = ('\u0964', '\u0965')
    ascii_terminators = ('.', '!', '?')

    sentences = []
    start = 0
    last = len(text) - 1
    for pos, ch in enumerate(text):
        if ch in dandas:
            at_boundary = True
        elif ch in ascii_terminators:
            at_boundary = pos == last or text[pos + 1].isspace()
        else:
            continue

        if at_boundary:
            piece = text[start:pos + 1].strip()
            if piece:
                sentences.append(piece)
            start = pos + 1

    # Whatever follows the last terminator is a final, unterminated sentence.
    tail = text[start:].strip()
    if tail:
        sentences.append(tail)
    return sentences

def word_tokenize(sentence):
    """Tokenize *sentence* into a flat list of tokens.

    The sentence is split on whitespace. A chunk that starts with a URL
    prefix or contains '@' is kept whole; every other chunk is handed to
    tokenize_word and its pieces are appended in order.
    """
    url_prefixes = ("http://", "https://", "www.")
    collected = []
    for chunk in sentence.split():
        keep_whole = "@" in chunk or chunk.startswith(url_prefixes)
        if keep_whole:
            collected.append(chunk)
        else:
            collected.extend(tokenize_word(chunk))
    return collected


In [None]:
# Demo: tokenize the first MAX_PARAGRAPHS non-empty paragraphs of the stream.
MAX_PARAGRAPHS = 5
count = 0

for item in hindi_dataset:
    paragraph = item["text"].strip()
    if paragraph:
        print(f"\nParagraph {count+1}:\n{paragraph}\n")

        sentences = sentence_tokenize(paragraph)
        for sentence in sentences:
            tokens = word_tokenize(sentence)
            # One call, two lines: the sentence first, then its tokens.
            print(f"Sentence: {sentence}", f"Tokens: {tokens}\n", sep="\n")

        # Only paragraphs that were actually shown count towards the limit.
        count += 1
        if count >= MAX_PARAGRAPHS:
            break



Paragraph 1:
लोगों को बिलों संबंधी सुविधा देना ही उनका काम

Sentence: लोगों को बिलों संबंधी सुविधा देना ही उनका काम
Tokens: ['लोगों', 'को', 'बिलों', 'संबंधी', 'सुविधा', 'देना', 'ही', 'उनका', 'काम']


Paragraph 2:
इनेलो 1987 में उस वक्त ऐसे ही दोराहे पर खड़ी थी, जब पूर्व उपप्रधानमंत्री देवीलाल ने अपने पुत्र ओमप्रकाश चौटाला को अपना राजनीतिक उत्तराधिकारी घोषित किया था। हालांकि तब पार्टी पर देवीलाल की मजबूत पकड़ के चलते पार्टी टू…

Tokenization using regular expressions (the `re` module)

In [None]:
import re
import unicodedata
from datasets import load_dataset

# Open the Hindi shard as a stream; the loop at the end of this cell decides
# how many records are actually read.
hindi_dataset = load_dataset(
    "text",
    split="train",
    streaming=True,
    data_files="https://huggingface.co/datasets/ai4bharat/IndicCorpV2/resolve/main/data/hi-1.txt",
)

# Devanagari block membership test (U+0900 through U+097F)
def is_hindi_char(char):
    """Return True when *char* compares within the Devanagari block bounds."""
    below_block = char < '\u0900'
    above_block = char > '\u097F'
    return not (below_block or above_block)

# Sentence tokenizer (split after a danda, double danda, '!' or '?')
def sentence_tokenize(text):
    """Split *text* on whitespace that follows a sentence terminator.

    Terminators are the danda (U+0964), the double danda (U+0965), '!' and
    '?'. The dandas are written as escapes: inside a character class a
    mis-decoded danda turns into unrelated letters and the class would then
    split after those letters instead.

    Args:
        text: Paragraph to split; surrounding whitespace is ignored.

    Returns:
        The sentences in order, each keeping its terminator; an empty list
        for empty or whitespace-only input.
    """
    parts = re.split(r'(?<=[\u0964\u0965!?])\s+', text.strip())
    # re.split returns [''] for an empty string; drop that placeholder.
    return [part for part in parts if part]

# Word tokenizer using regex to handle URLs, emails, numbers, punctuations
def word_tokenize(sentence):
    """Tokenize *sentence* with a single prioritized regular expression.

    Alternatives are tried in this order at each position: URLs, e-mail
    addresses, numbers, Devanagari words, any other run of word characters
    (for example Latin words), and finally single punctuation characters.

    The Devanagari word class leaves out the danda (U+0964), double danda
    (U+0965) and abbreviation sign (U+0970) so they come out as separate
    punctuation tokens rather than being attached to the preceding word.
    The generic word alternative keeps non-Devanagari words from being
    dropped.

    Args:
        sentence: Text to tokenize.

    Returns:
        The tokens in order of appearance; whitespace is never a token.
    """
    pattern = r'''
        https?://\S+ | www\.\S+                        # URLs
      | \w+@\w+\.\w+                                   # emails
      | \d+\.\d+ | \d+                                 # numbers (decimals, integers)
      | [\u0900-\u0963\u0966-\u096F\u0971-\u097F]+     # Devanagari words
      | \w+                                            # other words (e.g. Latin)
      | [^\s\w]                                        # punctuation / symbols
    '''
    # Without capture groups each match is the token itself.
    return [match.group() for match in re.finditer(pattern, sentence, re.VERBOSE)]

# Stream and process the first 5 lines.
# The limit is enforced by pairing the stream with range(5) rather than by a
# trailing `if i == 4: break`: the `continue` statements below would skip
# such a check, and the loop could then run through the whole corpus.
for i, item in zip(range(5), hindi_dataset):
    paragraph = item["text"].strip()
    if not paragraph:
        continue

    print(f"\n Paragraph :")
    print(paragraph)

    # Unicode block check (optional): skip text without any Devanagari.
    if not any(is_hindi_char(char) for char in paragraph):
        print("  Not detected as Hindi (Devanagari)")
        continue

    # Sentence tokenization, then word tokenization per sentence
    sentences = sentence_tokenize(paragraph)
    for sent_num, sentence in enumerate(sentences, 1):
        print(f"\n  Sentence {sent_num}: {sentence}")
        tokens = word_tokenize(sentence)
        print(f"      Tokens: {tokens}")



 Paragraph :
लोगों को बिलों संबंधी सुविधा देना ही उनका काम

  Sentence 1: लोगों को बिलों संबंधी सुविधा देना ही उनका काम
      Tokens: ['लोगों', 'को', 'बिलों', 'संबंधी', 'सुविधा', 'देना', 'ही', 'उनका', 'काम']

 Paragraph :
इनेलो 1987 में उस वक्त ऐसे ही दोराहे पर खड़ी थी, जब पूर्व उपप्रधानमंत्री देवीलाल ने अपने पुत्र ओमप्रकाश चौटाला को अपना राजनीतिक उत्तराधिकारी घोषित किया था। हालांकि तब पार्टी पर देवीलाल की मजबूत पकड़ के चलते पार्टी

In [3]:
import json
import os
import pickle
import re
import unicodedata
from collections import Counter
from typing import Any, Dict, List, Optional

from datasets import load_dataset

# Open the Hindi shard as a stream (streaming=True); records are read while
# iterating rather than downloaded up front.
hindi_dataset = load_dataset(
    "text",
    split="train",
    streaming=True,
    data_files="https://huggingface.co/datasets/ai4bharat/IndicCorpV2/resolve/main/data/hi-1.txt",
)

def sentence_split(paragraph):
    """Split *paragraph* into sentences at terminal punctuation.

    A danda (U+0964) or double danda (U+0965) always ends a sentence.
    '.', '?' and '!' end a sentence only when followed by whitespace or the
    end of the text; otherwise URLs, e-mail addresses and decimals would be
    cut apart here and the matching alternatives in word_tokenize could
    never apply.

    The dandas are written as escapes so the literals survive a mis-decoded
    source file; a garbled multi-character literal never equals a single
    character and would silently disable splitting on the danda.

    Args:
        paragraph: Text to split.

    Returns:
        Stripped, non-empty sentences in order, each including its
        terminator; an empty list for empty or whitespace-only input.
    """
    dandas = ('\u0964', '\u0965')
    ascii_endings = ('.', '?', '!')

    sentences = []
    start = 0
    last = len(paragraph) - 1
    for pos, char in enumerate(paragraph):
        if char in dandas:
            ends_sentence = True
        elif char in ascii_endings:
            ends_sentence = pos == last or paragraph[pos + 1].isspace()
        else:
            continue

        if ends_sentence:
            current = paragraph[start:pos + 1].strip()
            if current:
                sentences.append(current)
            start = pos + 1

    # Text after the last terminator forms a final, unterminated sentence.
    remainder = paragraph[start:].strip()
    if remainder:
        sentences.append(remainder)
    return sentences

# Compiled once; the alternatives are tried left to right at each position.
_TOKEN_PATTERN = re.compile(
    r'''
        https?://[^\s]+                                # URLs
      | www\.[^\s]+                                    # www URLs
      | \w+@\w+\.\w+                                   # Emails
      | \d{1,2}/\d{1,2}/\d{2,4}                        # Dates
      | \d+\.\d+                                       # Decimal numbers
      | [\u0900-\u0963\u0966-\u096F\u0971-\u097F]+     # Hindi (Devanagari) words
      | [a-zA-Z0-9_-]+                                 # Latin words/digits
      | [^\s]                                          # Other single chars
    ''',
    re.VERBOSE,
)


def word_tokenize(sentence):
    """Tokenize *sentence* into URLs, e-mails, dates, numbers, words and symbols.

    The Devanagari word class leaves out the danda (U+0964), double danda
    (U+0965) and abbreviation sign (U+0970). They are matched by the final
    single-character alternative and come out as separate tokens, so a
    sentence-final word is not counted as a distinct type from the same word
    elsewhere.

    Args:
        sentence: Text to tokenize.

    Returns:
        The tokens in order of appearance; whitespace is never a token.
    """
    # The pattern has no capture groups, so each match is the token itself.
    return [match.group() for match in _TOKEN_PATTERN.finditer(sentence)]

class CorpusStatistics:
    """Accumulate tokenization results and corpus-level statistics.

    Counters are updated by process_document; the tokenized documents
    themselves are collected in ``tokenized_data`` by the caller.
    """

    def __init__(self):
        self.total_sentences = 0
        self.total_words = 0
        self.total_characters = 0
        self.sentence_lengths = []   # tokens per sentence
        self.word_lengths = []       # characters per token
        self.vocabulary = Counter()  # token -> frequency
        self.processed_documents = 0
        self.tokenized_data = []     # document records appended by the caller

    def process_document(self, text: str, doc_id: int) -> Optional[Dict[str, Any]]:
        """Tokenize one document and fold it into the running statistics.

        Args:
            text: Raw document text.
            doc_id: Identifier stored in the returned record.

        Returns:
            A record with the original text, the tokenized sentences and
            per-document counts, or None when the text is empty or yields
            no tokens.
        """
        if not text or not text.strip():
            return None

        text = text.strip()
        processed_sentences = []
        doc_word_count = 0

        for sentence in sentence_split(text):
            if not sentence.strip():
                continue

            words = word_tokenize(sentence)
            if not words:
                continue

            word_count = len(words)
            processed_sentences.append({
                'text': sentence,
                'tokens': words,
                'word_count': word_count
            })

            self.total_sentences += 1
            self.total_words += word_count
            self.total_characters += len(sentence)
            self.sentence_lengths.append(word_count)
            self.vocabulary.update(words)
            self.word_lengths.extend(len(word) for word in words)

            doc_word_count += word_count

        if not processed_sentences:
            return None

        self.processed_documents += 1
        return {
            'document_id': doc_id,
            'original_text': text,
            'sentences': processed_sentences,
            'document_stats': {
                'sentence_count': len(processed_sentences),
                'word_count': doc_word_count,
                'character_count': len(text)
            }
        }

    def compute_final_statistics(self) -> Dict[str, float]:
        """Compute the corpus statistics from the accumulated counters.

        Returns:
            A dictionary that always contains every statistic; for an empty
            corpus all values are zero, so callers can index the result
            without checking for missing keys.
        """
        avg_sentence_length = (
            sum(self.sentence_lengths) / len(self.sentence_lengths)
            if self.sentence_lengths else 0
        )
        avg_word_length = (
            sum(self.word_lengths) / len(self.word_lengths)
            if self.word_lengths else 0
        )

        unique_tokens = len(self.vocabulary)
        total_tokens = self.total_words
        ttr = unique_tokens / total_tokens if total_tokens > 0 else 0

        return {
            'total_sentences': self.total_sentences,
            'total_words': self.total_words,
            'total_characters': self.total_characters,
            'average_sentence_length': round(avg_sentence_length, 2),
            'average_word_length': round(avg_word_length, 2),
            'type_token_ratio': round(ttr, 4),
            'vocabulary_size': unique_tokens,
            'processed_documents': self.processed_documents
        }

    @staticmethod
    def _write_json(path, payload):
        """Write *payload* to *path* as indented UTF-8 JSON."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)

    @staticmethod
    def _write_pickle(path, payload):
        """Write *payload* to *path* with pickle."""
        # NOTE: pickle files must only ever be loaded from trusted sources.
        with open(path, 'wb') as f:
            pickle.dump(payload, f)

    def _write_report(self, path, stats):
        """Write the human-readable statistics report to *path*."""
        with open(path, 'w', encoding='utf-8') as f:
            f.write("Dataset: ai4bharat/IndicCorpV2 (Hindi)\n")
            f.write("File: hi-1.txt\n")
            f.write(f"Processing Scale: {stats['processed_documents']:,} documents\n\n")

            f.write(f"i.   Total number of sentences: {stats['total_sentences']:,}\n")
            f.write(f"ii.  Total number of words: {stats['total_words']:,}\n")
            f.write(f"iii. Total number of characters: {stats['total_characters']:,}\n")
            f.write(f"iv.  Average Sentence Length: {stats['average_sentence_length']} words per sentence\n")
            f.write(f"v.   Average word length: {stats['average_word_length']} characters per word\n")
            f.write(f"vi.  Type/Token Ratio (TTR): {stats['type_token_ratio']}\n\n")

            f.write("EXTENDED STATISTICS:\n")
            f.write("-" * 20 + "\n")
            f.write(f"Vocabulary size (unique tokens): {stats['vocabulary_size']:,}\n")
            f.write(f"Processed documents: {stats['processed_documents']:,}\n")
            f.write(f"Longest sentence: {max(self.sentence_lengths, default=0)} words\n")
            f.write(f"Shortest sentence: {min(self.sentence_lengths, default=0)} words\n")
            f.write(f"Longest word: {max(self.word_lengths, default=0)} characters\n\n")

            f.write("TOP 50 MOST FREQUENT TOKENS:\n")
            f.write("-" * 30 + "\n")
            for i, (word, freq) in enumerate(self.vocabulary.most_common(50), 1):
                f.write(f"{i:2d}. {word}: {freq:,}\n")

    def save_data_and_statistics(self, output_dir: str = "hindi_corpus_output"):
        """Write tokenized data, vocabulary and statistics below *output_dir*.

        With more than 10,000 collected documents the tokenized data is
        written as numbered JSON chunks plus one complete pickle; otherwise
        as one JSON file plus one pickle. The statistics JSON, the top-10,000
        vocabulary JSON, the complete vocabulary pickle and the text report
        are written in both cases.

        Args:
            output_dir: Target directory; created when missing.

        Returns:
            The statistics dictionary from compute_final_statistics.
        """
        os.makedirs(output_dir, exist_ok=True)

        chunk_size = 10000
        if len(self.tokenized_data) > chunk_size:
            print(f"Saving large dataset in chunks of {chunk_size} documents...")

            # Several bounded JSON files instead of one very large one.
            for i in range(0, len(self.tokenized_data), chunk_size):
                chunk = self.tokenized_data[i:i+chunk_size]
                chunk_file = os.path.join(output_dir, f"tokenized_data_chunk_{i//chunk_size + 1:03d}.json")
                self._write_json(chunk_file, chunk)

            # The complete data set additionally goes into a single pickle.
            self._write_pickle(
                os.path.join(output_dir, "tokenized_data_complete.pkl"),
                self.tokenized_data,
            )
        else:
            self._write_json(
                os.path.join(output_dir, "tokenized_data.json"),
                self.tokenized_data,
            )
            self._write_pickle(
                os.path.join(output_dir, "tokenized_data.pkl"),
                self.tokenized_data,
            )

        stats = self.compute_final_statistics()
        self._write_json(os.path.join(output_dir, "corpus_statistics.json"), stats)

        # Only the 10,000 most frequent tokens go to JSON to bound file size;
        # the complete vocabulary is kept in the pickle.
        self._write_json(
            os.path.join(output_dir, "vocabulary_top10000.json"),
            dict(self.vocabulary.most_common(10000)),
        )
        self._write_pickle(
            os.path.join(output_dir, "vocabulary_complete.pkl"),
            dict(self.vocabulary),
        )

        self._write_report(os.path.join(output_dir, "statistics_report.txt"), stats)

        return stats

def _print_sample(sample_number, text):
    """Print a short preview of how one document is split and tokenized."""
    print(f"\n--- Example {sample_number} ---")
    print("Original Paragraph:")
    print(text[:200] + "..." if len(text) > 200 else text)

    sentences = sentence_split(text)

    print(f"\nTokenized Sentences ({len(sentences)} total):")
    for j, sent in enumerate(sentences[:2]):  # Show first 2 sentences
        print(f"{j+1}. {sent}")
    if len(sentences) > 2:
        print(f"... and {len(sentences) - 2} more sentences")

    print("\nTokenized Words (first sentence):")
    if sentences:
        tokens = word_tokenize(sentences[0])
        print(tokens[:15])  # Show first 15 tokens
        if len(tokens) > 15:
            print(f"... and {len(tokens) - 15} more tokens")


def _print_final_report(stats):
    """Print the final statistics and the list of files that were written."""
    print("\n" + "=" * 60)
    print("FINAL CORPUS STATISTICS (Assignment Task 1d)")
    print("=" * 60)
    print(f"i.   Total number of sentences: {stats['total_sentences']:,}")
    print(f"ii.  Total number of words: {stats['total_words']:,}")
    print(f"iii. Total number of characters: {stats['total_characters']:,}")
    print(f"iv.  Average Sentence Length: {stats['average_sentence_length']} words per sentence")
    print(f"v.   Average word length: {stats['average_word_length']} characters per word")
    print(f"vi.  Type/Token Ratio (TTR): {stats['type_token_ratio']}")
    print("-" * 60)
    print(f"Vocabulary size: {stats['vocabulary_size']:,} unique tokens")
    print(f"Processed documents: {stats['processed_documents']:,}")
    print("=" * 60)

    print(f"📊 SCALE: Processed {stats['processed_documents']:,} documents")
    print("📁 Check 'hindi_corpus_output' directory for:")
    # Same threshold as the chunking decision in save_data_and_statistics.
    if stats['processed_documents'] > 10000:
        print("📁 tokenized_data_chunk_*.json - Chunked tokenized documents")
        print("📁 tokenized_data_complete.pkl - Complete binary format")
        print("📁 vocabulary_top10000.json - Top 10,000 frequent words")
        print("📁 vocabulary_complete.pkl - Complete vocabulary")
    else:
        print("📁 tokenized_data.json - All tokenized documents")
        print("📁 tokenized_data.pkl - Fast-loading binary format")
        print("📁 vocabulary_top10000.json - Top frequent words")
    print("📁 corpus_statistics.json - All computed statistics")
    print("📁 statistics_report.txt - Extended human-readable report")

    print("\n🎯 All requirements completed at scale:")
    print(f"✅ a. Downloaded and extracted {stats['processed_documents']:,} Hindi documents")
    print(f"✅ b. Tokenized {stats['total_sentences']:,} sentences into {stats['total_words']:,} words")
    print("✅ c. Saved all tokenized data with memory-efficient chunking")
    print("✅ d. Computed comprehensive corpus statistics")

    print("\n📈 CORPUS INSIGHTS:")
    print(f"🔤 Vocabulary richness: {stats['vocabulary_size']:,} unique tokens")
    print(f"📝 Text density: {stats['total_characters']:,} characters processed")
    print(f"🎭 Language diversity: TTR = {stats['type_token_ratio']} (lower = more repetitive)")


def main(max_documents=100000, progress_every=5000, checkpoint_every=25000):
    """Tokenize the streamed Hindi corpus, save the results and print a report.

    Args:
        max_documents: Number of stream records to read before stopping;
            None reads the whole stream.
        progress_every: Print a progress line whenever the stream index is a
            positive multiple of this value; 0 or None disables it.
        checkpoint_every: Save a checkpoint directory whenever the stream
            index is a positive multiple of this value; 0 or None disables it.
    """
    corpus_stats = CorpusStatistics()

    print("\nProcessing Hindi dataset...\n")

    print("SAMPLE PROCESSING (First 5 examples):")
    print("-" * 40)

    sample_count = 0
    for i, example in enumerate(hindi_dataset):
        # Checked first so that exactly max_documents records are read and
        # the `continue` below cannot bypass the limit.
        if max_documents is not None and i >= max_documents:
            print(f"\nProcessed {i:,} documents - stopping for memory management...")
            break

        # Skip if 'text' is missing or just whitespace
        if 'text' not in example or not example['text'].strip():
            continue
        text = example['text'].strip()

        # Show sample processing for the first 5 usable examples
        if sample_count < 5:
            sample_count += 1
            _print_sample(sample_count, text)

        # Process document for statistics; i doubles as the document id
        processed_doc = corpus_stats.process_document(text, i)
        if processed_doc:
            corpus_stats.tokenized_data.append(processed_doc)

        if progress_every and i > 0 and i % progress_every == 0:
            print(f"\nProcessed {i:,} documents...")
            print(f"Current stats: {corpus_stats.total_sentences:,} sentences, {corpus_stats.total_words:,} words")
            print(f"Current TTR: {len(corpus_stats.vocabulary) / corpus_stats.total_words:.4f}" if corpus_stats.total_words > 0 else "")

        # Each checkpoint writes everything collected so far to its own directory.
        if checkpoint_every and i > 0 and i % checkpoint_every == 0:
            print(f"Saving checkpoint at {i:,} documents...")
            temp_stats = corpus_stats.save_data_and_statistics(f"hindi_corpus_checkpoint_{i}")
            print(f"Checkpoint saved: {temp_stats['total_sentences']:,} sentences processed so far")

    print("\nProcessing complete!")
    print(f"Processed {corpus_stats.processed_documents:,} documents")

    print("\nSaving tokenized data and computing statistics...")
    stats = corpus_stats.save_data_and_statistics()

    _print_final_report(stats)


if __name__ == "__main__":
    main()


Processing Hindi dataset...

SAMPLE PROCESSING (First 5 examples):
----------------------------------------

--- Example 1 ---
Original Paragraph:
लोगों को बिलों संबंधी सुविधा देना ही उनका काम

Tokenized Sentences (1 total):
1. लोगों को बिलों संबंधी सुविधा देना ही उनका काम

Tokenized Words (first sentence):
['लोगों', 'को', 'बिलों', 'संबंधी', 'सुविधा', 'देना', 'ही', 'उनका', 'काम']

--- Example 2 ---
Original Paragraph:
इनेलो 1987 में उस वक्त ऐसे ही दोराहे पर खड़ी थी, जब पूर्व उपप्रधानमंत्री देवीलाल ने अपने पुत्र ओमप्रकाश चौटाला को अपना राजनीतिक उत्तराधिक