In [1]:
import gc
import json
import os
import pickle
import re
import time
import unicodedata
from collections import Counter
from typing import Any, Dict, List, Optional

import psutil
from datasets import load_dataset

# -------------------- Memory Management Helper --------------------
def get_memory_usage():
    """Return the resident set size (RSS) of the current process in MB.

    This is a best-effort helper used for progress output and checkpoint
    decisions: if the measurement fails for any reason, 0 is returned
    instead of raising so that the caller's processing loop keeps running.

    Returns:
        The RSS in megabytes as a float, or 0 when it cannot be determined.
    """
    try:
        process = psutil.Process(os.getpid())
        return process.memory_info().rss / 1024 / 1024
    except Exception:
        # Deliberately broad (best effort), but not a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate to the caller.
        return 0

def clear_memory():
    """Trigger an explicit garbage-collection pass.

    The number of objects reported by the collector is discarded; the
    function always returns None.
    """
    _ = gc.collect()
    return None

# -------------------- Sentence Tokenizer --------------------
def sentence_split(paragraph):
    """Split a paragraph into sentences on Hindi and English end punctuation.

    A sentence ends at every '.', '?', '!' or Devanagari danda (U+0964).
    The terminator stays attached to its sentence and surrounding whitespace
    is stripped. Every terminator closes a segment, so "a..." yields
    ["a.", ".", "."] and a '.' inside a decimal number also splits.

    Args:
        paragraph: Text to split.

    Returns:
        List of non-empty sentence strings in their original order.
    """
    # The danda is written as an escape so the literal cannot be corrupted
    # by a wrong file encoding (a mis-decoded multi-character literal can
    # never match a single character).
    terminators = frozenset(('.', '?', '!', '\u0964'))

    sentences = []
    start = 0
    for index, char in enumerate(paragraph):
        if char in terminators:
            # Slice instead of growing a string one character at a time.
            piece = paragraph[start:index + 1].strip()
            if piece:
                sentences.append(piece)
            start = index + 1

    # Trailing text without a terminator still counts as a sentence.
    tail = paragraph[start:].strip()
    if tail:
        sentences.append(tail)

    return sentences

# -------------------- Word Tokenizer using regex --------------------
# Compiled once at import time. Alternatives are tried left to right, so the
# more specific token shapes must come before the generic word classes.
_TOKEN_PATTERN = re.compile(
    r'''
        https?://\S+                      # URLs with an explicit scheme
      | www\.\S+                          # scheme-less "www." URLs
      | \w+@\w+\.\w+                      # simple e-mail addresses
      | \d{1,2}/\d{1,2}/\d{2,4}           # dates such as 12/05/2020
      | \d+\.\d+                          # decimal numbers
      | [\u0900-\u0963\u0966-\u097F]+     # Devanagari words (dandas excluded)
      | [a-zA-Z0-9_-]+                    # Latin words / digits
      | \S                                # any other single non-space char
    ''',
    re.VERBOSE,
)


def word_tokenize(sentence):
    """Tokenize a sentence into URLs, e-mails, dates, numbers, words and symbols.

    The Devanagari word class skips U+0964 (danda) and U+0965 (double danda)
    so that sentence-final punctuation becomes its own token instead of being
    glued to the last word, which would otherwise create spurious vocabulary
    entries.

    Args:
        sentence: Text to tokenize.

    Returns:
        List of token strings in order of appearance; whitespace is dropped.
    """
    # The pattern has no capture groups, so findall yields whole matches.
    return _TOKEN_PATTERN.findall(sentence)

# -------------------- Corpus Statistics Class --------------------
class CorpusStatistics:
    """Accumulate tokenized documents and running corpus statistics.

    Counters (sentences, words, characters, vocabulary, length lists) grow
    for the lifetime of the instance. ``tokenized_data`` holds the documents
    appended by the caller and is emptied whenever a memory checkpoint is
    written, so the final JSON only contains documents collected since the
    last checkpoint.
    """

    # Documents longer than this many characters are skipped entirely.
    MAX_DOCUMENT_CHARS = 50000

    def __init__(self, memory_threshold: float = 1000):
        """Create an empty accumulator.

        Args:
            memory_threshold: Process memory in MB above which
                ``check_memory_and_save`` writes a checkpoint.
        """
        self.total_sentences = 0
        self.total_words = 0
        self.total_characters = 0
        self.sentence_lengths = []   # tokens per sentence
        self.word_lengths = []       # characters per token
        self.vocabulary = Counter()  # token -> frequency
        self.processed_documents = 0
        self.tokenized_data = []
        self.memory_threshold = memory_threshold  # MB - adjust based on Colab limits

    def check_memory_and_save(self, output_dir: str = "hindi_corpus_output"):
        """Write a checkpoint and drop buffered documents if memory is high.

        Args:
            output_dir: Directory that receives the checkpoint file.
        """
        current_memory = get_memory_usage()

        if current_memory > self.memory_threshold:
            print(f"\nMemory usage: {current_memory:.1f}MB - Saving checkpoint...")

            # Persist the buffered documents before discarding them.
            self.save_checkpoint(output_dir)

            # Clear tokenized data to free memory but keep statistics
            self.tokenized_data = []
            clear_memory()

            print(f"Memory cleared. Current usage: {get_memory_usage():.1f}MB")

    def process_document(self, text: str, doc_id: int) -> Optional[Dict[str, Any]]:
        """Tokenize one document and fold it into the running statistics.

        Args:
            text: Raw document text.
            doc_id: Identifier stored in the returned record.

        Returns:
            A dict with the document id, a (possibly truncated) copy of the
            text, its tokenized sentences and per-document counts; or None
            when the text is empty, longer than ``MAX_DOCUMENT_CHARS``, or
            yields no tokens.
        """
        if not text or not text.strip():
            return None

        text = text.strip()

        # Skip extremely long documents to prevent memory issues
        if len(text) > self.MAX_DOCUMENT_CHARS:
            return None

        processed_sentences = []
        doc_word_count = 0

        for sentence in sentence_split(text):
            if not sentence.strip():
                continue

            words = word_tokenize(sentence)
            if not words:
                continue  # Only sentences with at least one token are counted

            word_count = len(words)
            processed_sentences.append({
                'text': sentence,
                'tokens': words,
                'word_count': word_count
            })

            # Update corpus-wide statistics
            self.total_sentences += 1
            self.total_words += word_count
            self.total_characters += len(sentence)
            self.sentence_lengths.append(word_count)
            self.vocabulary.update(words)
            self.word_lengths.extend(len(word) for word in words)

            doc_word_count += word_count

        if not processed_sentences:
            return None

        self.processed_documents += 1

        return {
            'document_id': doc_id,
            # Only the first 1000 characters are kept to bound memory use.
            'original_text': text[:1000] + "..." if len(text) > 1000 else text,
            'sentences': processed_sentences,
            'document_stats': {
                'sentence_count': len(processed_sentences),
                'word_count': doc_word_count,
                'character_count': len(text)
            }
        }

    def compute_final_statistics(self) -> Dict[str, float]:
        """Compute the summary statistics from the accumulated counters.

        Returns:
            A dict of totals, averages (rounded to 2 decimals), the
            type/token ratio (rounded to 4 decimals), vocabulary size and
            document count; an empty dict when no sentence was processed.
        """
        if self.total_sentences == 0:
            return {}

        avg_sentence_length = (
            sum(self.sentence_lengths) / len(self.sentence_lengths)
            if self.sentence_lengths else 0
        )
        avg_word_length = (
            sum(self.word_lengths) / len(self.word_lengths)
            if self.word_lengths else 0
        )

        # Type-Token Ratio: distinct tokens divided by all tokens.
        unique_tokens = len(self.vocabulary)
        total_tokens = self.total_words
        ttr = unique_tokens / total_tokens if total_tokens > 0 else 0

        return {
            'total_sentences': self.total_sentences,
            'total_words': self.total_words,
            'total_characters': self.total_characters,
            'average_sentence_length': round(avg_sentence_length, 2),
            'average_word_length': round(avg_word_length, 2),
            'type_token_ratio': round(ttr, 4),
            'vocabulary_size': unique_tokens,
            'processed_documents': self.processed_documents
        }

    def _zero_statistics(self) -> Dict[str, float]:
        """Return a statistics dict with every value set to zero.

        Used when nothing was processed so that reports can still be written
        with the full set of keys.
        """
        return {
            'total_sentences': 0,
            'total_words': 0,
            'total_characters': 0,
            'average_sentence_length': 0.0,
            'average_word_length': 0.0,
            'type_token_ratio': 0.0,
            'vocabulary_size': len(self.vocabulary),
            'processed_documents': self.processed_documents
        }

    def save_checkpoint(self, output_dir: str):
        """Pickle the buffered documents to ``checkpoint_<count>.pkl``.

        Args:
            output_dir: Directory for the checkpoint; created if missing.
        """
        os.makedirs(output_dir, exist_ok=True)

        checkpoint_file = os.path.join(output_dir, f"checkpoint_{self.processed_documents}.pkl")

        checkpoint_data = {
            'tokenized_data': self.tokenized_data,
            'processed_documents': self.processed_documents,
            'timestamp': time.time()
        }

        # NOTE: pickle files must only be loaded from trusted sources.
        with open(checkpoint_file, 'wb') as f:
            pickle.dump(checkpoint_data, f)

    def save_data_and_statistics(self, output_dir: str = "hindi_corpus_output"):
        """Write tokenized data, statistics, top vocabulary and a text report.

        Args:
            output_dir: Directory for all output files; created if missing.

        Returns:
            The statistics dict that was written. When nothing was processed
            this is a zero-filled dict, so callers can always index the keys.
        """
        os.makedirs(output_dir, exist_ok=True)

        # Save the documents still buffered in memory, if any.
        if self.tokenized_data:
            json_file = os.path.join(output_dir, "tokenized_data_final.json")
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(self.tokenized_data, f, ensure_ascii=False, indent=2)

        # An empty corpus yields {}; fall back to zeros so the report below
        # does not fail with KeyError.
        stats = self.compute_final_statistics() or self._zero_statistics()

        stats_file = os.path.join(output_dir, "corpus_statistics.json")
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(stats, f, indent=2, ensure_ascii=False)

        # Save only the 1000 most frequent tokens to keep the file small.
        vocab_file = os.path.join(output_dir, "vocabulary_top1000.json")
        vocab_dict = dict(self.vocabulary.most_common(1000))
        with open(vocab_file, 'w', encoding='utf-8') as f:
            json.dump(vocab_dict, f, indent=2, ensure_ascii=False)

        self._write_report(os.path.join(output_dir, "statistics_report.txt"), stats)

        return stats

    def _write_report(self, report_file: str, stats: Dict[str, float]):
        """Write the human-readable statistics report to ``report_file``."""
        longest_sentence = max(self.sentence_lengths) if self.sentence_lengths else 0
        shortest_sentence = min(self.sentence_lengths) if self.sentence_lengths else 0
        longest_word = max(self.word_lengths) if self.word_lengths else 0

        with open(report_file, 'w', encoding='utf-8') as f:
            f.write("NLP ASSIGNMENT 1 - HINDI CORPUS STATISTICS REPORT\n")
            f.write("=" * 55 + "\n\n")
            f.write("Dataset: ai4bharat/IndicCorpV2 (Hindi)\n")
            f.write("File: hi-1.txt (streaming)\n")
            f.write(f"Processing Scale: {stats['processed_documents']:,} documents\n")
            f.write("Memory Management: Colab-optimized\n\n")

            f.write("ASSIGNMENT REQUIREMENTS (Task 1d):\n")
            f.write("-" * 35 + "\n")
            f.write(f"i.   Total number of sentences: {stats['total_sentences']:,}\n")
            f.write(f"ii.  Total number of words: {stats['total_words']:,}\n")
            f.write(f"iii. Total number of characters: {stats['total_characters']:,}\n")
            f.write(f"iv.  Average Sentence Length: {stats['average_sentence_length']} words per sentence\n")
            f.write(f"v.   Average word length: {stats['average_word_length']} characters per word\n")
            f.write(f"vi.  Type/Token Ratio (TTR): {stats['type_token_ratio']}\n\n")

            f.write("EXTENDED STATISTICS:\n")
            f.write("-" * 20 + "\n")
            f.write(f"Vocabulary size (unique tokens): {stats['vocabulary_size']:,}\n")
            f.write(f"Processed documents: {stats['processed_documents']:,}\n")
            f.write(f"Longest sentence: {longest_sentence} words\n")
            f.write(f"Shortest sentence: {shortest_sentence} words\n")
            f.write(f"Longest word: {longest_word} characters\n\n")

            f.write("TOP 20 MOST FREQUENT TOKENS:\n")
            f.write("-" * 30 + "\n")
            for i, (word, freq) in enumerate(self.vocabulary.most_common(20), 1):
                f.write(f"{i:2d}. {word}: {freq:,}\n")

# -------------------- Main Processing Function --------------------
def _print_sample(text, sample_number):
    """Print a preview of one paragraph: its sentences and first tokens."""
    print(f"\n--- Example {sample_number} ---")
    print("Original Paragraph:")
    print(text[:150] + "..." if len(text) > 150 else text)

    sentences = sentence_split(text)
    print(f"\nSentences found: {len(sentences)}")

    if sentences:
        print("\nFirst sentence tokens:")
        tokens = word_tokenize(sentences[0])
        print(tokens[:10])  # Show first 10 tokens
        if len(tokens) > 10:
            print(f"... and {len(tokens) - 10} more tokens")


def _print_final_statistics(stats):
    """Print the final statistics summary and the list of output files."""
    print("\n" + "=" * 60)
    print("FINAL CORPUS STATISTICS (Assignment Task 1d)")
    print("=" * 60)
    print(f"i.   Total number of sentences: {stats['total_sentences']:,}")
    print(f"ii.  Total number of words: {stats['total_words']:,}")
    print(f"iii. Total number of characters: {stats['total_characters']:,}")
    print(f"iv.  Average Sentence Length: {stats['average_sentence_length']} words per sentence")
    print(f"v.   Average word length: {stats['average_word_length']} characters per word")
    print(f"vi.  Type/Token Ratio (TTR): {stats['type_token_ratio']}")
    print("-" * 60)
    print(f"Vocabulary size: {stats['vocabulary_size']:,} unique tokens")
    print(f"Processed documents: {stats['processed_documents']:,}")
    print("=" * 60)

    print("\n🎯 ASSIGNMENT TASK 1 COMPLETED!")
    print("📁 Check 'hindi_corpus_output' directory for:")
    print("   📄 tokenized_data_final.json - Tokenized documents")
    print("   📊 corpus_statistics.json - All computed statistics")
    print("   📝 statistics_report.txt - Detailed human-readable report")
    print("   🔤 vocabulary_top1000.json - Most frequent words")

    print("\n✅ All requirements completed:")
    print(f"   a. ✅ Downloaded and processed {stats['processed_documents']:,} Hindi documents")
    print(f"   b. ✅ Tokenized {stats['total_sentences']:,} sentences into {stats['total_words']:,} words")
    print("   c. ✅ Saved all tokenized data")
    print("   d. ✅ Computed comprehensive corpus statistics")

    print("\n📈 CORPUS INSIGHTS:")
    print(f"   🔤 Vocabulary richness: {stats['vocabulary_size']:,} unique tokens")
    print(f"   📝 Text density: {stats['total_characters']:,} characters processed")
    print(f"   🔄 Language diversity: TTR = {stats['type_token_ratio']}")


def main(max_documents: int = 10000):
    """Stream the Hindi corpus, tokenize it and write statistics to disk.

    Steps: open the ``hi-1.txt`` stream, preview the first three non-empty
    records, feed every record to ``CorpusStatistics``, report progress and
    check memory every 1000 processed documents, stop at ``max_documents``,
    then save the results and print the final statistics.

    Args:
        max_documents: Maximum number of documents to process (kept low by
            default for Colab).
    """
    print("NLP ASSIGNMENT 1: Text Preprocessing with Hindi IndicCorpV2")
    print("=" * 60)
    print("Dataset: ai4bharat/IndicCorpV2 (Hindi - hi-1.txt)")
    print("Environment: Google Colab Optimized")
    print("=" * 60)

    try:
        # Stream Hindi dataset from Hugging Face without full download
        print("Loading dataset stream...")
        hindi_dataset = load_dataset(
            "text",
            data_files="https://huggingface.co/datasets/ai4bharat/IndicCorpV2/resolve/main/data/hi-1.txt",
            split="train",
            streaming=True
        )
        print("✅ Dataset stream loaded successfully!")

    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        # No fallback source is implemented, so say so instead of claiming
        # that an alternative is being tried.
        print("No alternative data source is configured; aborting.")
        return

    corpus_stats = CorpusStatistics()

    print(f"\nInitial memory usage: {get_memory_usage():.1f}MB")
    print("\nProcessing Hindi dataset...\n")

    print("SAMPLE PROCESSING (First 3 examples):")
    print("-" * 40)

    sample_count = 0
    processed_count = 0
    progress_interval = 1000

    try:
        for example in hindi_dataset:
            # Skip records whose 'text' is missing, None or only whitespace
            text = (example.get('text') or '').strip()
            if not text:
                continue

            # Show sample processing for first 3 examples
            if sample_count < 3:
                _print_sample(text, sample_count + 1)
                sample_count += 1

            processed_doc = corpus_stats.process_document(text, processed_count)
            if not processed_doc:
                continue

            corpus_stats.tokenized_data.append(processed_doc)
            processed_count += 1

            # Report only right after the counter advanced; otherwise skipped
            # records would repeat the report while the count sits on a
            # multiple of the interval.
            if processed_count % progress_interval == 0:
                print(f"\n📊 Progress: {processed_count:,} documents processed")
                print(f"   Sentences: {corpus_stats.total_sentences:,}")
                print(f"   Words: {corpus_stats.total_words:,}")
                print(f"   Memory: {get_memory_usage():.1f}MB")

                # Check memory and save checkpoint if needed
                corpus_stats.check_memory_and_save()

            if processed_count >= max_documents:
                print(f"\n🛑 Reached processing limit of {max_documents:,} documents")
                break

    except Exception as e:
        # Best effort: keep whatever was processed before the stream failed.
        print(f"⚠️ Processing interrupted: {e}")
        print("Continuing with data processed so far...")

    print("\n✅ Processing complete!")
    print(f"📊 Processed {processed_count:,} documents")
    print(f"💾 Current memory usage: {get_memory_usage():.1f}MB")

    print("\n💾 Saving tokenized data and computing statistics...")
    try:
        stats = corpus_stats.save_data_and_statistics()
        _print_final_statistics(stats)
    except Exception as e:
        print(f"❌ Error saving results: {e}")

# -------------------- Installation and Setup --------------------
def install_requirements(packages=("datasets", "psutil")):
    """Install any of ``packages`` that cannot be imported.

    Each name is used both as the import name and as the pip package name.

    Args:
        packages: Iterable of package names to check; defaults to the
            packages this script needs on Colab.

    Raises:
        subprocess.CalledProcessError: If a pip installation fails.
    """
    import importlib
    import subprocess
    import sys

    for package in packages:
        try:
            importlib.import_module(package)
        except ImportError:
            print(f"📦 Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        else:
            print(f"✅ {package} already installed")

# -------------------- Run the script --------------------
if __name__ == "__main__":
    # Runs only when this file is executed directly, not when it is imported.
    # Uncomment the line below if you need to install packages
    # install_requirements()

    # Run the full pipeline defined in main().
    main()

NLP ASSIGNMENT 1: Text Preprocessing with Hindi IndicCorpV2
Dataset: ai4bharat/IndicCorpV2 (Hindi - hi-1.txt)
Environment: Google Colab Optimized
Loading dataset stream...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ Dataset stream loaded successfully!

Initial memory usage: 555.2MB

Processing Hindi dataset...

SAMPLE PROCESSING (First 3 examples):
----------------------------------------

--- Example 1 ---
Original Paragraph:
लोगों को बिलों संबंधी सुविधा देना ही उनका काम

Sentences found: 1

First sentence tokens:
['लोगों', 'को', 'बिलों', 'संबंधी', 'सुविधा', 'देना', 'ही', 'उनका', 'काम']

--- Example 2 ---
Original Paragraph:
इनेलो 1987 में उस वक्त ऐसे ही दोराहे पर खड़ी थी, जब पूर्व उपप्रधानमंत्री देवीलाल ने अपने पुत्र ओमप्रकाश चौटाला को अपना राजनीतिक उत्तराधिकारी घोषित किय...

Sentences found: 7

First sentence 