<a href="https://colab.research.google.com/github/Harshini-Manchala/NLP_Lab/blob/main/NLP_lab01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# from datasets import load_dataset

# dataset = load_dataset("ai4bharat/IndicCorpV2", name="indiccorp_v2", split="tel_Telu",streaming=True)


In [17]:
import re
import os
from datasets import load_dataset

# -----------------------------
#  Load Telugu Dataset
# -----------------------------
print("Loading Telugu dataset from Hugging Face...")
ds = load_dataset("ai4bharat/IndicCorpV2", name="indiccorp_v2", split="tel_Telu",streaming=True)

# -----------------------------
#  Sentence Tokenization
# -----------------------------
# Split after sentence-ending punctuation (Telugu & common)
SENT_SPLIT_RE = re.compile(r'(?<=[।॥.!?])\s*')

def sentence_tokenize(text):
    sentences = re.split(SENT_SPLIT_RE, text.strip())
    return [s for s in sentences if s.strip()]

# -----------------------------
#  Word Tokenization
# -----------------------------
TOKEN_RE = re.compile(r"""
    (https?://\S+)                           # URLs
  | ([\w\.-]+@[\w\.-]+\.\w+)                 # Emails
  | (\d{1,2}[/-]\d{1,2}[/-]\d{2,4})          # Dates
  | (\d+(?:\.\d+)?)                          # Numbers & decimals
  | ([\u0C00-\u0C7F]+)                       # Telugu words
  | ([A-Za-z]+)                              # English words
  | (\S)                                     # Punctuation / symbols
""", re.VERBOSE)

def word_tokenize(sentence):
    return [m.group(0) for m in TOKEN_RE.finditer(sentence)]

# -----------------------------
# 4️⃣ Process, Save, and Compute Statistics
# -----------------------------
def process_telugu(out_dir="telugu_tokenized", sample_size=None):
    os.makedirs(out_dir, exist_ok=True)
    sentences_path = os.path.join(out_dir, "sentences.txt")
    words_path = os.path.join(out_dir, "words.txt")

    total_sentences = 0
    total_words = 0
    total_chars = 0
    unique_tokens = set()

    # Limit for testing
    rows = ds.take(sample_size) if sample_size is not None else ds


    with open(sentences_path, "w", encoding="utf-8") as fs, \
     open(words_path, "w", encoding="utf-8") as fw:
        for row in rows:
            text = row["text"]

        # Sentence tokenization
            for sent in sentence_tokenize(text):
                fs.write(sent + "\n")
                total_sentences += 1

            # Word tokenization
                tokens = word_tokenize(sent)
                for token in tokens:
                    fw.write(token + "\n")  # <-- write each token on its own line
                total_words += len(tokens)
                total_chars += sum(len(t) for t in tokens)
                unique_tokens.update(tokens)


    # Stats
    avg_sent_len = total_words / total_sentences if total_sentences else 0
    avg_word_len = total_chars / total_words if total_words else 0
    ttr = len(unique_tokens) / total_words if total_words else 0

    print("\n--- Corpus Statistics (Telugu) ---")
    print(f"Total sentences: {total_sentences}")
    print(f"Total words/tokens: {total_words}")
    print(f"Total characters in tokens: {total_chars}")
    print(f"Average sentence length (words): {avg_sent_len:.2f}")
    print(f"Average word length (characters): {avg_word_len:.2f}")
    print(f"Type-Token Ratio: {ttr:.4f}")

# -----------------------------
# 5️⃣ Run (Test mode first)
# -----------------------------
if __name__ == "__main__":
    # For quick test, set sample_size=2000
    # For full dataset, set sample_size=None (⚠ huge file!)
    process_telugu(out_dir="telugu_tokenized", sample_size=100000)

Loading Telugu dataset from Hugging Face...

--- Corpus Statistics (Telugu) ---
Total sentences: 227481
Total words/tokens: 2256541
Total characters in tokens: 12112204
Average sentence length (words): 9.92
Average word length (characters): 5.37
Type-Token Ratio: 0.0903
