In [1]:
# Cell 1: imports & paths

from pathlib import Path
import collections
import json

# Locate the project root: when the kernel was started inside a folder named
# "notebooks", step up one level (after resolving symlinks); otherwise the
# current working directory is taken as the root unchanged.
_cwd = Path.cwd()
if _cwd.name == "notebooks":
    PROJECT_ROOT = _cwd.resolve().parent
else:
    PROJECT_ROOT = _cwd

# All data locations are derived from the project root.
DATA_ROOT = PROJECT_ROOT / "data"
CORPUS_ROOT = DATA_ROOT / "corpora"
SA_ITIHASA_DIR = CORPUS_ROOT / "sa_en_itihasa"
HI_GENERAL_DIR = CORPUS_ROOT / "hi_en_general"

# Sanity check: show the resolved root and whether each corpus folder exists.
print("PROJECT_ROOT:", PROJECT_ROOT)
print("SA_ITIHASA_DIR exists:", SA_ITIHASA_DIR.exists())
print("HI_GENERAL_DIR exists:", HI_GENERAL_DIR.exists())


PROJECT_ROOT: /Users/jyotirmoy/Desktop/Image/ancient-script-ai
SA_ITIHASA_DIR exists: True
HI_GENERAL_DIR exists: True


In [2]:
# Cell 2: load Sanskrit corpus into memory as one big string

def load_sanskrit_corpus(sa_dir: Path) -> str:
    """Load every ``*.sn`` file in ``sa_dir`` and return them as one string.

    Files are read in sorted filename order, decoded as UTF-8 and joined with
    a single newline. The name of each file read and the total character
    count are printed as progress output.

    Args:
        sa_dir: Directory expected to hold the Sanskrit ``.sn`` files
            (e.g. ``train.sn``, ``dev.sn``, ``test.sn``).

    Returns:
        The concatenated text of all ``.sn`` files.

    Raises:
        FileNotFoundError: If ``sa_dir`` contains no ``.sn`` files. The
            message names the directory that was actually searched.
    """
    # Only regular files: a directory that happens to be named "*.sn" would
    # otherwise make read_text() fail with an unrelated error.
    sn_files = sorted(p for p in sa_dir.glob("*.sn") if p.is_file())

    if not sn_files:
        # A CSV fallback (sa_en_itihasa/*.csv) could be added here if the
        # corpus is ever converted; until then an empty folder is an error.
        raise FileNotFoundError(f"No .sn files found in {sa_dir}. Check folder.")

    texts = []
    for f in sn_files:
        print("Reading", f.name)
        texts.append(f.read_text(encoding="utf-8"))

    full_text = "\n".join(texts)
    print("Total Sanskrit characters:", len(full_text))
    return full_text

# Read the whole corpus once; the n-gram and word-frequency cells reuse `sa_text`.
sa_text = load_sanskrit_corpus(SA_ITIHASA_DIR)
# Preview the first 500 characters as the cell's displayed output.
sa_text[:500]


Reading dev.sn
Reading test.sn
Reading train.sn
Total Sanskrit characters: 9191388


'तस्यां चीरं वसानायां नाथवत्यामनाथवत्। प्रचुक्रोश जनः सर्वो धिक् त्वां दशरथं त्विति ॥\nतेन तत्र प्रणादेन दुःखितः स महीपतिः। चिच्छेद जीविते श्रद्धां धर्मे यशसि चात्मनः॥ स निःश्वस्योष्णमैक्ष्वाकस्तां भार्यामिदमब्रवीत्। कैकेयि कुशचीरेण न सीता गन्तुमर्हति॥\nसुकुमारी च बाला च सततं च सुखोचिता। नेयं वनस्य योग्येति सत्यमाह गुरुर्मम ॥\nइयं हि कस्यापि करोति किंचित् तपस्विनी राजवरस्य पुत्री। या चीरमासाद्य वनस्य मध्ये जाता विसंज्ञा श्रमणीव काचित्॥\nचीराण्यपास्याज्जनकस्य कन्या नेयं प्रतिज्ञा मम दत्तपूर्वा। यथासुख'

In [3]:
# Cell 3: build character n-gram counts (for reconstruction)

def build_char_ngrams(text: str, n: int = 3) -> collections.Counter:
    """Count overlapping character n-grams in ``text``.

    All whitespace is removed first, so n-grams run across word and line
    boundaries. Grams are slices of the Python string, i.e. ``n`` code
    points each; combining marks count as separate units.

    Args:
        text: Input text.
        n: Length of each gram in code points; must be at least 1.

    Returns:
        A Counter mapping each n-gram to its number of occurrences. It is
        empty when the whitespace-stripped text is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is less than 1, which would otherwise silently
            count empty or malformed slices.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    cleaned = "".join(text.split())  # drop every whitespace run -> pure char stream

    # range() is empty when len(cleaned) < n, so short inputs yield no grams.
    counts = collections.Counter(
        cleaned[i:i + n] for i in range(len(cleaned) - n + 1)
    )

    print(f"Built {len(counts)} unique {n}-grams")
    return counts

# Count character trigrams (n=3) over the full corpus.
sa_char_trigrams = build_char_ngrams(sa_text, n=3)
# Show the first 20 (gram, count) pairs in first-seen order, not sorted by frequency.
list(sa_char_trigrams.items())[:20]


Built 48047 unique 3-grams


[('तस्', 16338),
 ('स्य', 32726),
 ('्या', 38474),
 ('यां', 5453),
 ('ांच', 3576),
 ('ंची', 24),
 ('चीर', 191),
 ('ीरं', 455),
 ('रंव', 1168),
 ('ंवस', 362),
 ('वसा', 666),
 ('सान', 1690),
 ('ाना', 13841),
 ('नाय', 1018),
 ('ाया', 4012),
 ('ांन', 1967),
 ('ंना', 1771),
 ('नाथ', 485),
 ('ाथव', 168),
 ('थवत', 105)]

In [4]:
# Cell 4: Sanskrit word frequencies

import re

def tokenize_words(text: str):
    """Split ``text`` into word tokens.

    Runs of whitespace and each listed punctuation mark (including the danda
    and double danda) act as separators. Empty pieces produced by adjacent
    separators are discarded; the remaining pieces are returned in order.
    """
    separators = r"\s+|[।॥,.?!;:\"'()\[\]{}]"

    words = []
    for piece in re.split(separators, text):
        # Back-to-back separators yield empty strings; keep only real tokens.
        if piece.strip():
            words.append(piece)
    return words

# Tokenize the full corpus and report the raw token count.
sa_words = tokenize_words(sa_text)
print("Total tokens:", len(sa_words))

# Frequency table keyed by the token exactly as it appears (no normalization).
sa_word_freq = collections.Counter(sa_words)
print("Unique words:", len(sa_word_freq))
# Peek at the first 20 entries in first-seen order, not sorted by frequency.
list(sa_word_freq.items())[:20]


Total tokens: 1046260
Unique words: 296342


[('तस्यां', 164),
 ('चीरं', 5),
 ('वसानायां', 1),
 ('नाथवत्यामनाथवत्', 1),
 ('प्रचुक्रोश', 3),
 ('जनः', 123),
 ('सर्वो', 56),
 ('धिक्', 38),
 ('त्वां', 1080),
 ('दशरथं', 20),
 ('त्विति', 7),
 ('तेन', 1286),
 ('तत्र', 2715),
 ('प्रणादेन', 1),
 ('दुःखितः', 38),
 ('स', 10613),
 ('महीपतिः', 159),
 ('चिच्छेद', 259),
 ('जीविते', 46),
 ('श्रद्धां', 9)]

In [5]:
# Cell 5: suggestion function for reconstruction

def suggest_similar(prefix: str, word_freq: collections.Counter, top_k: int = 10):
    """Suggest the most frequent words beginning with ``prefix``.

    Scans ``word_freq`` for every word that starts with ``prefix`` and returns
    up to ``top_k`` ``(word, count)`` pairs ordered by descending count. Words
    with equal counts keep the order in which they appear in ``word_freq``.
    """
    matches = []
    for word, count in word_freq.items():
        if word.startswith(prefix):
            matches.append((word, count))

    # sorted() is stable even with reverse=True, so ties stay in table order.
    ranked = sorted(matches, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

# Example: the five most frequent corpus words that begin with "कर्म".
suggest_similar("कर्म", sa_word_freq, top_k=5)


[('कर्म', 990),
 ('कर्मणा', 405),
 ('कर्माणि', 194),
 ('कर्मसु', 71),
 ('कर्मणः', 63)]

In [6]:
# Cell 6: save to models/ for reuse

# Folder for the serialized statistics; created if it is not there yet.
MODELS_ROOT = PROJECT_ROOT / "models"
MODELS_ROOT.mkdir(exist_ok=True)

_trigram_path = MODELS_ROOT / "sa_char_trigrams.json"
_word_freq_path = MODELS_ROOT / "sa_word_freq.json"

# Write each Counter as a JSON object (key -> count). ensure_ascii=False keeps
# the Devanagari keys as literal characters instead of \u escapes.
for _path, _counts in (
    (_trigram_path, sa_char_trigrams),   # character n-grams
    (_word_freq_path, sa_word_freq),     # word frequencies
):
    with _path.open("w", encoding="utf-8") as fh:
        json.dump(_counts, fh, ensure_ascii=False)

# Report the written files only after both dumps have succeeded.
for _path in (_trigram_path, _word_freq_path):
    print("Saved:", _path)


Saved: /Users/jyotirmoy/Desktop/Image/ancient-script-ai/models/sa_char_trigrams.json
Saved: /Users/jyotirmoy/Desktop/Image/ancient-script-ai/models/sa_word_freq.json
