In [1]:
import re
import math
from collections import defaultdict, Counter
from pathlib import Path

# Locate the project root: if the kernel's working directory is a folder named
# "notebooks", the root is that folder's (resolved) parent; otherwise the
# working directory itself is used as the root.
_cwd = Path.cwd()
if _cwd.name == "notebooks":
    PROJECT_ROOT = _cwd.resolve().parent
else:
    PROJECT_ROOT = _cwd

# All data files are expected under <project root>/data.
DATA_ROOT = PROJECT_ROOT.joinpath("data")

print("Data root:", DATA_ROOT)


Data root: /Users/jyotirmoy/Desktop/Image/ancient-script-ai/data


In [2]:
# Example corpus: a handful of Sanskrit / Hindi sentences defined inline.
sample_corpus = [
    "कर्मणा जायते पुरुषः",
    "ज्ञानं योगेन साध्यते",
    "मेरा घर बड़ा है",
    "वह बाज़ार गया था",
]

# Trim each sentence and collapse every run of whitespace into a single space.
_collapse_whitespace = re.compile(r'\s+').sub
corpus = [_collapse_whitespace(' ', raw.strip()) for raw in sample_corpus]
print("Loaded", len(corpus), "sentences")


Loaded 4 sentences


In [3]:
def tokenize_sentence(sentence):
    """Return the character-level tokens of ``sentence`` with spaces removed.

    Each Unicode code point other than the plain space character " " becomes
    one token, so combining signs (e.g. a Devanagari virama or vowel sign) are
    separate tokens from the consonant they attach to. Only " " is dropped;
    any other whitespace character is kept as a token.

    Args:
        sentence: The text to split.

    Returns:
        A list of single-character strings, in their original order.
    """
    return [ch for ch in sentence if ch != " "]

# Build one flat stream of character tokens covering every sentence in order.
tokens = []
for sent in corpus:
    tokens += tokenize_sentence(sent)

# The vocabulary is the set of distinct tokens, sorted by code point.
vocab = list(set(tokens))
vocab.sort()
print("Vocabulary size:", len(vocab))
print("Sample tokens:", vocab[:20])


Vocabulary size: 29
Sample tokens: ['ं', 'ः', 'क', 'ग', 'घ', 'ज', 'ञ', 'ड', 'ण', 'त', 'थ', 'ध', 'न', 'प', 'ब', 'म', 'य', 'र', 'व', 'ष']


In [4]:
# Unigram counts: how often each character token occurs in the whole corpus.
unigram_counts = Counter(tokens)

# Bigram counts: adjacent character pairs, counted sentence by sentence.
# Pairing over the flat `tokens` stream would also count the last character of
# one sentence followed by the first character of the next, which is not a
# real adjacency in the text. Word boundaries inside a sentence are still
# bridged, because tokenize_sentence() drops spaces.
bigram_counts = Counter()
for sent in corpus:
    sent_tokens = tokenize_sentence(sent)
    bigram_counts.update(zip(sent_tokens, sent_tokens[1:]))

# Show the first ten bigrams in insertion (first-seen) order.
print("Sample bigrams:")
for (a, b), count in list(bigram_counts.items())[:10]:
    print(f"{a}{b} : {count}")


Sample bigrams:
कर : 1
र् : 1
्म : 1
मण : 1
णा : 1
ाज : 2
जा : 1
ाय : 1
यत : 2
ते : 2


In [5]:
def bigram_prob(c1, c2, unigram_counts, bigram_counts, vocab_size, alpha=1):
    """Additively smoothed conditional probability of ``c2`` following ``c1``.

    Computes::

        P(c2 | c1) = (count(c1, c2) + alpha) / (count(c1) + alpha * vocab_size)

    Args:
        c1: The preceding token.
        c2: The token whose probability is estimated.
        unigram_counts: Mapping indexed as ``unigram_counts[c1]``.
        bigram_counts: Mapping indexed as ``bigram_counts[(c1, c2)]``.
        vocab_size: Number of distinct tokens, used in the smoothing term.
        alpha: Smoothing constant added to every bigram count (default 1).

    Returns:
        The smoothed probability as a float.
    """
    smoothed_pair_count = bigram_counts[(c1, c2)] + alpha
    smoothed_context_count = unigram_counts[c1] + alpha * vocab_size
    return smoothed_pair_count / smoothed_context_count


In [6]:
def predict_missing(sequence, unigram_counts, bigram_counts, vocab):
    """Choose the vocabulary item that best fills the "_" placeholder.

    Each item of ``vocab`` is substituted for every "_" in ``sequence`` (all
    placeholders receive the same candidate), and the filled sequence is
    scored by summing the log of ``bigram_prob`` over its adjacent character
    pairs. The candidate with the highest score is returned.

    Args:
        sequence: Text containing "_" where a character is missing.
        unigram_counts: Unigram counts passed through to ``bigram_prob``.
        bigram_counts: Bigram counts passed through to ``bigram_prob``.
        vocab: Iterable of candidate tokens; its length is used as the
            vocabulary size for smoothing.

    Returns:
        The best-scoring candidate. Ties are won by the candidate that comes
        first in ``vocab``; ``None`` is returned when ``vocab`` is empty.
    """
    n_symbols = len(vocab)

    def _log_likelihood(filled):
        # Plain left-to-right running sum over adjacent pairs.
        score = 0.0
        for left, right in zip(filled, filled[1:]):
            pair_prob = bigram_prob(left, right, unigram_counts, bigram_counts, n_symbols)
            score += math.log(pair_prob)
        return score

    winner = None
    winner_score = -math.inf
    for symbol in vocab:
        score = _log_likelihood(sequence.replace("_", symbol))
        # Strict comparison keeps the earliest candidate on ties.
        if score > winner_score:
            winner, winner_score = symbol, score

    return winner


In [7]:
# Demo: predict the single missing character marked by "_" and show the result.
# NOTE(review): the recorded prediction for this input is "क"; with only four
# training sentences the bigram statistics are very sparse, so the guess is
# probably dominated by smoothing rather than real evidence -- confirm on a
# larger corpus.
test_seq = "क_र्म"
pred_char = predict_missing(
    sequence=test_seq,
    unigram_counts=unigram_counts,
    bigram_counts=bigram_counts,
    vocab=vocab,
)

print(f"Sequence: {test_seq}")
print(f"Predicted missing character: {pred_char}")
print(f"Reconstructed: {test_seq.replace('_', pred_char)}")


Sequence: क_र्म
Predicted missing character: क
Reconstructed: ककर्म


In [8]:
def reconstruct_text_with_missing(text, vocab, unigram_counts, bigram_counts):
    """Fill every "_" gap in ``text`` with its own predicted character.

    Gaps are resolved one at a time, left to right. For each gap a short
    window ``<left neighbour>_<right neighbour>`` is handed to
    ``predict_missing``. That function scores adjacent character pairs only,
    so the neighbours of the gap are the only characters that can change the
    ranking of candidates. A left neighbour that was itself a gap has already
    been filled by the time it is used; a right neighbour that is still a gap
    is left out of the window, so the prediction relies on the left context.

    Filling gaps individually avoids replacing all "_" characters with the
    single character predicted for the text as a whole.

    Args:
        text: Input string in which "_" marks a missing character.
        vocab: Candidate tokens passed to ``predict_missing``.
        unigram_counts: Unigram counts passed to ``predict_missing``.
        bigram_counts: Bigram counts passed to ``predict_missing``.

    Returns:
        ``text`` with each gap replaced by its prediction. A gap is left as
        "_" if ``predict_missing`` returns ``None`` (empty ``vocab``). Text
        without any "_" is returned unchanged.
    """
    chars = list(text)
    for i, ch in enumerate(chars):
        if ch != "_":
            continue
        left = chars[i - 1] if i > 0 else ""
        has_known_right = i + 1 < len(chars) and chars[i + 1] != "_"
        right = chars[i + 1] if has_known_right else ""
        pred = predict_missing(left + "_" + right, unigram_counts, bigram_counts, vocab)
        if pred is not None:
            chars[i] = pred
    return "".join(chars)

print(reconstruct_text_with_missing("पु_ष", vocab, unigram_counts, bigram_counts))


पुषष
