In [1]:
import pandas as pd
import unicodedata
from pathlib import Path

PROJECT_ROOT = Path.cwd().resolve().parents[0] if Path.cwd().name == "notebooks" else Path.cwd()
DATA_ROOT = PROJECT_ROOT / "data"
PHONETIC_PATH = DATA_ROOT / "phonetic_mappings"/"all_script_phonetic_data.csv"

print("Phonetic file:", PHONETIC_PATH)


Phonetic file: /Users/jyotirmoy/Desktop/Image/ancient-script-ai/data/phonetic_mappings/all_script_phonetic_data.csv


In [2]:
# Read CSV
phonetic_df = pd.read_csv(PHONETIC_PATH)

# Show first few rows
phonetic_df.head(10)


Unnamed: 0,Unicode,Relative Offset,Devanagari,ITRANS,Notes,Valid Vector Representation,is_vowel,is_consonant,nukta,halanta,...,not_nasal,front,central,back,close,close-mid,open-mid,open,rounded,not_rounded
0,900,0,ऀ,ऀ,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,901,1,ँ,.n,,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,902,2,ं,.n,,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,903,3,ः,H,Should represent as pure aspiration and not as...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,904,4,ऄ,ऄ,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,905,5,अ,a,,1,1,0,0,0,...,1,0,0,1,0,0,0,1,0,1
6,906,6,आ,A,,1,1,0,0,0,...,1,0,0,1,0,0,0,1,0,1
7,907,7,इ,i,,1,1,0,0,0,...,1,1,0,0,1,0,0,0,0,1
8,908,8,ई,I,,1,1,0,0,0,...,1,1,0,0,1,0,0,0,0,1
9,909,9,उ,u,,1,1,0,0,0,...,1,0,0,1,1,0,0,0,1,0


In [4]:
from pathlib import Path

DATA_ROOT = Path("../data")  # adjust if notebook is already inside notebooks/
resource_dir = DATA_ROOT / "phonetic_mappings"

print("Checking contents of:", resource_dir)
print("-" * 60)

if resource_dir.exists():
    for p in resource_dir.iterdir():
        print(p.name)
else:
    print("⚠️ Folder not found:", resource_dir)


Checking contents of: ../data/phonetic_mappings
------------------------------------------------------------
english_arpabet_list.csv
tamil_script_phonetic_data.xlsx
english_script_phonetic_data.xlsx
english_script_phonetic_data.csv
devanagari_itrans_mapping.json
all_script_phonetic_data.xlsx
arpabet.pdf
tamil_script_phonetic_data.csv
.git
all_script_phonetic_data.csv


In [7]:
# Drop rows where Devanagari is NaN
phonetic_df = phonetic_df.dropna(subset=["Devanagari", "ITRANS"])

# Build dictionaries
dev_to_itrans = dict(zip(phonetic_df["Devanagari"], phonetic_df["ITRANS"]))
itrans_to_dev = {v: k for k, v in dev_to_itrans.items()}

print("Sample mappings:")
for i, (k, v) in enumerate(dev_to_itrans.items()):
    print(f"{k}  ->  {v}")
    if i > 10: break



Sample mappings:
ऀ  ->  ऀ
ँ  ->  .n
ं  ->  .n
ः  ->  H
ऄ  ->  ऄ
अ  ->  a
आ  ->  A
इ  ->  i
ई  ->  I
उ  ->  u
ऊ  ->  uu
ऋ  ->  R^i


In [8]:
def devanagari_to_itrans(text: str) -> str:
    """
    Convert a Devanagari string into ITRANS / Latin phonetic.
    """
    result = []
    for char in text:
        if char in dev_to_itrans:
            result.append(dev_to_itrans[char])
        else:
            # Keep punctuation / spaces / unknowns as-is
            result.append(char)
    return "".join(result)


In [9]:
sample_texts = [
    "कर्मणा जायते",
    "मेरा घर",
    "धम्मपद"
]

for t in sample_texts:
    print(f"{t}  →  {devanagari_to_itrans(t)}")


कर्मणा जायते  →  kara्maNaA jaAyatae
मेरा घर  →  maeraA ghara
धम्मपद  →  dhama्mapada


In [10]:
# Suppose your OCR predicted these class names:
ocr_pred_classes = ["character_1_ka", "character_7_ma", "character_8_ra"]

# Extract the Devanagari symbol (optional manual mapping)
# e.g., you can map class names → actual Devanagari chars using your own list
# For demonstration:
class_to_char = {
    "character_1_ka": "क",
    "character_7_ma": "म",
    "character_8_ra": "र",
}

predicted_text = "".join([class_to_char[c] for c in ocr_pred_classes])
print("OCR predicted (Devanagari):", predicted_text)

print("Transliterated (ITRANS):", devanagari_to_itrans(predicted_text))


OCR predicted (Devanagari): कमर
Transliterated (ITRANS): kamara


In [11]:
import json

mapping_out = {
    "dev_to_itrans": dev_to_itrans,
    "itrans_to_dev": itrans_to_dev,
}

out_path = DATA_ROOT / "phonetic_mappings" / "devanagari_itrans_mapping.json"
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(mapping_out, f, ensure_ascii=False, indent=2)

print("Saved transliteration mapping to:", out_path)


Saved transliteration mapping to: ../data/phonetic_mappings/devanagari_itrans_mapping.json
