In [1]:
# Disable TensorFlow (force PyTorch-only)
import os
os.environ["USE_TF"] = "0"

from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

from transformers import pipeline

print("All imports successful ✅")

PROJECT_ROOT = Path.cwd().resolve().parents[0] if Path.cwd().name == "notebooks" else Path.cwd()
DATA_ROOT = PROJECT_ROOT / "data"
print("Data root:", DATA_ROOT)


All imports successful ✅
Data root: /Users/jyotirmoy/Desktop/Image/ancient-script-ai/data


In [2]:
data = {
    "text": [
        # Sanskrit
        "कर्मणा जायते पुरुषः", "ज्ञानं योगेन साध्यते", "धर्मो रक्षति रक्षितः", "सत्यमेव जयते", "विद्या ददाति विनयं",
        "योगः कर्मसु कौशलम्", "अहं ब्रह्मास्मि", "तत्त्वमसि", "शिवोऽहम्", "सर्वं खल्विदं ब्रह्म",
        # Hindi
        "मेरा घर बड़ा है", "वह बाज़ार गया था", "आज मौसम अच्छा है", "राम मंदिर बहुत सुंदर है", "मैं स्कूल जा रहा हूँ",
        "उसका नाम गीता है", "मैंने खाना खाया", "वह पढ़ाई कर रहा है", "यह मेरा दोस्त है", "मैं सोने जा रहा हूँ",
        # Marathi
        "माझे घर मोठे आहे", "तो बाजारात गेला होता", "आज हवा छान आहे", "राम मंदिर सुंदर आहे", "मी शाळेत जात आहे",
        "त्याचं नाव गीता आहे", "मी जेवलो आहे", "तो अभ्यास करत आहे", "हा माझा मित्र आहे", "मी झोपायला जात आहे"
    ],
    "language": ["sanskrit"]*10 + ["hindi"]*10 + ["marathi"]*10
}

lang_df = pd.DataFrame(data)
print("Dataset size:", len(lang_df))
lang_df.sample(5)


Dataset size: 30


Unnamed: 0,text,language
2,धर्मो रक्षति रक्षितः,sanskrit
29,मी झोपायला जात आहे,marathi
13,राम मंदिर बहुत सुंदर है,hindi
20,माझे घर मोठे आहे,marathi
15,उसका नाम गीता है,hindi


In [3]:
# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(
    lang_df["text"], lang_df["language"], test_size=0.3, random_state=42
)

# Extract character-level features (n-grams)
vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, 3))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Naive Bayes
lang_clf = MultinomialNB()
lang_clf.fit(X_train_vec, y_train)

# Evaluate
y_pred = lang_clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8888888888888888
              precision    recall  f1-score   support

       hindi       0.75      1.00      0.86         3
     marathi       1.00      0.75      0.86         4
    sanskrit       1.00      1.00      1.00         2

    accuracy                           0.89         9
   macro avg       0.92      0.92      0.90         9
weighted avg       0.92      0.89      0.89         9



In [4]:
def detect_language(text: str):
    """
    Detect language from input text using trained Naive Bayes classifier.
    Returns (predicted_language, confidence).
    """
    X = vectorizer.transform([text])
    probs = lang_clf.predict_proba(X)[0]
    classes = lang_clf.classes_
    pred_lang = classes[np.argmax(probs)]
    confidence = np.max(probs)
    return pred_lang, confidence


# Quick test
samples = ["कर्मणा जायते पुरुषः", "मेरा घर बड़ा है", "माझे घर मोठे आहे"]
for s in samples:
    lang, conf = detect_language(s)
    print(f"{s} → {lang} ({conf:.2f})")


कर्मणा जायते पुरुषः → sanskrit (1.00)
मेरा घर बड़ा है → hindi (1.00)
माझे घर मोठे आहे → marathi (1.00)


In [5]:
from transformers import pipeline

translator = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-mul-en",
    framework="pt"
)

def translate_text(text, lang):
    if lang in ["hindi", "marathi", "sanskrit"]:
        result = translator(text, clean_up_tokenization_spaces=True)
        return result[0]["translation_text"]
    else:
        return text

test_sentences = ["मेरा घर बड़ा है", "कर्मणा जायते पुरुषः", "माझे घर मोठे आहे"]

for t in test_sentences:
    lang, conf = detect_language(t)
    translation = translate_text(t, lang)
    print(f"\nText: {t}\nDetected: {lang} ({conf:.2f})\nEnglish: {translation}")


Device set to use mps:0



Text: मेरा घर बड़ा है
Detected: hindi (1.00)
English: My house is big.

Text: कर्मणा जायते पुरुषः
Detected: sanskrit (1.00)
English: Men to be worked:

Text: माझे घर मोठे आहे
Detected: marathi (1.00)
English: My home is large


In [6]:
def full_pipeline(text):
    lang, conf = detect_language(text)
    if conf < 0.6:
        return {"text": text, "status": "low_confidence"}
    english = translate_text(text, lang)
    return {
        "input_text": text,
        "language": lang,
        "confidence": round(conf, 3),
        "translation": english
    }


# Example
result = full_pipeline("मेरा घर बड़ा है")
print(result)


{'input_text': 'मेरा घर बड़ा है', 'language': np.str_('hindi'), 'confidence': np.float64(1.0), 'translation': 'My house is big.'}


In [7]:
import joblib

models_dir = PROJECT_ROOT / "models"
models_dir.mkdir(exist_ok=True)

joblib.dump(lang_clf, models_dir / "language_id_model.pkl")
joblib.dump(vectorizer, models_dir / "language_vectorizer.pkl")

print("Saved models to:", models_dir)


Saved models to: /Users/jyotirmoy/Desktop/Image/ancient-script-ai/models
