In [5]:
# Language Detection with WiLI-2018 and Naive Bayes

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib
from google.colab import drive

In [6]:
# Load Dataset
# The CSV is assumed to expose two columns: "Text" and "language".
path = '/content/drive/MyDrive/Colab Notebooks/sklearn_project/dataset.csv'
drive.mount('/content/drive')  # the Drive mount must exist before the read below
df = pd.read_csv(filepath_or_buffer=path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Optional Filtering (e.g., limit to top N languages for faster experimentation)
# Keep only the rows whose label is among the 50 most frequent languages.
language_counts = df['language'].value_counts()
top_langs = language_counts.nlargest(50).index
df = df.loc[df['language'].isin(top_langs)]

In [8]:
# Train-Test Split
# Hold out 20% of the rows for evaluation. The split is stratified on the
# language label and uses a fixed seed (42) so it is repeatable.
texts, labels = df['Text'], df['language']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [9]:
# Build a Pipeline (Vectorization + Classifier)
# Step 1: TF-IDF over character n-grams of length 2 to 5 ('char_wb' analyzer).
# Step 2: multinomial Naive Bayes on those features.
char_ngram_tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 5))
naive_bayes = MultinomialNB()
pipeline = make_pipeline(char_ngram_tfidf, naive_bayes)

In [10]:
# Train the Model
# One call fits every step of the pipeline on the training split.
pipeline.fit(X=X_train, y=y_train)

In [11]:
# Evaluate
# Predict on the held-out split, then print overall accuracy followed by the
# per-language precision / recall / F1 table.
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# The report is embedded in a single string instead of being passed as a second
# print() argument: print() joins its arguments with a space, which would shift
# the report's header row one column to the right of the rows beneath it.
print(f"\nClassification Report:\n{classification_report(y_test, y_pred)}")

Accuracy: 0.9722727272727273

Classification Report:
               precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       200
     Chinese       0.99      0.87      0.93       200
       Dutch       0.98      0.97      0.98       200
     English       0.68      1.00      0.81       200
    Estonian       1.00      0.95      0.98       200
      French       0.95      0.99      0.97       200
       Hindi       1.00      0.96      0.98       200
  Indonesian       0.99      0.97      0.98       200
    Japanese       1.00      0.97      0.98       200
      Korean       1.00      0.98      0.99       200
       Latin       0.97      0.92      0.94       200
     Persian       0.99      0.99      0.99       200
   Portugese       0.98      0.95      0.97       200
      Pushto       1.00      0.95      0.97       200
    Romanian       1.00      0.98      0.99       200
     Russian       0.99      0.99      0.99       200
     Spanish       0.99    

In [12]:
# Save the Trained Model
# Serialize the whole fitted pipeline to a file; the path is relative, so it
# lands in the current working directory.
joblib.dump(value=pipeline, filename="language_detector_wili.joblib")

['language_detector_wili.joblib']

In [13]:
#Inference Example
def predict_sentences(sentences, model=None):
    """Print the predicted language for each sentence.

    Args:
        sentences: A single string, or any iterable of strings (list, tuple,
            generator, ...). A bare string is treated as one sentence.
        model: Object exposing ``predict(list_of_str)``. When omitted, the
            notebook-level ``pipeline`` is used.

    Returns:
        None. One line per sentence is printed as ``'<sentence>' → <label>``.
    """
    if isinstance(sentences, str):
        # A lone string is one sentence, not an iterable of characters.
        sentences = [sentences]
    else:
        # Materialize once: the input is iterated twice (predict + zip), so a
        # generator would otherwise be exhausted before the printing loop.
        sentences = list(sentences)

    if not sentences:
        return  # nothing to classify

    # `pipeline` is only looked up when no explicit model was supplied.
    classifier = pipeline if model is None else model
    preds = classifier.predict(sentences)
    for s, l in zip(sentences, preds):
        print(f"'{s}' → {l}")

In [14]:
# Try the detector on a few short sample sentences.
sample_sentences = [
    "Bonjour tout le monde",
    "Hello, how are you?",
    "नमस्ते, आप कैसे हैं?",
]
predict_sentences(sample_sentences)

'Bonjour tout le monde' → French
'Hello, how are you?' → English
'नमस्ते, आप कैसे हैं?' → Hindi
