In [3]:
import pandas as pd
import numpy as np
import re
import gc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [4]:
dataset_path = "sentences.csv" 
df = pd.read_csv(dataset_path, delimiter='\t', names=['id', 'lang', 'text'], dtype={'id': str, 'lang': str, 'text': str})

In [5]:
df.head(5)

Unnamed: 0,id,lang,text
0,1,cmn,我們試試看！
1,2,cmn,我该去睡觉了。
2,3,cmn,你在干什麼啊？
3,4,cmn,這是什麼啊？
4,5,cmn,今天是６月１８号，也是Muiriel的生日！


In [6]:
target_languages = [
    "eng", "hin", "urd", "ara", "tel", "tam", "ben", "mar", "guj", "pan", "kan", "mal", "nep", "asm", "ori", "san",
    "fra", "spa", "deu", "rus", "ita", "por", "tur", "jpn", "kor", "zho", "vie", "tha", "ind", "fil", "pol", "nld",
    "swe", "dan", "nor", "fin", "ell", "heb", "fas", "ukr", "hun", "ces", "slk", "bul", "ron", "srp", "hrv", "bos",
    "lit", "lav", "est", "isl", "mlt", "aze", "geo", "arm", "kaz", "uzb", "tgk", "kur", "swh", "amh", "yor", "hau",
    "ibo", "zul", "xho", "sna", "tso", "tsn", "sot", "afr", "cat", "glg", "eus", "oci", "grn", "tat", "mon", "khm",
    "lao", "bur", "sin", "dhivehi"
] 
df = df[df['lang'].isin(target_languages)]

In [7]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.strip()

df['text'] = df['text'].apply(clean_text)


In [8]:
# TF-IDF Vectorization (Optimized)
vectorizer = TfidfVectorizer(max_features=10000, stop_words=None)  # Increase features for better accuracy
X = vectorizer.fit_transform(df['text']).astype('float32')
y = df['lang']

In [9]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [10]:
# Train Naïve Bayes Classifier
model = MultinomialNB()
model.fit(X_train, y_train)

In [11]:
# Predictions
y_pred = model.predict(X_test)

In [12]:
# Model Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.902091503616475


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

         afr       1.00      0.45      0.62       877
         amh       0.00      0.00      0.00        61
         ara       1.00      0.73      0.85     13153
         asm       0.84      0.51      0.63      2678
         aze       1.00      0.11      0.21      1164
         ben       0.86      0.64      0.73      3151
         bos       0.00      0.00      0.00       347
         bul       0.88      0.60      0.72      5046
         cat       0.98      0.41      0.58      1926
         ces       0.88      0.66      0.76     16512
         dan       0.90      0.85      0.88     12961
         deu       0.98      1.00      0.99    143064
         ell       1.00      0.90      0.95      8006
         eng       0.74      1.00      0.85    393449
         est       1.00      0.22      0.36      1221
         eus       1.00      0.17      0.28      1265
         fin       0.96      0.79      0.87     30166
         fra       0.95    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
import joblib

# Save model
joblib.dump(model, "language_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']