In [32]:
import pandas as pd
import numpy as np
import re

In [33]:
# Read the labelled corpus from disk; change the path if your CSV is named differently.
df = pd.read_csv(filepath_or_buffer="language.csv")
# Preview the first five rows.
df.head(5)

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [34]:
def clean_text(text):
    """Normalise a piece of text before vectorisation.

    The text is lower-cased and every run of whitespace (spaces, tabs,
    newlines, ...) is collapsed into a single space character.

    Parameters
    ----------
    text : str or any
        The raw text. ``None`` and float ``NaN`` (what pandas produces for
        empty CSV cells) are treated as missing and yield an empty string;
        any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The normalised text.
    """
    # `text != text` is only true for NaN, so this detects missing values
    # without needing pandas inside the helper.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    return re.sub(r'\s+', ' ', text.lower())

# Normalise every document element-wise and write the result back to the same column.
df['Text'] = df['Text'].map(clean_text)

In [35]:
# Features are the cleaned documents; targets are their language labels.
X, y = df['Text'], df['language']

In [36]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over character n-grams of length 2 to 4.
vectorizer = TfidfVectorizer(ngram_range=(2, 4), analyzer='char')

# Learn the vocabulary and IDF weights from the training split only,
# then reuse the fitted vectorizer unchanged on the held-out split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [38]:
from sklearn.naive_bayes import MultinomialNB

# `fit` returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X_train_vec, y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed.
model

In [39]:
from sklearn.metrics import accuracy_score, classification_report

# Score the classifier on the held-out split.
y_pred = model.predict(X_test_vec)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print(report)

Accuracy: 0.975
              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       202
     Chinese       0.99      0.90      0.94       201
       Dutch       0.99      0.98      0.98       230
     English       0.71      1.00      0.83       194
    Estonian       1.00      0.95      0.97       200
      French       0.95      0.99      0.97       188
       Hindi       1.00      0.99      0.99       208
  Indonesian       0.99      0.98      0.99       213
    Japanese       1.00      0.98      0.99       194
      Korean       1.00      0.99      1.00       190
       Latin       0.98      0.90      0.94       210
     Persian       0.99      0.99      0.99       196
   Portugese       0.99      0.95      0.97       194
      Pushto       1.00      0.96      0.98       196
    Romanian       1.00      0.97      0.99       197
     Russian       0.99      1.00      0.99       213
     Spanish       0.98      0.98      0.98       199
     Swedis

In [40]:
def predict_language(text):
    """Predict the language label for a single piece of raw text.

    The input is passed through ``clean_text``, converted to features with
    the already-fitted ``vectorizer``, and classified by ``model``.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    The first (and only) element of the array returned by ``model.predict``
    for this input, i.e. the predicted language label.
    """
    cleaned = clean_text(text)
    # The vectorizer expects an iterable of documents, so wrap the single
    # string in a one-element list.
    features = vectorizer.transform([cleaned])
    return model.predict(features)[0]

In [44]:


# Sanity check with a short Dutch sentence.
predict_language("straat en gebouw in nederland")
# Expected prediction: Dutch

np.str_('Dutch')

In [43]:
# Sanity check with a short English sentence.
predict_language("This is a machine learning project")
# Expected prediction: English

np.str_('English')

In [45]:
# Sanity check with a sentence written in Tamil script.
predict_language("நான் மெஷின் லெர்னிங் கற்றுக்கொள்கிறேன்")
# Expected prediction: Tamil



np.str_('Tamil')

In [46]:
import joblib

# Persist both fitted artefacts: predictions need the classifier and the
# vectorizer that produces its input features.
joblib.dump(value=model, filename="language_model.pkl")
joblib.dump(value=vectorizer, filename="tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']