In [None]:
import pickle  # Saving / loading the trained model and the fitted vectorizer

import numpy as np
import pandas as pd  # Data loading and cleaning
# CountVectorizer converts text into numeric token-count features.
# sklearn.feature_extraction is the scikit-learn module for extracting features;
# we work on text here, so we use its .text submodule.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Load the labelled samples from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="language.csv")

In [None]:
# Preview the first five rows (five is the default for head()).
data.head()

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


## Structure of the Project
## Language (Text) --> Numbers (token counts) --> Model (using an ML algorithm) --> Train the Model --> Testing

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Count the missing values in each column.
data.isna().sum()

Text        0
language    0
dtype: int64

In [None]:
# Number of rows per language label, to check how balanced the classes are.
data["language"].value_counts()

language
Estonian      1000
Swedish       1000
Thai          1000
Tamil         1000
Dutch         1000
Japanese      1000
Turkish       1000
Latin         1000
Urdu          1000
Indonesian    1000
Portugese     1000
French        1000
Chinese       1000
Korean        1000
Hindi         1000
Spanish       1000
Pushto        1000
Persian       1000
Romanian      1000
Russian       1000
English       1000
Arabic        1000
Name: count, dtype: int64

In [None]:
# Pull the text column (inputs) and the language column (labels) out of the
# DataFrame as NumPy arrays.
x, y = (np.array(data[column]) for column in ("Text", "language"))

In [None]:
# Turn the text array `x` into a matrix of token counts.
cv = CountVectorizer()
# NOTE(review): the vocabulary is learned from the whole corpus here, i.e. before
# the train/test split performed later in the notebook, so words that occur only
# in test rows still shape the feature space. Consider splitting first and
# fitting the vectorizer on the training rows only.
X = cv.fit_transform(x)

In [None]:
# Hold out part of the data for evaluation.
#   X               -- the features (input data) of the dataset
#   y               -- the labels (output data) associated with the features
#   test_size=0.40  -- 40% of the rows go to the test set, the remaining 60% to the training set
#   random_state=42 -- seed for the random number generator, so the same split is produced on every run
# These notes are `#` comments rather than a trailing triple-quoted string: as the
# last expression of a cell, a bare string is echoed back as the cell's output.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)

In [None]:
# Build the classifier: multinomial Naive Bayes.
# alpha=1.0 is the library default smoothing value, spelled out here for readers.
model = MultinomialNB(alpha=1.0)

In [None]:
# Train the classifier on the training split.
model.fit(X_train, y_train)

In [None]:
# Mean accuracy of the trained classifier on the held-out test split.
model.score(X_test, y_test)

0.95375

In [None]:
# Predicted language label for every row of the test split.
y_pred = model.predict(X_test)

In [None]:
# Evaluate with metrics: overall accuracy, then per-language precision, recall,
# F1 and support. accuracy_score and classification_report are imported in the
# notebook's top import cell rather than here, so a fresh Run All shows every
# dependency up front.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.95375

Classification Report:
               precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       403
     Chinese       0.92      0.51      0.65       384
       Dutch       0.99      0.99      0.99       419
     English       0.69      1.00      0.82       397
    Estonian       0.99      0.96      0.98       407
      French       0.95      0.99      0.97       394
       Hindi       1.00      0.98      0.99       406
  Indonesian       0.99      0.97      0.98       381
    Japanese       0.70      0.85      0.77       403
      Korean       1.00      0.98      0.99       403
       Latin       0.99      0.92      0.95       415
     Persian       1.00      1.00      1.00       391
   Portugese       0.99      0.96      0.97       391
      Pushto       1.00      0.97      0.98       398
    Romanian       0.99      0.99      0.99       404
     Russian       0.99      0.99      0.99       385
     Spanish       0.98      0.99     

In [None]:
# Persist the trained classifier and the fitted vectorizer so they can be reused
# without retraining. Both are needed at prediction time: the vectorizer maps new
# text onto the same feature columns the classifier was trained on.
# (`pickle` is imported in the notebook's top import cell.)
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(cv, vectorizer_file)
    

In [None]:
# Read the classifier and the vectorizer back from disk, rebinding `model` and
# `cv` to the unpickled objects.
# Caution: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

with open("vectorizer.pkl", "rb") as vectorizer_file:
    cv = pickle.load(vectorizer_file)


In [None]:
# Classify a sentence typed in by the user.
user = input("Enter the text: ")
# The features get their own name: binding them to `data` would overwrite the
# DataFrame loaded at the top of the notebook and break earlier cells on re-run.
user_features = cv.transform([user])
# predict() accepts the sparse matrix returned by transform() directly, so no
# dense copy is made first.
op = model.predict(user_features)
print(op)

['English']
