## Language detection

Download the Dataset from https://www.kaggle.com/datasets/basilb2s/language-detection, then extract it and upload the csv file to this Notebook.

In [26]:
import pandas as pd
import numpy as np
import re 
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

import warnings
warnings.simplefilter(action="ignore")

In [27]:
# load the dataset
data = pd.read_csv("Language Detection.csv")
data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [28]:
X= data['Text']
y= data['Language']


In [29]:
#Encoding the y to understand how may qty are there and assign the number to it
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
le.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [30]:
# From X data removed special characters and lower case the text
data_list=[]
for text in X:
    text = re.sub(r'[!@#$(),\n"%^*?:;~`0-9]', ' ', text)
    text = re.sub(r'[[]]', ' ', text)
    text = text.lower()
    data_list.append(text)
X_train= pd.DataFrame(data_list)
X_train

Unnamed: 0,0
0,nature in the broadest sense is the natural...
1,nature can refer to the phenomena of the phy...
2,the study of nature is a large if not the onl...
3,although humans are part of nature human acti...
4,[ ] the word nature is borrowed from the old f...
...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...


In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [32]:
# creating bag of words using count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
cv  = CountVectorizer()
cv.fit(X_train)

x_train= cv.transform(X_train).toarray()
x_test= cv.transform(X_test).toarray()
 
x_train
x_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [33]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train, y_train)

In [34]:
y_pred=model.predict(x_test)
y_pred

array([ 3, 12,  3, ..., 13,  3,  8])

In [35]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score

ac=accuracy_score(y_test,y_pred)
cm = confusion_matrix(y_test ,y_pred)
cr = classification_report(y_test,y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy is :",ac)#Accuracy can be misleading if the data is imbalanced.
#print("Confusion Metrix:", cm)
#print("Classification_report:",cr)
print("F1 score:",f1) #Especially useful in imbalanced datasets where accuracy fails.

Accuracy is : 0.980174081237911
F1 score: 0.9802034718589206


In [36]:
from sklearn.pipeline import Pipeline

pipe=Pipeline([('vectorizer',cv),('multinominalNB',model)])
pipe.fit(X_train,y_train)

In [37]:
y_pred2= pipe.predict(X_test)
ac2= accuracy_score(y_test,y_pred2)
print("Accuracy is :",ac2)

with open('trained_pipeline-0.1.0.pkl','wb') as f:
    pickle.dump(pipe, f)

Accuracy is : 0.980174081237911


In [38]:
# if you need to zip and download a folder, you can do it with this commend:
#!zip -r ./trained _pipeline-0.1.0.pkl.zip ./trained_pipeline-0.1.0.pkl

## Prediction

In [42]:
text ="Ciao, come stai?"

y=pipe.predict([text])
print(y)
le.classes_[y[0]],y

[8]


('Italian', array([8]))