## Language Detection

Download the Dataset from https://www.kaggle.com/datasets/basilb2s/language-detection, then extract it and upload the csv file to this Notebook.


In [2]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
import pickle

import warnings
warnings.simplefilter("ignore")

In [3]:
# Loading the dataset
data = pd.read_csv("Language Detection.csv")

In [4]:
data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [5]:
data.shape

(10337, 2)

In [6]:
data["Language"].unique()

array(['English', 'Malayalam', 'Hindi', 'Tamil', 'Portugeese', 'French',
       'Dutch', 'Spanish', 'Greek', 'Russian', 'Danish', 'Italian',
       'Turkish', 'Sweedish', 'Arabic', 'German', 'Kannada'], dtype=object)

In [18]:
# value count for each language
data["Language"].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

In [7]:
X = data["Text"]
y = data["Language"]

In [8]:
X

0         Nature, in the broadest sense, is the natural...
1        "Nature" can refer to the phenomena of the phy...
2        The study of nature is a large, if not the onl...
3        Although humans are part of nature, human acti...
4        [1] The word nature is borrowed from the Old F...
                               ...                        
10332    ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333    ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334    ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...
10335    ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...
10336    ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...
Name: Text, Length: 10337, dtype: object

In [9]:
y

0        English
1        English
2        English
3        English
4        English
          ...   
10332    Kannada
10333    Kannada
10334    Kannada
10335    Kannada
10336    Kannada
Name: Language, Length: 10337, dtype: object

In [10]:
# converting categorical variables to numerical
le = LabelEncoder()
y = le.fit_transform(y)

In [11]:
y

array([3, 3, 3, ..., 9, 9, 9])

In [12]:
le.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [13]:
# text preprocessing
data_list = []
for text in X:
    text = re.sub(r'[!@#$(),\n"%^*?\:;~`0-9]', ' ', text)
    text = re.sub(r'[[]]', ' ', text)
    text = text.lower()
    data_list.append(text)

In [14]:
data_list

[' nature  in the broadest sense  is the natural  physical  material world or universe.',
 ' nature  can refer to the phenomena of the physical world  and also to life in general.',
 'the study of nature is a large  if not the only  part of science.',
 'although humans are part of nature  human activity is often understood as a separate category from other natural phenomena.',
 '[ ] the word nature is borrowed from the old french nature and is derived from the latin word natura  or  essential qualities  innate disposition   and in ancient times  literally meant  birth .',
 '[ ] in ancient philosophy  natura is mostly used as the latin translation of the greek word physis  φύσις   which originally related to the intrinsic characteristics that plants  animals  and other features of the world develop of their own accord.',
 '[ ][ ]  the concept of nature as a whole  the physical universe  is one of several expansions of the original notion [ ] it began with certain core applications of th

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

In [16]:
# creating bag of words using countvectorizer

cv = CountVectorizer()
cv.fit(X_train)

x_train = cv.transform(X_train).toarray()
x_test  = cv.transform(X_test).toarray()

In [17]:
model = MultinomialNB()
model.fit(x_train, y_train)

MultinomialNB()

In [19]:
y_pred = model.predict(x_test)

In [20]:
ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

In [21]:
print("Accuracy is :", ac)

Accuracy is : 0.9729206963249516


In [22]:
print(cr)

              precision    recall  f1-score   support

           0       1.00      0.94      0.97       104
           1       1.00      0.93      0.97        92
           2       1.00      0.97      0.99       120
           3       0.85      1.00      0.92       272
           4       0.98      0.98      0.98       173
           5       1.00      0.98      0.99       114
           6       1.00      0.99      0.99        75
           7       1.00      0.93      0.97        15
           8       1.00      0.98      0.99       145
           9       1.00      0.95      0.98        88
          10       1.00      0.99      1.00       114
          11       0.99      0.99      0.99       147
          12       0.99      0.98      0.99       135
          13       0.99      0.97      0.98       149
          14       0.98      0.98      0.98       122
          15       1.00      0.98      0.99        81
          16       1.00      0.91      0.95       122

    accuracy              

In [23]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([('vectorizer', cv), ('multinomialNB', model)])
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('multinomialNB', MultinomialNB())])

In [24]:
y_pred2 = pipe.predict(X_test)

In [25]:
ac2 = accuracy_score(y_test, y_pred2)
print("Accuracy is :", ac2)

Accuracy is : 0.9729206963249516


In [26]:
with open('trained_pipeline-0.1.0.pkl','wb') as f:
    pickle.dump(pipe, f)

In [31]:
text1 = "Hello, how are you?"
text2 = "Ciao, come stai?"

lang = pipe.predict([text1])
print(le.classes_[lang[0]], lang)

lang = pipe.predict([text2])
print(le.classes_[lang[0]], lang)

English [3]
Italian [8]


In [None]:
# If you need to zip and download a folder, you can do it with this command:

# !zip -r ./trained_pipeline-0.1.0.pkl.zip ./trained_pipeline-0.1.0.pkl