In [1]:
# https://www.kaggle.com/datasets/basilb2s/language-detection

In [2]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import warnings

warnings.simplefilter("ignore")

In [3]:
# Loading the dataset
data = pd.read_csv("Language Detection.csv")

In [None]:
# Analize the dataset head
data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [None]:
# Define the labels and features (x, y)
# The "Text" is the feature "x"
x = data["Text"]
# And the "Language" is the label "y"
y = data["Language"]

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# For each label "y" apply a number position starting from 0
y = le.fit_transform(y)

In [None]:
le.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [None]:
data_list = []
for text in x:
    # String normalization of the text
    text = re.sub(r'[!@#$(),\n"%^*?\:;~`0-9]', ' ', text)
    text = re.sub(r'[[]]', ' ', text)
    text = text.lower()
    data_list.append(text)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.20)

In [None]:
# creating bag of words using countvectorizer
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cv.fit(X_train)
x_train = cv.transform(X_train).toarray()
x_test = cv.transform(X_test).toarray()

In [None]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x_train, y_train)

MultinomialNB()

In [None]:
y_pred = model.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

In [None]:
print("Accuracy is: ", ac)

Accuracy is:  0.9787234042553191


In [None]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([('vectorizer', cv), ('multinomialNB', model)])
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('multinomialNB', MultinomialNB())])

In [None]:
y_pred2 = pipe.predict(X_test)
ac2 = accuracy_score(y_test, y_pred2)
print("Accuracy is: ", ac2)

Accuracy is:  0.9787234042553191


In [None]:
with open('trained_pipeline-0.1.0.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [None]:
!zip -r ./trained_pipeline-0.1.0.pkl.zip ./trained_pipeline-0.1.0.pkl

'zip' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
text = "Hello, how are you?"

y = pipe.predict([text])
le.classes_[y[0]], y

('English', array([3]))