In [1]:
# Importing the necessary libraries and modules
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Load the dataset into a DataFrame. The file name is a relative path, so the
# CSV must be located in the notebook's working directory.
df = pd.read_csv("Language Detection.csv")

In [None]:
# Preview the first five rows of the dataset
df.head(n=5)

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

(10337, 2)

In [None]:
# Count the missing values in each column
df.isna().sum()

Text        0
Language    0
dtype: int64

In [None]:
# Show the column labels of the DataFrame
print(df.columns)

Index(['Text', 'Language'], dtype='object')


In [None]:
# Count how many rows belong to each value of the 'Language' column
# (value_counts lists the most frequent value first).
df["Language"].value_counts()

Language
English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: count, dtype: int64

In [None]:
# Finding all unique languages in the 'Language' column of the DataFrame
languages = df['Language'].unique()

# The number of distinct languages is simply the length of that array
number_of_languages = len(languages)

# Show the distinct languages, then how many there are (one per line)
print(languages, number_of_languages, sep="\n")

['English' 'Malayalam' 'Hindi' 'Tamil' 'Portugeese' 'French' 'Dutch'
 'Spanish' 'Greek' 'Russian' 'Danish' 'Italian' 'Turkish' 'Sweedish'
 'Arabic' 'German' 'Kannada']
17


In [None]:
# Pull the features (raw text) and the target (language label) out of the
# DataFrame as NumPy arrays.
x = np.array(df['Text'])
y = np.array(df['Language'])

In [None]:
# Convert the raw text of the 'Text' column into a sparse matrix of token counts.
# The vectorizer reads straight from df['Text'] instead of from `x`, so this cell
# can be re-run safely: with `x` as both input and output, a second run would feed
# the already-vectorized matrix back into fit_transform and fail.
# NOTE(review): the vocabulary is learned from every row, including the rows that
# the later train/test split holds out for testing - consider fitting the
# vectorizer on the training text only.
cv = CountVectorizer()
x = cv.fit_transform(df['Text'])

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Create an (unfitted) Multinomial Naive Bayes classifier with scikit-learn's
# default settings; this model is suited to classification with discrete
# features such as the token counts produced by CountVectorizer.
model = MultinomialNB()

In [None]:
# Train the Multinomial Naive Bayes classifier on the training portion of the data
model.fit(X=xtrain, y=ytrain)

In [None]:
# Predict a language label for every row of the held-out test set
ypred = model.predict(xtest)

In [None]:
# Show the true test-set labels for a visual comparison with the predictions
ytest

array(['Russian', 'Italian', 'English', ..., 'Dutch', 'Greek', 'French'],
      dtype=object)

In [None]:
# Show the labels the model predicted for the test set
ypred

array(['Russian', 'Italian', 'English', ..., 'Dutch', 'Greek', 'French'],
      dtype='<U10')

In [None]:
# Overall accuracy of the predictions on the test set
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)

# Per-language precision, recall and F1 score as a formatted text report
report = classification_report(y_true=ytest, y_pred=ypred)

In [None]:
# Print the overall accuracy (the fraction of test rows labelled correctly)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9825918762088974


In [None]:
# Print a heading followed by the classification report on the next line
print("Classification Report:", report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

      Arabic       1.00      0.98      0.99       106
      Danish       0.97      0.96      0.97        73
       Dutch       0.98      0.97      0.98       111
     English       0.92      1.00      0.96       291
      French       0.99      0.99      0.99       219
      German       1.00      0.97      0.98        93
       Greek       1.00      0.99      0.99        68
       Hindi       1.00      1.00      1.00        10
     Italian       1.00      0.99      1.00       145
     Kannada       1.00      1.00      1.00        66
   Malayalam       1.00      0.98      0.99       121
  Portugeese       0.99      0.98      0.99       144
     Russian       1.00      0.99      1.00       136
     Spanish       0.99      0.97      0.98       160
    Sweedish       1.00      0.98      0.99       133
       Tamil       1.00      0.99      0.99        87
     Turkish       1.00      0.93      0.97       105

   

In [None]:
# Read a piece of text from the user, turn it into a token-count vector with the
# fitted CountVectorizer (cv), and predict its language with the trained
# Multinomial Naive Bayes model.
user = input("Enter a Text in any language would you like: ")
data = cv.transform([user]).toarray()

if data.any():
    output = model.predict(data)
    print(output)
else:
    # An all-zero vector means the input was empty or contained no token from the
    # training vocabulary. The model would then fall back to its class priors and
    # return the same label for any such input, so report the problem instead of
    # printing a prediction that does not depend on the text.
    output = None
    print("No words from the input were seen during training, so no language can be predicted.")

Enter a Text in any language would you like: hi
['English']
