# **Language Detection**


### <b>Mounting to Google Drive</b>

In [1]:
# Mount Google Drive at /content/drive; the dataset read later in this
# notebook is addressed through a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### <b>Importing Libraries</b>

In [2]:
#Importing Libraries

import pandas as pd
import numpy as np
import re    #regular expression
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter("ignore")

import nltk

#nltk.download('punkt')
from nltk.tokenize import word_tokenize

#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

#nltk.download('stopwords')
from nltk.corpus import stopwords

### <b>Reading the Dataset</b>

In [3]:
# Location of the language-detection CSV on the mounted Drive.
DATASET_CSV = "/content/drive/MyDrive/Colab Notebooks/Kaggle/Language Detection/Language Detection.csv"

# Load the labelled sentences and preview the first rows.
data = pd.read_csv(DATASET_CSV)
data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


### <b>Data Cleaning and Exploratory Data Analysis</b>

In [4]:
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()

66

In [5]:
# Discard exact duplicate rows; the index is left as-is (not reset).
data = data.drop_duplicates()

In [6]:
# (rows, columns) of the frame after duplicates were dropped.
data.shape

(10271, 2)

In [7]:
# Column labels of the working frame.
data.columns

Index(['Text', 'Language'], dtype='object')

In [8]:
# Summary of the label column: row count, number of distinct labels,
# the most frequent label and how often it occurs.
data['Language'].describe()

count       10271
unique         17
top       English
freq         1382
Name: Language, dtype: object

In [9]:
# Distinct language labels exactly as they appear in the data. The recorded
# output includes the spellings 'Portugeese' and 'Sweedish'; the same label
# strings show up in the model predictions further down.
data['Language'].unique()

array(['English', 'Malayalam', 'Hindi', 'Tamil', 'Portugeese', 'French',
       'Dutch', 'Spanish', 'Greek', 'Russian', 'Danish', 'Italian',
       'Turkish', 'Sweedish', 'Arabic', 'German', 'Kannada'], dtype=object)

In [10]:
# Samples per language. The recorded counts are imbalanced
# (1382 English rows versus 62 Hindi rows).
data['Language'].value_counts()

English       1382
French        1007
Spanish        816
Portugeese     736
Italian        694
Russian        688
Sweedish       673
Malayalam      591
Dutch          542
Arabic         532
Turkish        471
German         465
Tamil          464
Danish         424
Kannada        366
Greek          358
Hindi           62
Name: Language, dtype: int64

In [11]:
# Number of missing values in each column.
data.isnull().sum()

Text        0
Language    0
dtype: int64

In [12]:
def clean_func(Text):
  """Normalise one raw sentence for downstream tokenisation.

  Steps, in order:
    1. Remove hashtags (a '#' followed by non-whitespace characters).
    2. Replace the listed punctuation symbols and the digits 0-9 with a space.
    3. Lower-case the result.

  Args:
    Text: the raw input string.

  Returns:
    The cleaned, lower-cased string. Whitespace is not collapsed, so runs of
    spaces may remain where symbols or digits were replaced.
  """
  # Hashtags have to go first: the symbol pass below turns '#' into a space,
  # after which a '#\S+' pattern could never match.
  Text = re.sub(r'#\S+', '', Text)
  Text = re.sub(r'[\([{})\]!@#$,"%^*?:;~`0-9]', ' ', Text)   # symbols and numbers -> space
  return Text.lower()

In [13]:
# Store a normalised copy of every sentence next to the raw text.
data['cleaned_Text'] = data["Text"].map(clean_func)
data.head()

Unnamed: 0,Text,Language,cleaned_Text
0,"Nature, in the broadest sense, is the natural...",English,nature in the broadest sense is the natural...
1,"""Nature"" can refer to the phenomena of the phy...",English,nature can refer to the phenomena of the phy...
2,"The study of nature is a large, if not the onl...",English,the study of nature is a large if not the onl...
3,"Although humans are part of nature, human acti...",English,although humans are part of nature human acti...
4,[1] The word nature is borrowed from the Old F...,English,the word nature is borrowed from the old f...


In [14]:
# Download the 'wordnet' corpus ahead of the WordNetLemmatizer cell below.
# nltk is already imported in the libraries cell; the import here is repeated.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [15]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    """Split ``text`` on whitespace and lemmatize each token as a verb.

    Args:
        text: a cleaned sentence.

    Returns:
        A list with one lemma per whitespace-separated token.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return lemmas

# The same lemmatizer is applied to every row regardless of its language.
# NOTE(review): WordNet is presumably English-only, so non-English rows would
# pass through unchanged -- confirm.
data['lemmatized'] = data['cleaned_Text'].apply(lemmatize_words)

data.head()

Unnamed: 0,Text,Language,cleaned_Text,lemmatized
0,"Nature, in the broadest sense, is the natural...",English,nature in the broadest sense is the natural...,"[nature, in, the, broadest, sense, be, the, na..."
1,"""Nature"" can refer to the phenomena of the phy...",English,nature can refer to the phenomena of the phy...,"[nature, can, refer, to, the, phenomena, of, t..."
2,"The study of nature is a large, if not the onl...",English,the study of nature is a large if not the onl...,"[the, study, of, nature, be, a, large, if, not..."
3,"Although humans are part of nature, human acti...",English,although humans are part of nature human acti...,"[although, humans, be, part, of, nature, human..."
4,[1] The word nature is borrowed from the Old F...,English,the word nature is borrowed from the old f...,"[the, word, nature, be, borrow, from, the, old..."


In [16]:
# Keep only the token lists and their labels, exposing the tokens under the
# column name 'Text' (column order: Text, Language).
df = data[['lemmatized', 'Language']].rename(columns={'lemmatized': 'Text'})

In [17]:
# First rows of the token-list frame.
df.head()

Unnamed: 0,Text,Language
0,"[nature, in, the, broadest, sense, be, the, na...",English
1,"[nature, can, refer, to, the, phenomena, of, t...",English
2,"[the, study, of, nature, be, a, large, if, not...",English
3,"[although, humans, be, part, of, nature, human...",English
4,"[the, word, nature, be, borrow, from, the, old...",English


In [18]:
# Last rows. The index was not reset after duplicates were dropped, so the
# labels shown here run past the row count reported by `data.shape`.
df.tail()

Unnamed: 0,Text,Language
10332,"[ನಿಮ್ಮ, ತಪ್ಪು, ಏನು, ಬಂದಿದೆಯೆಂದರೆ, ಆ, ದಿನದಿಂದ, ...",Kannada
10333,"[ನಾರ್ಸಿಸಾ, ತಾನು, ಮೊದಲಿಗೆ, ಹೆಣಗಾಡುತ್ತಿದ್ದ, ಮಾರ್...",Kannada
10334,"[ಹೇಗೆ, ', ನಾರ್ಸಿಸಿಸಮ್, ಈಗ, ಮರಿಯನ್, ಅವರಿಗೆ, ಸಂಭ...",Kannada
10335,"[ಅವಳು, ಈಗ, ಹೆಚ್ಚು, ಚಿನ್ನದ, ಬ್ರೆಡ್, ಬಯಸುವುದಿಲ್ಲ...",Kannada
10336,"[ಟೆರ್ರಿ, ನೀವು, ನಿಜವಾಗಿಯೂ, ಆ, ದೇವದೂತನಂತೆ, ಸ್ವಲ್...",Kannada


***

### <b>Splitting the Data into Train and Test Sets</b>

In [19]:
#Separating Independent and Dependent features
# NOTE(review): the features are the raw `data["Text"]` column. The cleaned and
# lemmatized frame `df` built above is not referenced by any later cell visible
# in this notebook -- confirm that training on raw text is intended.
x = data["Text"]
y = data["Language"]

In [20]:
# Number of distinct target classes.
y.nunique()

17

In [21]:
#train-test split
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows, stratified on the language label and seeded so
# the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=39,
    stratify=y,
)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Unigram + bigram TF-IDF with sublinear term frequency; terms seen in fewer
# than 5 training documents are dropped.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    ngram_range=(1, 2),
)

# Fit on the training text only, then densify. This rebinds `x_train` from raw
# text to the feature matrix, so the cell is not meant to be run twice.
x_train = tfidf.fit_transform(x_train).toarray()
type(x_train)

numpy.ndarray

In [23]:
# Vectorise the held-out text with the already-fitted vectorizer
# (transform only, no refit) and densify it like the training matrix.
x_test = tfidf.transform(x_test).toarray()

In [24]:
# Check that features and labels line up for both splits.
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

(7189, 5616)
(7189,)
(3082, 5616)
(3082,)




---





### <b>Model Building</b>
- **Multinomial Naive Bayes**
- **Random Forest**

In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers keyed by the name used in the printed reports.
# The forest is seeded so its scores and predictions are repeatable across
# runs; 42 is the same seed the tuned forest later in the notebook uses.
models = {
    "M_Naive_Bayes": MultinomialNB(),
    "Random_Forest": RandomForestClassifier(random_state=42),
}

In [26]:
# Fit each candidate on the same training matrix, logging progress per model.
for name in models:
    print( f'{name} training started...')
    models[name].fit(x_train, y_train)
    print(f'{name} trained.')

M_Naive_Bayes training started...
M_Naive_Bayes trained.
Random_Forest training started...
Random_Forest trained.


In [27]:
from sklearn.metrics import accuracy_score, classification_report

# Accuracy on the training split, i.e. the data the models were fitted on.
for name, model in models.items():
    train_predictions = model.predict(x_train)
    acc_score = accuracy_score(y_train, train_predictions)
    print(f'{name} accuracy score :  {acc_score}')

M_Naive_Bayes accuracy score :  0.9617471136458479
Random_Forest accuracy score :  0.9734316316594798


In [28]:
# Accuracy on the held-out split.
for name, model in models.items():
    test_predictions = model.predict(x_test)
    acc_score = accuracy_score(y_test, test_predictions)
    print(f'{name} accuracy score :  {acc_score}')

M_Naive_Bayes accuracy score :  0.9484101232965607
Random_Forest accuracy score :  0.9325113562621674


#### <b>Language Detection</b>


In [29]:
def display(input_text):
  """Print the language each classifier in ``models`` predicts for a text.

  The text is vectorised with the already-fitted ``tfidf`` as a one-sample
  batch and passed to every model; one report is printed per model.

  NOTE(review): this name shadows the ``display`` helper that IPython makes
  available in notebooks. Renaming it would need the call sites changed too.

  Args:
    input_text: the sentence to classify.
  """
  features = tfidf.transform([input_text])
  for name in models:
    predicted_language = models[name].predict(features)[0]
    print(f'{name}: \n Detected Language: {predicted_language}')

In [30]:
# Spot check; in the recorded output both models report Spanish.
display('¡Hola! ¿Cómo estás?')

M_Naive_Bayes: 
 Detected Language: Spanish
Random_Forest: 
 Detected Language: Spanish


In [31]:
# Spot check; in the recorded output both models report English.
display('Nature, in the broadest sense, is the natural.')

M_Naive_Bayes: 
 Detected Language: English
Random_Forest: 
 Detected Language: English


In [32]:
# Single-word spot check; in the recorded output both models report Tamil.
display('வணக்கம்')

M_Naive_Bayes: 
 Detected Language: Tamil
Random_Forest: 
 Detected Language: Tamil


In [33]:
# Spot check; in the recorded output both models report French.
display('Bonjour, comment ça va ?')

M_Naive_Bayes: 
 Detected Language: French
Random_Forest: 
 Detected Language: French


In [34]:
# Spot check; in the recorded output both models report Greek.
display('Γειά σας! Πώς είστε;')

M_Naive_Bayes: 
 Detected Language: Greek
Random_Forest: 
 Detected Language: Greek


In [35]:
# Spot check; in the recorded output both models report the dataset's
# 'Portugeese' label.
display('Olá, como você está?')

M_Naive_Bayes: 
 Detected Language: Portugeese
Random_Forest: 
 Detected Language: Portugeese


In [36]:
# Spot check; in the recorded output both models report Turkish.
display('Türkçe de bu cümleyi yazıyorum')

M_Naive_Bayes: 
 Detected Language: Turkish
Random_Forest: 
 Detected Language: Turkish


In [37]:
# Spot check where the models disagree: in the recorded output Naive Bayes
# reports Kannada while Random Forest reports Turkish.
display('ಕನ್ನಡದಲ್ಲಿ ಈ ಸಂದೇಶವನ್ನು ಬರೆಯುತ್ತಿದ್ದೇನೆ')

M_Naive_Bayes: 
 Detected Language: Kannada
Random_Forest: 
 Detected Language: Turkish


#### <b>Fine-Tuning Random Forest</b>

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# A larger forest (300 trees) with balanced class weights, seeded so the run
# is repeatable.
rf_classifier = RandomForestClassifier(
    n_estimators=300,
    class_weight='balanced',
    random_state=42,
)
rf_classifier.fit(x_train, y_train)

# Score the re-trained forest on the held-out split.
y_pred = rf_classifier.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
classification_report_output = classification_report(y_test, y_pred)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:", classification_report_output, sep="\n")


Model Accuracy: 93.25%

Classification Report:
              precision    recall  f1-score   support

      Arabic       1.00      0.91      0.95       160
      Danish       0.97      0.87      0.92       127
       Dutch       0.99      0.96      0.97       163
     English       0.99      0.97      0.98       415
      French       0.99      0.93      0.96       302
      German       0.99      0.94      0.96       140
       Greek       1.00      0.92      0.96       107
       Hindi       1.00      0.95      0.97        19
     Italian       0.96      0.88      0.92       208
     Kannada       1.00      0.92      0.96       110
   Malayalam       1.00      0.97      0.98       177
  Portugeese       0.95      0.94      0.95       221
     Russian       1.00      0.90      0.95       206
     Spanish       0.92      0.91      0.92       245
    Sweedish       0.92      0.92      0.92       202
       Tamil       1.00      0.96      0.98       139
     Turkish       0.50      0.99 

In [40]:
# Run the Turkish sentence from the earlier spot checks through the re-trained
# forest, vectorising it with the same fitted `tfidf`.
input_text = 'Türkçe de bu cümleyi yazıyorum'
X_input = tfidf.transform([input_text])
rf_classifier.predict(X_input)[0]

'Turkish'

<b>Overall, Multinomial Naive Bayes is the best-fitting model for this multiclass classification problem, reaching about 94.8% accuracy on the test set, compared with about 93.3% for Random Forest.</b>

### <b>Thank You!</b>
