In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import seaborn as sns
from string import punctuation
from tensorflow.keras import layers, models

In [3]:
# Load the language-detection corpus from its CSV file and preview the first rows.
# NOTE(review): the location below is an absolute, machine-specific path.
DATASET_CSV = "C:\\Users\\rogathemollel\\Desktop\\archive\\Language Detection.csv"
dataset = pd.read_csv(DATASET_CSV)
dataset.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [4]:
# Count the missing entries in every column (isna is the alias of isnull).
missing_values = dataset.isna().sum()
missing_values

Text        0
Language    0
dtype: int64

In [5]:
# Boolean mask flagging rows that repeat an earlier row; it is only displayed here,
# no rows are dropped.
dataset.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
10332    False
10333    False
10334    False
10335    False
10336    False
Length: 10337, dtype: bool

In [6]:
# Show the column labels of the loaded frame.
dataset.columns

Index(['Text', 'Language'], dtype='object')

In [7]:
#Removing punctuation marks
def removePunctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Only the characters listed in ``string.punctuation`` are stripped; letters,
    digits, whitespace and non-ASCII symbols are left untouched.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        A copy of ``text`` without punctuation characters.
    """
    # Build the result in one join instead of growing a string character by
    # character with ``+=``.
    return "".join(char for char in text if char not in punctuation)
def _lower_and_strip(sentence):
    """Lower-case ``sentence`` and remove its punctuation."""
    return removePunctuation(sentence.lower())

# Store the normalised copy of every sentence in a new column and display the frame.
dataset['clean_text'] = dataset['Text'].map(_lower_and_strip)
dataset

Unnamed: 0,Text,Language,clean_text
0,"Nature, in the broadest sense, is the natural...",English,nature in the broadest sense is the natural p...
1,"""Nature"" can refer to the phenomena of the phy...",English,nature can refer to the phenomena of the physi...
2,"The study of nature is a large, if not the onl...",English,the study of nature is a large if not the only...
3,"Although humans are part of nature, human acti...",English,although humans are part of nature human activ...
4,[1] The word nature is borrowed from the Old F...,English,1 the word nature is borrowed from the old fre...
...,...,...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada,ಹೇಗೆ ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎಲ...
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...


In [8]:
# Model input is the cleaned text; the target is the language name.
text, language = dataset['clean_text'], dataset['Language']

In [9]:
# Data preprocessing: turn the sentences into token-count vectors and the
# language names into integer class labels.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# Bag-of-words counts for every cleaned sentence.
vectorizer = CountVectorizer()
text_v = vectorizer.fit_transform(text)

# One integer label per language name.
label_encoder = LabelEncoder()
language_v = label_encoder.fit_transform(language)




In [10]:
#Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB 

# Fit a Multinomial Naive Bayes classifier on the full vectorized corpus.
# NOTE(review): `model` is re-created and re-fitted on the training split in the
# following cell, so this full-data fit looks redundant; confirm before removing.
model = MultinomialNB()
model.fit(text_v,language_v)

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the vectorizer vocabulary was fitted on the whole corpus before this split.
X_train, X_test, y_train, y_test = train_test_split(
    text_v,
    language_v,
    test_size=0.2,
    random_state=42,
)

# Fit a fresh classifier using the training portion only.
model = MultinomialNB().fit(X_train, y_train)

# Score the held-out portion and report the accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy of the model: {accuracy:.2f}")


Accuracy of the model: 0.98


In [16]:
#Creating the prediction Model
def language_detector(input_xlanguage):
    """Predict the encoded language label for a single piece of text.

    The text is lower-cased and stripped of punctuation with
    ``removePunctuation`` before it is vectorized, mirroring how the
    ``clean_text`` column used to fit ``vectorizer`` was prepared.

    Parameters
    ----------
    input_xlanguage : str
        Raw text whose language should be detected.

    Returns
    -------
    The first element of ``model.predict`` for the vectorized text, i.e. the
    encoded class label; decode it with ``label_encoder.inverse_transform``.
    """
    # Apply the same cleaning as at training time; otherwise words containing
    # punctuation (e.g. "don't") are tokenized differently than the "dont"
    # the vectorizer saw while fitting.
    cleaned_text = removePunctuation(input_xlanguage.lower())

    input_xlanguage_v = vectorizer.transform([cleaned_text])

    language_detected = model.predict(input_xlanguage_v)[0]

    return language_detected

In [13]:
#Adding the googletrans library to find out what the text says
from googletrans import Translator
# Shared translator client; predict() uses it to translate the input text.
translater = Translator()

In [23]:
#Prediction
def predict(input_text):
    """Print the detected language of ``input_text`` and its Swahili translation.

    Parameters
    ----------
    input_text : str
        Text to classify and translate.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # inverse_transform works on a sequence of encoded labels, so wrap the single
    # prediction in a list and unwrap the single decoded name again; this keeps
    # the message free of array brackets such as ['Spanish'].
    encoded_label = language_detector(input_text)
    predicted_language = label_encoder.inverse_transform([encoded_label])[0]
    translation = translater.translate(input_text, dest= "Swahili")
    print(f"The predicted language is: {predicted_language} and it says ({translation.text})")
    
# Try the detector on one short sentence per language.
for sample_sentence in (
    "Hola esto es una prueba",
    "Hallo, das ist ein Test",
    "Привет, это тест",
    "Γεια σας αυτό είναι ένα τεστ",
):
    predict(sample_sentence)

The predicted language is: ['Spanish'] and it says (Hello huu ni mtihani)
The predicted language is: ['German'] and it says (Hello, huu ni mtihani)
The predicted language is: ['Russian'] and it says (Hello, huu ni mtihani)
The predicted language is: ['Greek'] and it says (Hello huu ni mtihani)


In [18]:
# Try the detector on a short English sentence.
predict("I am going for shopping")

The predicted language is: ['English'] and it says (Naenda kufanya manunuzi)


In [22]:
# Try the detector on another short English sentence.
predict("I miss you")

The predicted language is: ['English'] and it says (Nimekukumbuka)


In [21]:
import pickle

# Persist the classifier together with the fitted vectorizer and label encoder,
# one pickle file per object.
artifacts_to_save = {
    'language_detection_model.pkl': model,
    'vectorizer.pkl': vectorizer,
    'label_encoder.pkl': label_encoder,
}
for artifact_path, fitted_object in artifacts_to_save.items():
    with open(artifact_path, 'wb') as handle:
        pickle.dump(fitted_object, handle)



In [19]:
import pickle


def _load_pickle(path):
    """Read and return the object stored in the pickle file at ``path``."""
    # Unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Restore the classifier, then the vectorizer and label encoder.
model = _load_pickle('language_detection_model.pkl')
vectorizer = _load_pickle('vectorizer.pkl')
label_encoder = _load_pickle('label_encoder.pkl')

