In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import pickle

### Load the dataset

In [2]:
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


### Check for null values

In [3]:
df.isnull().sum()

Text        0
language    0
dtype: int64

#### There are no null values in this dataset

In [4]:
df['language'].value_counts()

language
Estonian      1000
Swedish       1000
English       1000
Russian       1000
Romanian      1000
Persian       1000
Pushto        1000
Spanish       1000
Hindi         1000
Korean        1000
Chinese       1000
French        1000
Portugese     1000
Indonesian    1000
Urdu          1000
Latin         1000
Turkish       1000
Japanese      1000
Dutch         1000
Tamil         1000
Thai          1000
Arabic        1000
Name: count, dtype: int64

In [5]:
df['language'].nunique()

22

#### This dataset contains 22 languages each with 1000 rows

### Prepare the data

In [6]:
df.columns

Index(['Text', 'language'], dtype='object')

In [7]:
X = df['Text']
y = df['language']

### Train test split

In [8]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

### Tokenize the text data(Bag of words)

In [9]:
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [10]:
X_train[0]

'klement gottwaldi surnukeha palsameeriti ning paigutati mausoleumi surnukeha oli aga liiga hilja ja oskamatult palsameeritud ning hakkas ilmutama lagunemise tundemärke  aastal viidi ta surnukeha mausoleumist ära ja kremeeriti zlíni linn kandis aastatel – nime gottwaldov ukrainas harkivi oblastis kandis zmiivi linn aastatel – nime gotvald'

In [11]:
X_train_cv[0].shape

(1, 238293)

### Model Training

In [12]:
model = MultinomialNB()
model.fit(X_train_cv,y_train)

### Model Evaluation

In [13]:
model.score(X_test_cv,y_test)

0.9422727272727273

### Saving the model and tokenizer

In [18]:
with open('model.pkl','wb') as f:
    pickle.dump(model,f)

with open('vectorizer.pkl','wb') as f:
    pickle.dump(cv,f)

### Testing the model

In [15]:
input = 'how are you'
token = cv.transform([input])
output = model.predict(token)

print("Language is",output[0])

Language is English
