In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter("ignore")

from sklearn import feature_extraction
from collections import Counter
from sklearn import linear_model
from string import punctuation
from sklearn import pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [2]:
#Training set
train_df = pd.read_csv('train_set.csv')
#Test set
test_df = pd.read_csv('test_set.csv')

In [3]:
train_df.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [4]:
np.shape(train_df)

(33000, 2)

In [5]:
np.shape(test_df)

(5682, 2)

In [6]:
def text_clean(text):
    #let's removing punctuation
    res = text.apply(lambda x: ''.join(i for i in x if i not in punctuation))
    res = res.str.lower()
    return text

In [7]:
train_df['clean_text'] = text_clean(train_df['text'])
test_df['clean_text'] = text_clean(test_df['text'])
train_df.head()

Unnamed: 0,lang_id,text,clean_text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [8]:
test_df.head()

Unnamed: 0,index,text,clean_text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele...","Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.,Winste op buitelandse valuta.


In [9]:
X = train_df['clean_text']
y = train_df['lang_id']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.2,random_state =42)

In [16]:
vectorizer = TfidfVectorizer(ngram_range=(1,5))

lr1 = Pipeline([
    ('vectorizer', vectorizer),
    ('clf', LogisticRegression(C=1000))
])

In [None]:
lr1.fit(X_train,y_train)

In [12]:
y_predicted = lr1.predict(X_test)

In [13]:
acc = (metrics.accuracy_score(y_test, y_predicted))*100
print(acc, '%')

99.46969696969697 %


In [14]:
matrix = metrics.confusion_matrix(y_test, y_predicted)
print('Confusion matrix: \n', matrix)

Confusion matrix: 
 [[581   1   0   0   0   0   0   0   0   1   0]
 [  0 615   0   0   0   0   0   0   0   0   0]
 [  0   1 577   0   0   0   0   0   0   1   4]
 [  0   0   0 621   1   0   3   0   0   0   0]
 [  0   0   0   1 617   0   0   0   0   0   0]
 [  0   0   1   0   0 579   0   0   0   0   4]
 [  1   0   0   1   1   0 595   0   0   0   0]
 [  0   0   0   0   0   0   0 561   0   0   0]
 [  0   0   0   0   0   0   0   0 634   0   0]
 [  0   0   1   0   0   0   0   0   1 604   3]
 [  0   1   4   0   0   1   0   0   0   3 581]]


In [15]:
test_pred = lr1.predict(test_df['clean_text'])
my_submission = pd.DataFrame({'index': test_df['index'], 'lang_id': test_pred})
# you could use any filename. We choose submission here
my_submission.to_csv('submission1.csv', index=False)