# Katleho_Mphuthi_Language_Identification

In [1]:
## import some important packages and libraries
import numpy as np
import pandas as pd
import nltk

#For preprocessing 
#from sklearn.model_selection import train_test_split
import re

#import for making graphs
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#modelling
#from sklearn.linear_model import LogisticRegression

#suppress cell_warnings
import warnings
warnings.filterwarnings("ignore")
#metrics
#from sklearn.metrics import confusion_matrix

### Getting the data 

In [2]:
train = pd.read_csv('train_set.csv')

In [4]:
train.head(20)


Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
5,nso,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...
6,tsn,kgetse nngwe le nngwe e e sa faposiwang mo tsh...
7,ven,mbadelo dze dza laelwa dzi do kwama mahatulele...
8,nso,maloko a dikhuduthamaga a ikarabela mongwe le ...
9,tsn,fa le dirisiwa lebone le tshwanetse go bontsha...


In [8]:
train.shape

(33000, 2)

In [7]:
train.lang_id.value_counts()


nso    3000
ssw    3000
tsn    3000
xho    3000
eng    3000
zul    3000
sot    3000
afr    3000
tso    3000
ven    3000
nbl    3000
Name: lang_id, dtype: int64

In [9]:
train.lang_id.nunique()

11

Data looks balanced with equal observations in each class. There is about 11 classes.

Before we can train a model we must clean the data.

In [11]:
def preprocessing(string):
    #lowering each word in the sentence
    string = string.lower()
    
    #removal of punctuaction and numbers 
    string = re.sub(r'[^a-z0-9\s]','', string)
    message = re.sub(r'[0-9]+', '', string)
    return message 

In [17]:
df = train.copy()

In [18]:
X = df["text"].apply(preprocessing)
y = df["lang_id"]
print(X.shape)
print(y.shape)

(33000,)
(33000,)


In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2, random_state=42)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()
X1 = count_vector.fit_transform(X_train)

In [23]:
X1.shape

(26400, 126260)

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
X_train_dtm = tfidf_transformer.fit_transform(X1)

In [24]:
X_train_dtm.shape

(26400, 126260)

## Modelling 

In [25]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(multi_class='ovr')

In [26]:
logreg.fit(X_train_dtm, y_train)

LogisticRegression(multi_class='ovr', solver='liblinear')

In [28]:
#transforming the vector
print(X_test.shape)
X_test_c = count_vector.transform(X_test)
print(f'Test data after transforming data {X_test_c.shape}')

(6600,)
Test data after transforming data(6600, 126260)


## prediction

In [29]:
y_pred_logreg = logreg.predict(X_test_c)

In [30]:
#metrics
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_logreg))
print(metrics.confusion_matrix(y_test, y_pred_logreg))


0.9839393939393939
[[581   1   0   0   0   1   0   0   0   0   0]
 [  0 615   0   0   0   0   0   0   0   0   0]
 [  1   1 555   2   3   0   0   3   0   4  14]
 [  0   0   0 619   1   0   5   0   0   0   0]
 [  0   0   0   1 617   0   0   0   0   0   0]
 [  0   3   0   0   0 567   0   2   2   0  10]
 [  1   0   0   8   2   0 587   0   0   0   0]
 [  0   0   0   0   0   0   0 561   0   0   0]
 [  0   0   0   0   0   0   0   0 634   0   0]
 [  0   0   1   2   2   0   0   3   3 593   5]
 [  0   2   4   4   5   1   0   2   0   7 565]]


### Submission
Submitting models guide:
* Model 1 = LogisticsRegression


In [31]:
test = pd.read_csv('test_set.csv')
test.head(20)

In [32]:
test.shape

(5682, 2)

In [42]:
testdf['text'] = test['text'].apply(preprocessing)
testdf.head()

0    mmasepala fa maemo a a kgethegileng a letlelel...
1    uzakwaziswa ngokufaneleko nakungafuneka eminye...
2            tshivhumbeo tshi fana na ngano dza vhathu
3    kube inja nelikati betingevakala kutsi titsini...
4                         winste op buitelandse valuta
Name: text, dtype: object

In [43]:
x_test = count_vector.transform(testdf['text'].values.astype(str))

In [44]:
x_test.shape

(5682, 126260)

In [46]:
y_pred_test1 = logreg.predict(x_test)

In [47]:
textid = testdf['text']

In [49]:
submission_logreg = pd.DataFrame(
    {'text': textid,
     'lang_id': y_pred_test1
    })

In [50]:
submission_logreg.head(20)

Unnamed: 0,text,lang_id
0,mmasepala fa maemo a a kgethegileng a letlelel...,tsn
1,uzakwaziswa ngokufaneleko nakungafuneka eminye...,nbl
2,tshivhumbeo tshi fana na ngano dza vhathu,ven
3,kube inja nelikati betingevakala kutsi titsini...,ssw
4,winste op buitelandse valuta,afr
5,ke feela dilense te hlakilego ta pono e tee go...,nso
6,fn am final gems birthing optionszulutxtfn,xho
7,ntjhafatso ya konteraka ya mosebetsi etsa bonn...,sot
8,ugems uhlinzeka ngezinzuzo zemithi yezifo ezin...,zul
9,so on occasion are statistics misused,eng


In [52]:
submission_logreg.to_csv("logreg_predictions.csv")