# Katleho_Mphuthi_Language_Identification

In [1]:
## import some important packages and libraries
import numpy as np
import pandas as pd
import nltk

#For preprocessing 
#from sklearn.model_selection import train_test_split
import re

#import for making graphs


#modelling
#from sklearn.linear_model import LogisticRegression

#suppress cell_warnings
import warnings
warnings.filterwarnings("ignore")
#metrics
#from sklearn.metrics import confusion_matrix

### Getting the data 

In [2]:
train = pd.read_csv('train_set.csv')

In [3]:
train.head(20)


Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
5,nso,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...
6,tsn,kgetse nngwe le nngwe e e sa faposiwang mo tsh...
7,ven,mbadelo dze dza laelwa dzi do kwama mahatulele...
8,nso,maloko a dikhuduthamaga a ikarabela mongwe le ...
9,tsn,fa le dirisiwa lebone le tshwanetse go bontsha...


In [4]:
train.shape

(33000, 2)

In [5]:
train.lang_id.value_counts()


nso    3000
ssw    3000
nbl    3000
afr    3000
xho    3000
sot    3000
eng    3000
ven    3000
tsn    3000
zul    3000
tso    3000
Name: lang_id, dtype: int64

In [6]:
train.lang_id.nunique()

11

Data looks balanced with equal observations in each class. There is about 11 classes.

Before we can train a model we must clean the data.

In [7]:
train.isnull().sum()

lang_id    0
text       0
dtype: int64

In [8]:
def preprocessing(string):
    #lowering each word in the sentence
    string = string.lower()
    
    #removal of punctuaction and numbers 
    string = re.sub(r'[^a-z0-9\s]','', string)
    message = re.sub(r'[0-9]+', '', string)
    return message 

In [9]:
df = train.copy()

In [10]:
X = df["text"].apply(preprocessing)
y = df["lang_id"]
print(X.shape)
print(y.shape)

(33000,)
(33000,)


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2, random_state=42)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()
X1 = count_vector.fit_transform(X_train)

In [13]:
X1.shape

(26400, 126260)

In [14]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
X_train_dtm = tfidf_transformer.fit_transform(X1)

In [15]:
X_train_dtm.shape

(26400, 126260)

## Modelling 
### Model 1 : Logistics Regression Model

In [16]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(multi_class='ovr',solver = 'liblinear')

In [20]:
logreg.fit(X_train_dtm, y_train)

LogisticRegression(multi_class='ovr', solver='liblinear')

In [21]:
#transforming the vector
print(X_test.shape)
X_test_c = count_vector.transform(X_test)
print(f'Test data after transforming data {X_test_c.shape}')

(6600,)
Test data after transforming data (6600, 126260)


## prediction

In [22]:
y_pred_logreg = logreg.predict(X_test_c)

In [23]:
#metrics
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_logreg))
print(metrics.confusion_matrix(y_test, y_pred_logreg))


0.9839393939393939
[[581   1   0   0   0   1   0   0   0   0   0]
 [  0 615   0   0   0   0   0   0   0   0   0]
 [  1   1 555   2   3   0   0   3   0   4  14]
 [  0   0   0 619   1   0   5   0   0   0   0]
 [  0   0   0   1 617   0   0   0   0   0   0]
 [  0   3   0   0   0 567   0   2   2   0  10]
 [  1   0   0   8   2   0 587   0   0   0   0]
 [  0   0   0   0   0   0   0 561   0   0   0]
 [  0   0   0   0   0   0   0   0 634   0   0]
 [  0   0   1   2   2   0   0   3   3 593   5]
 [  0   2   4   4   5   1   0   2   0   7 565]]


### Model 2: Multinomial

In [24]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [25]:
nb.fit(X_train_dtm, y_train)

MultinomialNB()

In [26]:
y_pred_nb = nb.predict(X_test_c)
print(metrics.accuracy_score(y_test, y_pred_nb))
print(metrics.confusion_matrix(y_test, y_pred_nb))

0.9972727272727273
[[583   0   0   0   0   0   0   0   0   0   0]
 [  0 615   0   0   0   0   0   0   0   0   0]
 [  0   2 580   0   0   0   0   0   0   0   1]
 [  0   0   0 623   1   0   1   0   0   0   0]
 [  0   0   0   0 618   0   0   0   0   0   0]
 [  0   1   0   0   0 581   0   1   0   0   1]
 [  1   0   0   0   0   0 597   0   0   0   0]
 [  0   0   0   0   0   0   0 561   0   0   0]
 [  0   0   0   0   0   0   0   0 634   0   0]
 [  0   0   1   0   0   0   0   1   0 606   1]
 [  0   1   3   1   0   0   0   0   0   1 584]]


### Model 3:

In [27]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=42)

In [29]:
tree.fit(X_train_dtm, y_train)

DecisionTreeClassifier(random_state=42)

In [31]:
y_pred_tree = tree.predict(X_test_c)
print(metrics.accuracy_score(y_test, y_pred_tree))
print(metrics.confusion_matrix(y_test, y_pred_tree))

0.9427272727272727
[[580   0   0   0   0   3   0   0   0   0   0]
 [  1 613   0   0   0   0   0   0   0   0   1]
 [  1   1 504   0  14  18   0   2   0  11  32]
 [  0   0   1 602   3   0  15   4   0   0   0]
 [  0   0   1   2 610   0   5   0   0   0   0]
 [  0   2   9   0   1 533   0   1   0   4  34]
 [  1   1   1   3   5   0 586   1   0   0   0]
 [  1   0   0   1   0   3   1 553   1   1   0]
 [  0   0   0   0   2   0   0   5 627   0   0]
 [  0   3  26   0  12  22   1   2   0 517  26]
 [  0   1  24   0   5  51   0   1   0  11 497]]


### Model 4

In [32]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100, random_state=42)

In [34]:
forest.fit(X_train_dtm, y_train)

RandomForestClassifier(random_state=42)

In [None]:
y_pred_forest = forest.predict(X_test_c2)
print(metrics.accuracy_score(y_test, y_pred_forest))
print(metrics.confusion_matrix(y_test, y_pred_forest))

### Tuning Models 

In [None]:
count_vector_2 = CountVectorizer(ngram_range =(1,2))
X_train_c2= count_vector_2.fit_transform(X_train)

In [None]:
tfidf2 =  TfidfTransformer()
X_train_dtm2 = tfidf2.fit_transform(X_train_c2)

In [None]:
nb_ngrams = MultinomialNB()

In [None]:
nb_ngrams.fit(X_train_dtm2, y_train)

In [None]:
X_test_c2=count_vector_2.transform(X_test)

In [None]:
y_pred_nb_grams = nb_ngrams.predict(X_test_c2)
print(metrics.accuracy_score(y_test,y_pred_nb_grams))
print(metrics.confusion_matrix(y_test, y_pred_nb_grams))

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
params = 

### Submission
Submitting models guide:
* Model 1 = LogisticsRegression
* Model 2 = MultinomialNB
* Model 3 = DecisionTree
* Model 4 = MultinomialNB(changed Count Vector)

In [None]:
test = pd.read_csv('test_set.csv')
test.head(20)

In [None]:
test.shape

In [None]:
testdf = test.copy()

In [None]:
testdf['text'] = test['text'].apply(preprocessing)
testdf.head()

In [None]:
x_test = count_vector.transform(testdf['text'].values.astype(str))

In [None]:
x_test.shape

In [None]:
x_test2 = count_vector_2.transform(testdf['text'].values.astype(str))

### Submission 1

In [None]:
y_pred_test1 = logreg.predict(x_test)

In [None]:
textid = testdf['index']

In [None]:
submission_logreg = pd.DataFrame(
    {'index': textid,
     'lang_id': y_pred_test1
    })

In [None]:
submission_logreg.head(20)

In [None]:
submission_logreg.to_csv("logreg_predictions.csv",index = False)

### Submission 2


In [None]:
x_test_dtm = tfidf_transformer.transform(x_test) 

In [None]:
y_pred_test2 = nb.predict(x_test_dtm)

In [None]:
submission_nb = pd.DataFrame(
    {'index': textid,
     'lang_id': y_pred_test2
    })
submission_nb.head(20)

In [None]:
submission_nb.to_csv("multinomialNB.csv",index = False)

### submission 3

In [None]:
y_pred_test3 = nb_ngrams.predict(x_test2)
submission_ngrams = pd.DataFrame(
    {'index': textid,
     'lang_id': y_pred_test3
    })
submission_nb.head()

In [None]:
submission_ngrams.to_csv("multinomialnb2.csv",index = False)