# South Africa Language Identification Hackathon

### Importing Packages

In [87]:
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from matplotlib.colors import ListedColormap
%matplotlib inline

# imports for Natural Language  Processing
import re
import os
import nltk
import string
import time
import spacy.cli
from langdetect import detect
import unicodedata
import numpy as np
import pandas as pd
from sklearn import metrics
from nltk.corpus import stopwords
from html.parser import HTMLParser
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

# Classification Models

from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier

# Performance Evaluation
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from scikitplot.metrics import plot_roc, plot_confusion_matrix
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report, confusion_matrix,  make_scorer, fbeta_score

# Import library for train test split
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV


# Ignore warnings
import warnings
warnings.simplefilter(action='ignore')

#spacy
#spacy.cli.download('en_core_web_sm')

### Loading the Data

In [88]:
#Loading the test and train data
df_train = pd.read_csv(r'C:\Users\Dorcas Oduor\Documents\EDSA\Advanced Classification\south-african-language-identification\train_set.csv')

df_test = pd.read_csv(r'C:\Users\Dorcas Oduor\Documents\EDSA\Advanced Classification\south-african-language-identification\test_set.csv')

df_train.head()


Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [89]:
df_test.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


What does the data look like?

In [90]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   lang_id  33000 non-null  object
 1   text     33000 non-null  object
dtypes: object(2)
memory usage: 515.8+ KB


In [91]:
df_train.isnull().sum()

lang_id    0
text       0
dtype: int64

#### Checking for Class Balance

In [92]:
df_train['lang_id'].value_counts()

nso    3000
eng    3000
ven    3000
ssw    3000
zul    3000
nbl    3000
tso    3000
sot    3000
xho    3000
afr    3000
tsn    3000
Name: lang_id, dtype: int64

Excellent! Our data is balanced.
This means I do not need to standardize the data before fitting it into a model.

### Validation Split

In [93]:
X = df_train.text
y = df_train.lang_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

## Multinomial Naive Bayes

In [94]:
nb = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
report = classification_report(y_test, y_pred,output_dict=True)
results = pd.DataFrame(report).transpose()
results

accuracy 0.9985858585858586


Unnamed: 0,precision,recall,f1-score,support
afr,0.99777,1.0,0.998884,895.0
eng,0.996711,1.0,0.998353,909.0
nbl,0.993213,0.998862,0.996029,879.0
nso,1.0,0.998937,0.999468,941.0
sot,0.998912,1.0,0.999456,918.0
ssw,1.0,1.0,1.0,908.0
tsn,1.0,0.998866,0.999433,882.0
tso,1.0,1.0,1.0,857.0
ven,1.0,1.0,1.0,936.0
xho,0.998914,0.997831,0.998372,922.0


## Linear SVC

In [95]:
linsvc = Pipeline([('tfidf', TfidfVectorizer()),
                   ('clf', LinearSVC())])

linsvc.fit(X_train, y_train)

y_pred = linsvc.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
report = classification_report(y_test, y_pred,output_dict=True)
results = pd.DataFrame(report).transpose()
results

accuracy 0.9977777777777778


Unnamed: 0,precision,recall,f1-score,support
afr,0.998883,0.998883,0.998883,895.0
eng,0.998901,1.0,0.99945,909.0
nbl,0.994318,0.995449,0.994883,879.0
nso,1.0,0.997875,0.998936,941.0
sot,0.997826,1.0,0.998912,918.0
ssw,0.998897,0.997797,0.998347,908.0
tsn,0.998865,0.997732,0.998298,882.0
tso,1.0,1.0,1.0,857.0
ven,1.0,1.0,1.0,936.0
xho,0.993521,0.997831,0.995671,922.0


Pretty good accuracy for both models but we have to see how it perfoms on unseen data.
I will use the Naive Bayes as it does better than the Linear SVC model.

### Fitting the models to unseen data

In [96]:
X_test_2 = df_test.text

y_pred_2 = linsvc.predict(X_test_2)


In [97]:
df_test['lang_id'] = y_pred_2

In [98]:
df_test.head()

Unnamed: 0,index,text,lang_id
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele...",tsn
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...,nbl
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.,ven
3,4,Kube inja nelikati betingevakala kutsi titsini...,ssw
4,5,Winste op buitelandse valuta.,afr


In [99]:
df_final = df_test.drop(['text'], axis = 1)

In [100]:
df_final.head()

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr


In [101]:
#downloading the final csv for upload
df_final.to_csv(r'C:\Users\Dorcas Oduor\Documents\EDSA\Advanced Classification\south-african-language-identification\Final_nb.csv' , index = False)

This model get's an f1-score of 0.93 on Kaggle. Can this perfomance get better? 

Let's see what a little pre-processing will do.

### Pre-Processing the Data

In [102]:
df_train = pd.read_csv(r'C:\Users\Dorcas Oduor\Documents\EDSA\Advanced Classification\south-african-language-identification\train_set.csv')

df_test = pd.read_csv(r'C:\Users\Dorcas Oduor\Documents\EDSA\Advanced Classification\south-african-language-identification\test_set.csv')

df_train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [103]:
working_df = df_train.copy()

In [104]:
working_df.shape

(33000, 2)

Let's try cleaning up the data.

In [105]:
def clean_data (text):
    
    #convert to lowercase
    text = text.lower()
    
    #remove punctuation
    text = re.sub(r"[,.;':@#?!\&/$]+\ *", ' ', text)   
    
    #remove numbers 
    text = re.sub(r'\d+','', text)
    
    #remove extra whitespaces 
    text = re.sub(r'\s\s+', ' ', text)
    
    #remove space in front of tweet
    text = text.lstrip(' ')
    
    return text

In [106]:
working_df['text'] = working_df['text'].apply(clean_data) #train data
df_test['text'] = df_test['text'].apply(clean_data)

In [107]:
working_df.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [108]:
duplicate = working_df[working_df.duplicated('text')]
duplicate

Unnamed: 0,lang_id,text
940,xho,xa umntu lowo ephuma kule ndawo yihlola-hlolen...
1208,nbl,ngokwesekhtjheni yomthetho ophathelene nalokhu...
1252,ssw,kwabelana ngemininingwane enhlanganweni kutakw...
1304,xho,umntu ocela ukukhuselwa ngumbuso ngumntu obale...
1460,ven,kha vha ḓivhe hezwi a vha nga ḓo ita ndingo ya...
...,...,...
32980,ssw,inhloso ye-wua kutsi yente bantfu bendzawo let...
32983,ssw,timiso tesigatjana titawusebenta ngetingucuko ...
32985,nso,ge o nyaka go kgopela phihlelelo ya direkoto t...
32989,ssw,imenenja yesigodzi utakwatisa ngembhalo uma le...


In [109]:
working_df = working_df.drop_duplicates(keep='first', inplace=False)

In [110]:
working_df.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [111]:
working_df.shape

(29948, 2)

In [112]:
df_test.shape

(5682, 2)

Let's replicate that on the test data.

In [113]:
duplicate = df_test[df_test.duplicated('text')]
duplicate

Unnamed: 0,index,text
51,52,icandelwana () lithathelwe indawo licandelo lo...
327,328,kl () e emetswe ke k ya molaotheo tlhabololo w...
362,363,tumelelo pele e a hlokega
566,567,icandelwana () lithathelwe indawo licandelo lo...
569,570,werk aan die gang
...,...,...
5600,5601,<fn>gov-za en -- en txt< fn>
5609,5610,tirelo yeo e kgahlišago le go kgotsofala ga ma...
5619,5620,klaargemaakte produkte wat nie deur hierdie bo...
5652,5653,ngitjheja kobana isifungo sisibopho kusazelo s...


In [114]:
df_test = df_test.drop_duplicates(keep='first', inplace=False)

In [115]:
df_test.shape

(5682, 2)

## Validation Split on Cleaned Data

In [116]:
X_clean = working_df['text']

y_clean = working_df['lang_id']

X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(X_clean, y_clean, test_size=0.3, random_state = 42)

In [117]:
nb = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train_clean, y_train_clean)

y_pred_clean = nb.predict(X_test_clean)

print('accuracy %s' % accuracy_score(y_pred_clean, y_test_clean))
report = classification_report(y_test_clean, y_pred_clean,output_dict=True)
results = pd.DataFrame(report).transpose()
results

accuracy 0.9969949916527546


Unnamed: 0,precision,recall,f1-score,support
afr,1.0,1.0,1.0,806.0
eng,0.989201,1.0,0.994571,916.0
nbl,0.998536,0.982709,0.990559,694.0
nso,0.998818,0.998818,0.998818,846.0
sot,0.998843,1.0,0.999421,863.0
ssw,0.998641,0.997286,0.997963,737.0
tsn,1.0,1.0,1.0,844.0
tso,0.997528,1.0,0.998762,807.0
ven,1.0,1.0,1.0,765.0
xho,0.998739,0.994975,0.996853,796.0


In [128]:
linsvc = Pipeline([('tfidf', TfidfVectorizer()),
                   ('clf', LinearSVC())])

linsvc.fit(X_train_clean, y_train_clean)

y_pred_clean = linsvc.predict(X_test_clean)

print('accuracy %s' % accuracy_score(y_pred_clean, y_test_clean))
report = classification_report(y_test_clean, y_pred_clean,output_dict=True)
results = pd.DataFrame(report).transpose()
results

accuracy 0.9958820255982193


Unnamed: 0,precision,recall,f1-score,support
afr,1.0,0.998759,0.999379,806.0
eng,0.996736,1.0,0.998365,916.0
nbl,0.992722,0.982709,0.98769,694.0
nso,1.0,0.997636,0.998817,846.0
sot,0.998843,1.0,0.999421,863.0
ssw,0.993225,0.994573,0.993898,737.0
tsn,0.998817,1.0,0.999408,844.0
tso,1.0,1.0,1.0,807.0
ven,1.0,1.0,1.0,765.0
xho,0.991228,0.993719,0.992472,796.0


The F1 score drops slighly after cleaning up the data. This will most like drop when we upload to kaggle. Let's try hyper-parameter tuning on the Naive Bayes model and train the model on uncleaned data.

In [119]:
MultinomialNB().get_params().keys()

dict_keys(['alpha', 'class_prior', 'fit_prior'])

In [126]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}

In [127]:
from sklearn.metrics import classification_report
ftwo_scorer = make_scorer(fbeta_score, beta=2, average='micro')
clf = GridSearchCV(text_clf, tuned_parameters, cv=10, scoring=ftwo_scorer)

clf.fit(X_train, y_train)

print(classification_report(y_test, clf.predict(X_test), digits=4))

              precision    recall  f1-score   support

         afr     0.9989    1.0000    0.9994       895
         eng     0.9989    1.0000    0.9995       909
         nbl     0.9977    1.0000    0.9989       879
         nso     1.0000    0.9989    0.9995       941
         sot     0.9989    1.0000    0.9995       918
         ssw     1.0000    1.0000    1.0000       908
         tsn     1.0000    0.9989    0.9994       882
         tso     1.0000    1.0000    1.0000       857
         ven     1.0000    1.0000    1.0000       936
         xho     1.0000    0.9978    0.9989       922
         zul     0.9988    0.9977    0.9982       853

    accuracy                         0.9994      9900
   macro avg     0.9994    0.9994    0.9994      9900
weighted avg     0.9994    0.9994    0.9994      9900



It get's better! Now let us see how this will do on the unseen data. 

In [129]:
X_test_final = df_test['text']
y_pred_3 = clf.predict(X_test_final)

In [130]:
df_test['lang_id'] = y_pred_3

In [131]:
df_test.head()

Unnamed: 0,index,text,lang_id
0,1,mmasepala fa maemo a a kgethegileng a letlelel...,tsn
1,2,uzakwaziswa ngokufaneleko nakungafuneka eminye...,nbl
2,3,tshivhumbeo tshi fana na ngano dza vhathu,ven
3,4,kube inja nelikati betingevakala kutsi titsini...,ssw
4,5,winste op buitelandse valuta,afr


In [132]:
df_final2 = df_test.drop(['text'], axis = 1)

In [133]:
df_final2.head()

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr


In [134]:
df_final2.to_csv(r'C:\Users\Dorcas Oduor\Documents\EDSA\Advanced Classification\south-african-language-identification\Final tuned.csv' , index = False)

The tuned hyperparameters give an f1_score of 0.96 on Kaggle. That's pretty good. 

There's still more room for improvement like maybe trying tuning the hyperparameters on a different model. The Gris Search CV was however resource and time consuming and I chose to conclude it there. 