Importing Python Libraries

In [99]:

# Loading Data
import pandas as pd
import numpy as np
import nltk
import string
import re
import time

# Data Preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.utils import resample
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer

# Model Building
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline

# Model Evaluation
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
#from scikitplot.metrics import plot_roc, plot_confusion_matrix

# Explore Data Analysis
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from wordcloud import WordCloud, STOPWORDS
from matplotlib.pyplot import rcParams

from sklearn.feature_extraction.text import CountVectorizer




Load the data as a Pandas DataFrame

In [2]:
train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv') 

In [3]:
train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [4]:
train.lang_id.value_counts()

tso    2675
xho    2672
afr    2663
zul    2657
nso    2651
tsn    2649
nbl    2636
eng    2635
sot    2635
ven    2608
ssw    2606
Name: lang_id, dtype: int64

Checking for null values in the data

In [5]:
train.isnull().sum()

lang_id    0
text       0
dtype: int64

In [75]:
test.isnull().sum()

index    0
text     0
dtype: int64

An overview of text statistics

In [77]:
#brief description of the train data
train.text.describe()

count                                                 29087
unique                                                26681
top       ngokwesekhtjheni yomthetho ophathelene nalokhu...
freq                                                     15
Name: text, dtype: object

In [79]:
#brief description of the test data
test.text.describe()

count                        5682
unique                       5459
top       peomolao ya bosetšhaba.
freq                            6
Name: text, dtype: object

In [80]:
#description of the data per sentiment class
train[['lang_id','text']].groupby('lang_id').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
lang_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
afr,2663,2374,al die vorms van promosie kos jou onderneming ...,5
eng,2635,2634,provincial revenue includes budgeted equitable...,2
nbl,2636,2132,ngokwesekhtjheni yomthetho ophathelene nalokhu...,15
nso,2651,2553,bolwetši bo phatlalala ka phetelo ya lerothiny...,4
sot,2635,2501,tip-offs anonymous ke tshebeletso ya mohala wa...,6
ssw,2606,2182,lesifo sisabalala ngematfonsi ematse k k uma n...,5
tsn,2649,2544,diforomo tsa kopo di ka tsewa gongwe go isiwa ...,9
tso,2675,2474,ku ya hi xiyenge xa xa nawu u ni mfanelo yo ko...,9
ven,2608,2299,vhabebi vhanu vha tea u ṋekana nga vhuṱanzi ha...,7
xho,2672,2396,ukuthintelwa kweempawu ezibonisa ukungalungi k...,6


In [6]:
X = train['text']
y = train['lang_id']

Splitting the training data into a training and validation set

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.3, random_state=42)

Pipeline

In [47]:
#Logistic Regression
lr = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', LogisticRegression())])

#Linear SVC
rf = Pipeline([('tfidf', TfidfVectorizer()),
                   ('clf', RandomForestClassifier())])


#Random Forest Classifier
dt = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', DecisionTreeClassifier())])

#Linear SVC
linsvc = Pipeline([('tfidf', TfidfVectorizer()),
                   ('clf', LinearSVC())])

#Multinomial Naive Bayes
param_grid = {'alpha':[0.01, 0.1, 1]}
MNB = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,3), min_df=2)),                         
                  ('clf', GridSearchCV(MultinomialNB(),
                                       param_grid=param_grid,
                                       cv=5,
                                       n_jobs=-1) )])   

Random Forest Classifier

In [48]:
# Random Forest Classifier
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

Logistic Regression Classifier

In [49]:
# Logistic regression
lr.fit(X_train, y_train) # fit the model
y_pred_lr = lr.predict(X_test) # form a prediction set

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Decision Tree Classifier

In [50]:
# Decision Tree
dt.fit(X_train, y_train) # fit the model
y_pred_dt = dt.predict(X_test) # form a prediction set

LinearSVC

In [51]:
# Linear SVC
linsvc.fit(X_train, y_train)
y_pred_linsvc = linsvc.predict(X_test)

MultinomialNB

In [52]:
# MultinomialNB
MNB.fit(X_train, y_train)
y_pred_MNB = MNB.predict(X_test)

Model Evaluation

The base model will be evaluated using the validation dataset that was kept aside from the training data. After a test dataset will be used to make predictions. That will help us to understand whether we are overfitting our model or not.

Model evaluation using validation data

The training data set was split into a training set and an evaluation set. The evaluation set will be used to evaluate the model before evaluated using the test dataset.

Model evaluation using test data

Data tranformation with Vectorizer

Making predictions on the test set

Model Analysis

The performance of a clssification model is based on the counts of test record corrently and incorrectly predicted by the model.

Classification Report

Random Forest Classifier

In [53]:
print("Classification Report for Validation Dataset")
print(classification_report(y_test, y_pred_rf))

Classification Report for Validation Dataset
              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       831
         eng       0.99      1.00      1.00       782
         nbl       0.97      0.96      0.97       751
         nso       1.00      0.99      1.00       792
         sot       1.00      1.00      1.00       782
         ssw       0.99      0.95      0.97       787
         tsn       0.99      1.00      1.00       804
         tso       1.00      1.00      1.00       789
         ven       1.00      1.00      1.00       809
         xho       0.98      0.97      0.98       823
         zul       0.92      0.97      0.95       777

    accuracy                           0.99      8727
   macro avg       0.99      0.99      0.99      8727
weighted avg       0.99      0.99      0.99      8727



Logistic Classifier

In [54]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

         afr       1.00      0.99      1.00       831
         eng       1.00      1.00      1.00       782
         nbl       0.99      0.98      0.99       751
         nso       1.00      0.99      1.00       792
         sot       1.00      1.00      1.00       782
         ssw       0.99      0.99      0.99       787
         tsn       1.00      1.00      1.00       804
         tso       1.00      1.00      1.00       789
         ven       1.00      1.00      1.00       809
         xho       0.99      1.00      0.99       823
         zul       0.98      0.98      0.98       777

    accuracy                           0.99      8727
   macro avg       0.99      0.99      0.99      8727
weighted avg       0.99      0.99      0.99      8727



Decision Tree Classifier

In [55]:
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

         afr       1.00      0.99      1.00       831
         eng       0.98      1.00      0.99       782
         nbl       0.88      0.90      0.89       751
         nso       0.98      0.96      0.97       792
         sot       0.98      0.98      0.98       782
         ssw       0.95      0.86      0.90       787
         tsn       0.96      0.98      0.97       804
         tso       0.98      0.99      0.98       789
         ven       0.98      0.99      0.98       809
         xho       0.93      0.89      0.91       823
         zul       0.80      0.87      0.83       777

    accuracy                           0.95      8727
   macro avg       0.95      0.95      0.95      8727
weighted avg       0.95      0.95      0.95      8727



LinearSVC

In [56]:
print("Classification Report for Validation Dataset")
print(classification_report(y_test, y_pred_linsvc))

Classification Report for Validation Dataset
              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       831
         eng       1.00      1.00      1.00       782
         nbl       0.99      0.99      0.99       751
         nso       1.00      1.00      1.00       792
         sot       1.00      1.00      1.00       782
         ssw       1.00      0.99      1.00       787
         tsn       1.00      1.00      1.00       804
         tso       1.00      1.00      1.00       789
         ven       1.00      1.00      1.00       809
         xho       1.00      1.00      1.00       823
         zul       0.99      0.99      0.99       777

    accuracy                           1.00      8727
   macro avg       1.00      1.00      1.00      8727
weighted avg       1.00      1.00      1.00      8727



In [57]:
print("Classification Report for Validation Dataset")
print(classification_report(y_test, y_pred_MNB))

Classification Report for Validation Dataset
              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       831
         eng       0.99      1.00      1.00       782
         nbl       1.00      0.99      1.00       751
         nso       1.00      1.00      1.00       792
         sot       1.00      1.00      1.00       782
         ssw       1.00      1.00      1.00       787
         tsn       1.00      1.00      1.00       804
         tso       1.00      1.00      1.00       789
         ven       1.00      1.00      1.00       809
         xho       1.00      1.00      1.00       823
         zul       0.99      0.99      0.99       777

    accuracy                           1.00      8727
   macro avg       1.00      1.00      1.00      8727
weighted avg       1.00      1.00      1.00      8727



Overall f1-score

In [58]:
# Random Forest
rfc_f1=f1_score(y_test, y_pred_rf, average="macro")
# Logistic Model
lmc_f1=f1_score(y_test, y_pred_lr, average="macro")
#Decision Tree
dtc_f1=f1_score(y_test, y_pred_dt, average="macro")
#LinearSVC
linSVC_f1=f1_score(y_test, y_pred_linsvc, average="macro")
#MultinomialNB
MNB_f1=f1_score(y_test, y_pred_MNB, average="macro")

 Submissions


In [59]:
df1 = test.copy()
df1

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.
...,...,...
5677,5678,You mark your ballot in private.
5678,5679,Ge o ka kgetha ka bowena go se šomiše Mofani k...
5679,5680,"E Ka kopo etsa kgetho ya hao ka hloko, hobane ..."
5680,5681,"TB ke bokudi ba PMB, mme Morero o tla lefella ..."


In [60]:
test_X = df1.text

In [104]:
y_pred =MNB.predict(test_X)

In [105]:
df1['lang_id'] = y_pred

In [106]:
df1.head()

Unnamed: 0,index,text,lang_id
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele...",tsn
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...,nbl
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.,ven
3,4,Kube inja nelikati betingevakala kutsi titsini...,ssw
4,5,Winste op buitelandse valuta.,afr


Creating an output csv for submission

In [64]:
df1[['index', 'lang_id']].to_csv('submission7.csv', index=False)

Hyperparameter Tuning

In [84]:
tfid = TfidfVectorizer()

text = tfid.fit_transform(train['text'])
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(text, y, test_size = 0.3, random_state=42)
params = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}

grid_MNB = GridSearchCV(MultinomialNB(), params)
grid_MNB.fit(X_train_h, y_train_h)
print(grid_MNB.best_params_)

{'alpha': 0.1}


In [110]:
#Multinomial Naive Bayes
MNB = Pipeline([('tfidf', TfidfVectorizer()),                         
                  ('clf', MultinomialNB(alpha = 0.1))])   

In [97]:
# MultinomialNB
MNB.fit(X_train, y_train)
y_pred_MNB = MNB.predict(X_test)

In [109]:
df1[['index', 'lang_id']].to_csv('submission9.csv', index=False)
