# Importing necessary libraries

In [2]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Data Description

The dataset used for this challenge is the NCHLT Text Corpora collected by the South African Department of Arts and Culture & Centre for Text Technology (CTexT, North-West University, South Africa). The training set was improved through additional cleaning done by Praekelt.

The data is in the form Language ID, Text. The text is in various states of cleanliness. Some NLP techniques will be necessary to clean up the data.


**File descriptions**

train_set.csv - the training set

test_set.csv - the test set

sample_submission.csv - a sample submission file in the correct format


**Language IDs**

afr - Afrikaans

eng - English

nbl - isiNdebele

nso - Sepedi

sot - Sesotho

ssw - siSwati

tsn - Setswana

tso - Xitsonga

ven - Tshivenda

xho - isiXhosa

zul - isiZulu

# Loading dataset

In [3]:
# Read the labelled training file and the unlabelled test file from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train_set.csv', 'test_set.csv'))

In [4]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


# Data preprocessing

In [1]:
# Disabled experiment: everything between the triple quotes is a plain string
# literal, so none of it runs; as the only expression in the cell the string is
# simply echoed back as the cell output.
# The quoted sketch lemmatises text with spaCy's English `en_core_web_sm`
# pipeline, keeps alphabetic lemmas that are not in spaCy's English stop-word
# set, and would store the result in train['text_new'].
# NOTE(review): the sketch rebinds the name `stopwords`, which would shadow the
# `nltk.corpus.stopwords` import if it were ever re-enabled — confirm before reviving.
'''
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

#loading the en_core_web_sm_model
stopwords = STOP_WORDS
nlp = spacy.load('en_core_web_sm')


def preprocess(train):
    #creating a Doc object
    doc = nlp(train, disable = ['ner', 'parser'])
    #Generating lemmas
    lemmas = [token.lemma_ for token in doc]
    #remove stopwords and non-alphabetic characters
    a_lemma = [lemma for lemma in lemmas
              if lemma.isalpha() and lemma not in stopwords ]
    return ' ' .join(a_lemma)

#apply preprocessing to posts
train['text_new']= train['text'].apply(preprocess)
'''

"\nimport spacy\nfrom spacy.lang.en.stop_words import STOP_WORDS\n\n#loading the en_core_web_sm_model\nstopwords = STOP_WORDS\nnlp = spacy.load('en_core_web_sm')\n\n\ndef preprocess(train):\n    #creating a Doc object\n    doc = nlp(train, disable = ['ner', 'parser'])\n    #Generating lemmas\n    lemmas = [token.lemma_ for token in doc]\n    #remove stopwords and non-alphabetic characters\n    a_lemma = [lemma for lemma in lemmas\n              if lemma.isalpha() and lemma not in stopwords ]\n    return ' ' .join(a_lemma)\n\n#apply preprocessing to posts\ntrain['text_new']= train['text'].apply(preprocess)\n"

In [5]:
# Count the training rows per language label to check class balance.
train['lang_id'].value_counts()

tso    3000
tsn    3000
nbl    3000
eng    3000
xho    3000
afr    3000
ssw    3000
nso    3000
zul    3000
sot    3000
ven    3000
Name: lang_id, dtype: int64

# Random Forest Classifier

In [7]:
# Separate the feature (raw text) and target (language label) columns.
X, y = train['text'], train['lang_id']

In [9]:

# Build TF-IDF features from unigrams and bigrams, dropping terms that appear
# in fewer than two documents and the built-in English stop-word list.
# NOTE(review): the corpus covers eleven languages, so an English-only
# stop-word list may be unintended — confirm.
# NOTE(review): the vectorizer is fitted on the full training text before the
# train/validation split, so validation rows contribute to the vocabulary and
# IDF weights — confirm this is acceptable for the reported validation scores.
vectorizer = TfidfVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)
X_vectorized = vectorizer.fit_transform(X)

In [10]:
# Hold out 30% of the vectorised rows for validation; the split is shuffled,
# stratified on the language label and seeded so it can be repeated.
X_train, X_val, y_train, y_val = train_test_split(
    X_vectorized,
    y,
    test_size=0.3,
    random_state=11,
    shuffle=True,
    stratify=y,
)

In [35]:

# Train a seeded random forest that considers four features per split,
# then predict the validation rows.
rfc = RandomForestClassifier(random_state=42, max_features=4).fit(X_train, y_train)
rfc_pred = rfc.predict(X=X_val)

In [36]:
# Macro-averaged F1 of the random forest on the validation split.
f1_score(y_true=y_val, y_pred=rfc_pred, average="macro")

0.9964655269584973

In [37]:
# Vectorise the test text with the already-fitted TF-IDF vocabulary.
testx = test.loc[:, 'text']
test_vect = vectorizer.transform(testx)

In [38]:
# Predict a language label for every test row with the random forest.
y_pred = rfc.predict(X=test_vect)

In [39]:
# Attach the random-forest predictions to the test frame as `lang_id`.
test = test.assign(lang_id=y_pred)

In [42]:
# Preview the first five test rows together with their predicted labels.
test.head(n=5)

Unnamed: 0,index,text,lang_id
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele...",ssw
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...,nbl
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.,ven
3,4,Kube inja nelikati betingevakala kutsi titsini...,ssw
4,5,Winste op buitelandse valuta.,ssw


In [41]:
# Write the random-forest submission: only the `index` and `lang_id` columns,
# without the DataFrame's own row index.
test.to_csv('test_rfc_submission.csv', columns=['index', 'lang_id'], index=False)


# Gradient Boosting Classifier

In [45]:
from sklearn.ensemble import GradientBoostingClassifier

In [46]:
# Fit a gradient boosting classifier: 100 boosting stages of depth-1 trees
# with a learning rate of 1.0.
# random_state is pinned so that re-running the notebook rebuilds the same
# model, in line with the seeded random forest and train/validation split.
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=42,
)

gb_model.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=1.0, max_depth=1)

In [48]:
import sklearn.metrics as metrics


In [49]:
# Predict the validation split with the gradient boosting model.
pred_gb = gb_model.predict(X=X_val)

# Print the confusion matrix of the gradient boosting predictions
# (true labels first, predicted labels second).
print(metrics.confusion_matrix(y_true=y_val, y_pred=pred_gb))

[[881   0  19   0   0   0   0   0   0   0   0]
 [  3 829  68   0   0   0   0   0   0   0   0]
 [  1   1 799   8   0  35   2   3   0  36  15]
 [  0   0  14 860   1   0  25   0   0   0   0]
 [  0   0  14  35 833   0  17   1   0   0   0]
 [  0   7 392   2   0 478   0   4   3   0  14]
 [  0   3  16  49   1   0 831   0   0   0   0]
 [  0   0   5   5   1   0   2 873  14   0   0]
 [  0   0  12   0  12   0   4   1 871   0   0]
 [  0   5 165   4   0   0   0   1   0 715  10]
 [  0   6 416   6   0   8   0   3   0  22 439]]


In [53]:
# Macro-averaged F1 of the gradient boosting model on the validation split.
f1_score(y_true=y_val, y_pred=pred_gb, average="macro")

0.8567063173542224

In [50]:
# Vectorise the test text for the gradient boosting model, reusing the fitted TF-IDF vocabulary.
test_gb = test.loc[:, 'text']
test_vect_gb = vectorizer.transform(test_gb)

In [51]:
# Predict the language label of each test row with the gradient boosting model.
y_pred_gb = gb_model.predict(X=test_vect_gb)

In [52]:
# Store the gradient boosting predictions in the test frame's `lang_id` column.
test = test.assign(lang_id=y_pred_gb)

In [54]:
# Write the gradient boosting submission file (`index` and `lang_id` columns only).
test.to_csv('test_gb_submission.csv', columns=['index', 'lang_id'], index=False)

# Linear SVC

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

# Linear support vector classifier with default hyper-parameters.
# random_state is pinned so repeated runs give the same model, in line with
# the seeded random forest and train/validation split.
lsvc = LinearSVC(random_state=42)

# Fit on the training split and report accuracy on that same split.
# This is a training score, not a validation score.
lsvc.fit(X_train, y_train)
score = lsvc.score(X_train, y_train)
print("Training score: ", score)

In [12]:
# Run 10-fold cross-validation of the linear SVC on the training split and keep the per-fold scores.
cv_scores = cross_val_score(estimator=lsvc, X=X_train, y=y_train, cv=10)

In [13]:
from sklearn.metrics import confusion_matrix

# Predict the validation split with the fitted linear SVC and build the
# confusion matrix of true versus predicted labels.
ypred = lsvc.predict(X=X_val)
cm = confusion_matrix(y_true=y_val, y_pred=ypred)

In [14]:
from sklearn.metrics import classification_report

# Print per-language precision, recall and F1 for the current validation predictions.
print(classification_report(y_true=y_val, y_pred=ypred))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       900
         eng       1.00      1.00      1.00       900
         nbl       0.99      0.99      0.99       900
         nso       1.00      1.00      1.00       900
         sot       1.00      1.00      1.00       900
         ssw       1.00      1.00      1.00       900
         tsn       1.00      1.00      1.00       900
         tso       1.00      1.00      1.00       900
         ven       1.00      1.00      1.00       900
         xho       1.00      1.00      1.00       900
         zul       0.99      0.99      0.99       900

    accuracy                           1.00      9900
   macro avg       1.00      1.00      1.00      9900
weighted avg       1.00      1.00      1.00      9900



In [15]:
# Vectorise the test text for the linear SVC, reusing the fitted TF-IDF vocabulary.
test_lsv = test.loc[:, 'text']
test_vecto = vectorizer.transform(test_lsv)

In [16]:
# Predict the language label of each test row with the linear SVC.
y_pred_lsv = lsvc.predict(X=test_vecto)

In [17]:
# Store the linear SVC predictions in the test frame's `lang_id` column.
test = test.assign(lang_id=y_pred_lsv)

In [18]:
# Write the linear SVC submission file (`index` and `lang_id` columns only).
test.to_csv('test_lsv2_submission.csv', columns=['index', 'lang_id'], index=False)

# Multinomial NB

In [19]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial naive Bayes classifier with default settings on the training split.
model = MultinomialNB()
model = model.fit(X_train, y_train)

In [20]:
# Predict the validation split and print the fraction of rows labelled correctly.
predicted = model.predict(X=X_val)
is_correct = predicted == y_val

print(np.mean(is_correct))

0.9988888888888889


In [21]:
# Confusion matrix of the naive Bayes validation predictions (true versus predicted labels).
print(confusion_matrix(y_true=y_val, y_pred=predicted))

[[900   0   0   0   0   0   0   0   0   0   0]
 [  0 900   0   0   0   0   0   0   0   0   0]
 [  1   0 896   0   0   0   0   0   0   0   3]
 [  0   0   0 900   0   0   0   0   0   0   0]
 [  0   0   0   0 900   0   0   0   0   0   0]
 [  0   0   0   0   0 900   0   0   0   0   0]
 [  1   0   0   1   0   0 898   0   0   0   0]
 [  0   0   0   0   0   0   0 900   0   0   0]
 [  0   0   0   0   0   0   0   0 900   0   0]
 [  0   1   0   0   0   0   0   0   0 898   1]
 [  0   2   1   0   0   0   0   0   0   0 897]]


In [22]:
# Score the test set with the Multinomial NB model before exporting.
# Without this step test['lang_id'] still holds whatever an earlier model's
# cell assigned, so the file would not contain naive Bayes predictions.
test_vect_mnb = vectorizer.transform(test['text'])
y_pred_mnb = model.predict(test_vect_mnb)
test['lang_id'] = y_pred_mnb

# Save the naive Bayes submission file (`index` and `lang_id` columns only).
test[['index','lang_id']].to_csv('test_mnb_submission.csv', index=False)

# SGD Classifier

In [23]:
# Define a linear classifier trained with stochastic gradient descent
# (at most 5000 epochs, stopping tolerance 0.01) and fit it on the training split.
# random_state is pinned so that re-running the notebook reproduces the same
# model and submission, in line with the seeded random forest and split.
from sklearn.linear_model import SGDClassifier
sgdc = SGDClassifier(max_iter=5000, tol=0.01, random_state=42)
print(sgdc)

sgdc.fit(X_train, y_train)

SGDClassifier(max_iter=5000, tol=0.01)


SGDClassifier(max_iter=5000, tol=0.01)

In [24]:
# Accuracy of the SGD classifier on the training split it was fitted on.
score = sgdc.score(X=X_train, y=y_train)
print("Training score: ", score)

Training score:  0.9998268398268398


In [25]:
# Predict the validation split with the SGD classifier and print the
# confusion matrix of true versus predicted labels.
ypred = sgdc.predict(X=X_val)
cm = confusion_matrix(y_true=y_val, y_pred=ypred)
print(cm)

[[900   0   0   0   0   0   0   0   0   0   0]
 [  0 900   0   0   0   0   0   0   0   0   0]
 [  1   0 888   0   0   0   0   0   0   2   9]
 [  0   0   0 898   0   0   2   0   0   0   0]
 [  0   0   0   1 899   0   0   0   0   0   0]
 [  0   0   0   0   0 897   0   0   0   0   3]
 [  1   0   0   0   0   0 899   0   0   0   0]
 [  0   0   0   0   0   0   0 900   0   0   0]
 [  0   0   0   0   0   0   0   0 900   0   0]
 [  0   2   1   0   0   0   0   0   0 896   1]
 [  0   1   6   0   0   1   0   0   0   5 887]]


In [26]:
# Per-language precision, recall and F1 for the SGD classifier's validation predictions.
cr = classification_report(y_true=y_val, y_pred=ypred)
print(cr)

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       900
         eng       1.00      1.00      1.00       900
         nbl       0.99      0.99      0.99       900
         nso       1.00      1.00      1.00       900
         sot       1.00      1.00      1.00       900
         ssw       1.00      1.00      1.00       900
         tsn       1.00      1.00      1.00       900
         tso       1.00      1.00      1.00       900
         ven       1.00      1.00      1.00       900
         xho       0.99      1.00      0.99       900
         zul       0.99      0.99      0.99       900

    accuracy                           1.00      9900
   macro avg       1.00      1.00      1.00      9900
weighted avg       1.00      1.00      1.00      9900



In [27]:
# Vectorise the test text for the SGD classifier, reusing the fitted TF-IDF vocabulary.
test_sgdc = test.loc[:, 'text']
test_vector = vectorizer.transform(test_sgdc)

In [28]:
# Predict the language label of each test row with the SGD classifier.
y_pred_sgdc = sgdc.predict(X=test_vector)

In [29]:
# Store the SGD classifier predictions in the test frame's `lang_id` column.
test = test.assign(lang_id=y_pred_sgdc)

In [31]:
# Write the SGD classifier submission file (`index` and `lang_id` columns only).
test.to_csv('test_sgdc_submission5.csv', columns=['index', 'lang_id'], index=False)

# KNN

In [32]:
from sklearn.neighbors import KNeighborsClassifier

# Construct a k-nearest-neighbours classifier that votes over the four
# closest training rows, and print its configuration.
knc = KNeighborsClassifier(n_neighbors=4)
print(knc)

KNeighborsClassifier(n_neighbors=4)


KNeighborsClassifier(n_neighbors=4)

In [33]:
# Fit the k-nearest-neighbours model on the training split and report its
# accuracy on that same split.
score = knc.fit(X_train, y_train).score(X_train, y_train)
print("Training score: ", score)

Training score:  0.9824675324675325


In [34]:
from sklearn.metrics import confusion_matrix

# Predict the validation split with the k-nearest-neighbours model and print
# the confusion matrix of true versus predicted labels.
ypred = knc.predict(X=X_val)
cm = confusion_matrix(y_true=y_val, y_pred=ypred)
print(cm)

[[899   0   0   0   1   0   0   0   0   0   0]
 [  2 896   0   0   0   2   0   0   0   0   0]
 [  4   5 853   0   0   3   0   0   0  10  25]
 [  1   1   0 880   8   0  10   0   0   0   0]
 [  1   1   0  16 875   0   7   0   0   0   0]
 [  0   9   9   0   0 869   0   0   0   4   9]
 [  3   1   0  30  38   0 827   0   0   0   1]
 [  0   0   0   0   1   0   0 899   0   0   0]
 [  0   1   3   1   0   2   0   0 893   0   0]
 [  1   7  21   0   1   2   0   2   0 852  14]
 [  0  11  43   0   0  22   2   1   1  43 777]]


In [35]:
from sklearn.metrics import classification_report

# Per-language precision, recall and F1 for the k-nearest-neighbours validation predictions.
cr = classification_report(y_true=y_val, y_pred=ypred)
print(cr)

              precision    recall  f1-score   support

         afr       0.99      1.00      0.99       900
         eng       0.96      1.00      0.98       900
         nbl       0.92      0.95      0.93       900
         nso       0.95      0.98      0.96       900
         sot       0.95      0.97      0.96       900
         ssw       0.97      0.97      0.97       900
         tsn       0.98      0.92      0.95       900
         tso       1.00      1.00      1.00       900
         ven       1.00      0.99      1.00       900
         xho       0.94      0.95      0.94       900
         zul       0.94      0.86      0.90       900

    accuracy                           0.96      9900
   macro avg       0.96      0.96      0.96      9900
weighted avg       0.96      0.96      0.96      9900



In [36]:
# Vectorise the test text for the k-nearest-neighbours model, reusing the fitted TF-IDF vocabulary.
test_knc = test.loc[:, 'text']
test_vector = vectorizer.transform(test_knc)

In [37]:
# Predict the language label of each test row with the k-nearest-neighbours model.
y_pred_knc = knc.predict(X=test_vector)

In [38]:
# Store the k-nearest-neighbours predictions in the test frame's `lang_id` column.
test = test.assign(lang_id=y_pred_knc)

In [39]:
# Write the k-nearest-neighbours submission file (`index` and `lang_id` columns only).
test.to_csv('test_knc_submission.csv', columns=['index', 'lang_id'], index=False)