<a href="https://colab.research.google.com/github/JasonAlexanderDunbar/classification-predict-streamlit-template/blob/master/Jason_Dunbar_Classification_Hack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **EDSA CLASSIFICATION HACK - USE VARIOUS NLP TECHNIQUES TO IDENTIFY A TEXT DOCUMENT AS ONE OF SOUTH AFRICA'S OFFICIAL LANGUAGES**

# PACKAGE IMPORTS

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
import re
from string import punctuation

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

# DATA IMPORTS

In [2]:
df_train = pd.read_csv('train_set.csv')
df_test = pd.read_csv('test_set.csv')

# INITIAL EXPLORATORY DATA ANALYSIS

In [3]:
df_train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [4]:
df_train.shape

(33000, 2)

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   lang_id  33000 non-null  object
 1   text     33000 non-null  object
dtypes: object(2)
memory usage: 515.8+ KB


In [6]:
df_train['lang_id'].value_counts()

zul    3000
sot    3000
eng    3000
ssw    3000
ven    3000
xho    3000
tsn    3000
tso    3000
afr    3000
nso    3000
nbl    3000
Name: lang_id, dtype: int64

In [7]:
df_test.reset_index()

Unnamed: 0,level_0,index,text
0,0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,3,4,Kube inja nelikati betingevakala kutsi titsini...
4,4,5,Winste op buitelandse valuta.
...,...,...,...
5677,5677,5678,You mark your ballot in private.
5678,5678,5679,Ge o ka kgetha ka bowena go se šomiše Mofani k...
5679,5679,5680,"E Ka kopo etsa kgetho ya hao ka hloko, hobane ..."
5680,5680,5681,"TB ke bokudi ba PMB, mme Morero o tla lefella ..."


In [8]:
np.array(df_test.index)

array([   0,    1,    2, ..., 5679, 5680, 5681])

# TEXT PRE-PROCESSING

In [9]:
def no_capitals(df):
    
    df['no_capitals'] = df['text'].str.lower()
    
    return df



In [10]:
no_capitals(df_train).head(15)

Unnamed: 0,lang_id,text,no_capitals
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...
5,nso,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...
6,tsn,kgetse nngwe le nngwe e e sa faposiwang mo tsh...,kgetse nngwe le nngwe e e sa faposiwang mo tsh...
7,ven,mbadelo dze dza laelwa dzi do kwama mahatulele...,mbadelo dze dza laelwa dzi do kwama mahatulele...
8,nso,maloko a dikhuduthamaga a ikarabela mongwe le ...,maloko a dikhuduthamaga a ikarabela mongwe le ...
9,tsn,fa le dirisiwa lebone le tshwanetse go bontsha...,fa le dirisiwa lebone le tshwanetse go bontsha...


In [11]:
no_capitals(df_test)

Unnamed: 0,index,text,no_capitals
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele...","mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...,uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.,tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...,kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.,winste op buitelandse valuta.
...,...,...,...
5677,5678,You mark your ballot in private.,you mark your ballot in private.
5678,5679,Ge o ka kgetha ka bowena go se šomiše Mofani k...,ge o ka kgetha ka bowena go se šomiše mofani k...
5679,5680,"E Ka kopo etsa kgetho ya hao ka hloko, hobane ...","e ka kopo etsa kgetho ya hao ka hloko, hobane ..."
5680,5681,"TB ke bokudi ba PMB, mme Morero o tla lefella ...","tb ke bokudi ba pmb, mme morero o tla lefella ..."


In [12]:
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [13]:
def remove_punctuation(text_data):
    return ''.join([l for l in text_data if l not in punctuation])

In [14]:
df_train['no_punctuation'] = df_train['no_capitals'].apply(remove_punctuation)
df_train.head(15)

Unnamed: 0,lang_id,text,no_capitals,no_punctuation
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,i-dha iya kuba nobulumko bokubeka umsebenzi na...,idha iya kuba nobulumko bokubeka umsebenzi nap...
2,eng,the province of kwazulu-natal department of tr...,the province of kwazulu-natal department of tr...,the province of kwazulunatal department of tra...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...
5,nso,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...
6,tsn,kgetse nngwe le nngwe e e sa faposiwang mo tsh...,kgetse nngwe le nngwe e e sa faposiwang mo tsh...,kgetse nngwe le nngwe e e sa faposiwang mo tsh...
7,ven,mbadelo dze dza laelwa dzi do kwama mahatulele...,mbadelo dze dza laelwa dzi do kwama mahatulele...,mbadelo dze dza laelwa dzi do kwama mahatulele...
8,nso,maloko a dikhuduthamaga a ikarabela mongwe le ...,maloko a dikhuduthamaga a ikarabela mongwe le ...,maloko a dikhuduthamaga a ikarabela mongwe le ...
9,tsn,fa le dirisiwa lebone le tshwanetse go bontsha...,fa le dirisiwa lebone le tshwanetse go bontsha...,fa le dirisiwa lebone le tshwanetse go bontsha...


In [15]:
df_test['no_punctuation'] = df_test['no_capitals'].apply(remove_punctuation)

# INITIAL MACHINE LEARNING MODELIING

In [16]:
vect = CountVectorizer(max_features=2000)

In [17]:
X = vect.fit_transform(df_train['no_punctuation']).toarray()

In [None]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [18]:
le = LabelEncoder()

In [None]:
df_train['lang_id']

0        xho
1        xho
2        eng
3        nso
4        ven
        ... 
32995    tsn
32996    sot
32997    eng
32998    xho
32999    sot
Name: lang_id, Length: 33000, dtype: object

In [18]:
y = le.fit_transform(df_train['lang_id'])

y

array([9, 9, 1, ..., 1, 9, 4])

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

NameError: ignored

In [None]:
y_t_df  = pd.DataFrame(y_train)
y_t_df.value_counts()

7     2300
5     2268
2     2266
10    2265
6     2254
0     2246
4     2241
9     2239
1     2238
3     2218
8     2215
dtype: int64

In [None]:
logreg = LogisticRegression(multi_class='ovr')
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
y_pred = logreg.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

0.9798787878787879

In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       754
           1       1.00      1.00      1.00       762
           2       0.95      0.95      0.95       734
           3       1.00      0.99      1.00       782
           4       1.00      1.00      1.00       759
           5       0.96      0.96      0.96       732
           6       1.00      1.00      1.00       746
           7       1.00      1.00      1.00       700
           8       1.00      1.00      1.00       785
           9       0.95      0.95      0.95       761
          10       0.93      0.93      0.93       735

    accuracy                           0.98      8250
   macro avg       0.98      0.98      0.98      8250
weighted avg       0.98      0.98      0.98      8250



### using the above algorithm to make predictions of of the test dataset:

In [None]:
X_df_test = vect.fit_transform(df_test['no_punctuation']).toarray()

In [None]:
y_pred_test = logreg.predict(X_df_test)

In [None]:
y_pred_test = le.inverse_transform(y_pred_test)

In [None]:
y_pred_test

array(['ssw', 'ssw', 'xho', ..., 'ssw', 'ssw', 'ssw'], dtype=object)

In [None]:
submission_hackathon = pd.DataFrame({
    'index' : df_test['index'],
    'lang_id' : y_pred_test
})

In [None]:
submission_hackathon.to_csv('submission_hackathon.csv', index=False)

In [None]:
from google.colab import files

files.download('submission_hackathon.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Naive Bayes algortithm

In [19]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

In [20]:
vect = CountVectorizer(stop_words='english', max_features=19000, ngram_range=(1,3), max_df=0.5)

In [21]:
X = vect.fit_transform(df_train['no_punctuation']).toarray()

In [19]:
df_train['lang_id']

0        xho
1        xho
2        eng
3        nso
4        ven
        ... 
32995    tsn
32996    sot
32997    eng
32998    xho
32999    sot
Name: lang_id, Length: 33000, dtype: object

In [22]:
y = le.fit_transform(df_train['lang_id'].values.astype(str))
y

array([9, 9, 1, ..., 1, 9, 4])

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

In [24]:
model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
y_pred = model.predict(X_test)

In [26]:
accuracy_score(y_test, y_pred)

0.9970909090909091

In [27]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       783
           1       1.00      1.00      1.00       752
           2       0.99      0.99      0.99       762
           3       1.00      1.00      1.00       725
           4       1.00      1.00      1.00       707
           5       1.00      1.00      1.00       786
           6       1.00      1.00      1.00       741
           7       1.00      1.00      1.00       729
           8       1.00      1.00      1.00       746
           9       0.99      1.00      1.00       745
          10       0.99      0.98      0.99       774

    accuracy                           1.00      8250
   macro avg       1.00      1.00      1.00      8250
weighted avg       1.00      1.00      1.00      8250



In [28]:
X_t = vect.transform(df_test['no_punctuation']).toarray()

In [29]:
y_pred_test = model.predict(X_t)

In [30]:
y_pred_test = le.inverse_transform(y_pred_test)
y_pred_test

array(['tsn', 'nbl', 'ven', ..., 'sot', 'sot', 'xho'], dtype='<U3')

In [31]:
submission_nayes = pd.DataFrame({
    'index': df_test['index'],
    'lang_id': y_pred_test
})

In [32]:
submission_nayes.to_csv('submission_nayes.csv', index=False)

In [33]:
from google.colab import files

files.download('submission_nayes.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# ITERATIVE MACHINE LEARNING MODELLING


In [None]:
vect = CountVectorizer(max_features=10000)

In [None]:
X = vect.fit_transform(df_train['no_punctuation']).toarray()

In [None]:
le = LabelEncoder()

In [None]:
y = le.fit_transform(df_train['lang_id'])
y

array([9, 9, 1, ..., 1, 9, 4])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X[: 20000], y[: 20000], test_size=0.25, random_state=42)

In [None]:
model_names = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'Naive_Bayes', 'Gradient Boosting']

classifiers = [
               LogisticRegression(multi_class='ovr', solver='liblinear'),
               DecisionTreeClassifier(max_depth=5),
               RandomForestClassifier(max_depth=5, n_estimators=100, max_features=1),
               MultinomialNB(),
               GradientBoostingClassifier()
]

In [None]:
results = []

models = {}
confusion = {}
class_report = {}


for name, clf in zip(model_names, classifiers):    
    print ('Fitting {:s} model...'.format(name))
    clf.fit(X_train, y_train)
    
    print ('... predicting')
    y_pred = clf.predict(X_train)   
    y_pred_test = clf.predict(X_test)
    
    print ('... scoring')
    accuracy  = accuracy_score(y_train, y_pred)
    precision = metrics.precision_score(y_train, y_pred, average='weighted')
    recall    = metrics.recall_score(y_train, y_pred, average='weighted')
    
    f1        = metrics.f1_score(y_train, y_pred, average='weighted')    
    f1_test   = metrics.f1_score(y_test, y_pred_test, average='weighted')    
    
    # Save the results to dictionaries
    models[name] = clf    
    confusion[name] = metrics.confusion_matrix(y_train, y_pred)
    class_report[name] = metrics.classification_report(y_train, y_pred)
    
    results.append([name, accuracy, precision, recall, f1, f1_test])

    
results = pd.DataFrame(results, columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Train', 'F1 Test'])
results.set_index('Classifier', inplace= True)

print ('... All done!')

Fitting Logistic Regression model...
... predicting
... scoring
Fitting Decision Tree model...
... predicting
... scoring
Fitting Random Forest model...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


... predicting
... scoring
Fitting Naive_Bayes model...
... predicting
... scoring
Fitting Gradient Boosting model...
... predicting
... scoring
... All done!


In [None]:
results

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Train,F1 Test
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Logistic Regression,0.999933,0.999933,0.999933,0.999933,0.992998
Decision Tree,0.570733,0.713231,0.570733,0.520612,0.517733
Random Forest,0.495867,0.794503,0.495867,0.537461,0.52262
Naive_Bayes,0.9986,0.998602,0.9986,0.998599,0.996598
Gradient Boosting,0.9856,0.985878,0.9856,0.985645,0.965784


### Deeper analysis of SVMs

In [None]:
vect = CountVectorizer(stop_words='english', max_features=20000, ngram_range=(1,3))

In [None]:
X = vect.fit_transform(df_train['no_punctuation']).toarray()

In [None]:
y = le.fit_transform(df_train['lang_id'])
y

array([9, 9, 1, ..., 1, 9, 4])

In [None]:
svm = SVC()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X[: 5000], y[: 5000], test_size=0.25, random_state=42)

In [None]:
svm.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
y_pred = svm.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

0.976

In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       112
           1       0.94      1.00      0.97       117
           2       0.97      0.92      0.94       109
           3       0.96      1.00      0.98       110
           4       1.00      0.97      0.98       125
           5       1.00      0.92      0.96       111
           6       0.99      0.99      0.99       110
           7       1.00      0.98      0.99       119
           8       0.98      1.00      0.99       118
           9       0.93      0.99      0.96       112
          10       0.96      0.97      0.97       107

    accuracy                           0.98      1250
   macro avg       0.98      0.98      0.98      1250
weighted avg       0.98      0.98      0.98      1250



In [None]:
X_t = vect.transform(df_test['no_punctuation']).toarray()

In [None]:
y_pred_t = svm.predict(X_t)

In [None]:
y_pred_t = le.inverse_transform(y_pred_t)
y_pred_t

In [None]:
submission_svm = pd.DataFrame({
    'index': df_test['index'],
    'lang_id': y_pred_t
})

In [None]:
submission_svm.to_csv('submission_svm.csv', index=False)

In [None]:
files.download('submission_svm.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### AdaBoost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier



In [None]:
vect = CountVectorizer(stop_words='english', max_features=5000, ngram_range=(1,3))

In [None]:
X = vect.fit_transform(df_train['no_punctuation']).toarray()

In [None]:
y = le.fit_transform(df_train['lang_id'])
y

array([9, 9, 1, ..., 1, 9, 4])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
gbc = GradientBoostingClassifier()

In [None]:
gbc.fit(X[:3300], y[:3300])

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
y_pred = gbc.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

0.9353939393939394

# SUBMISSION PREDICTIONS