In [98]:
import pandas as pd
import time
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import pickle
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression, SGDClassifier

import string
import re

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


In [99]:
train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')

In [100]:
train.head(10)

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
5,nso,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...
6,tsn,kgetse nngwe le nngwe e e sa faposiwang mo tsh...
7,ven,mbadelo dze dza laelwa dzi do kwama mahatulele...
8,nso,maloko a dikhuduthamaga a ikarabela mongwe le ...
9,tsn,fa le dirisiwa lebone le tshwanetse go bontsha...


In [101]:
test.head(10)

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.
5,6,"Ke feela dilense tše hlakilego, tša pono e tee..."
6,7,<fn>(762010101403 AM) 1495 Final Gems Birthing...
7,8,Ntjhafatso ya konteraka ya mosebetsi: Etsa bon...
8,9,u-GEMS uhlinzeka ngezinzuzo zemithi yezifo ezi...
9,10,"So, on occasion, are statistics misused."


In [102]:
train["lang_id"].value_counts()

afr    3000
nso    3000
sot    3000
ven    3000
tsn    3000
ssw    3000
xho    3000
nbl    3000
zul    3000
tso    3000
eng    3000
Name: lang_id, dtype: int64

In [103]:
train.shape

(33000, 2)

In [104]:
train.describe()

Unnamed: 0,lang_id,text
count,33000,33000
unique,11,29948
top,afr,ngokwesekhtjheni yomthetho ophathelene nalokhu...
freq,3000,17


# Cleaning the Data

In [105]:
def data_clean(text):
    text = text.lower().replace('-', ' ')
    
    translation_table = str.maketrans('\n', ' ', string.punctuation+string.digits)
    
    text = text.translate(translation_table)
    return text


In [106]:
train['text'] = train['text'].apply(data_clean)
test['text'] = test['text'].apply(data_clean)

In [107]:
train.head(10)

Unnamed: 0,lang_id,text
0,xho,umgaqo siseko wenza amalungiselelo kumaziko ax...
1,xho,i dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
5,nso,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...
6,tsn,kgetse nngwe le nngwe e e sa faposiwang mo tsh...
7,ven,mbadelo dze dza laelwa dzi do kwama mahatulele...
8,nso,maloko a dikhuduthamaga a ikarabela mongwe le ...
9,tsn,fa le dirisiwa lebone le tshwanetse go bontsha...


# Feature Engineering 

In [108]:
X = train.text
y = train.lang_id

In [109]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.10)

In [110]:
x_train.shape, x_test.shape

((29700,), (3300,))

# Classifiers

In [111]:
classifiers = [
               MultinomialNB(),
               ComplementNB(),
               LinearSVC(random_state=0)
               ]


In [112]:
def models_building(classifiers, x_train, y_train, x_test, y_test):
    class_models = {}
    for clf in classifiers:
        clf_text = Pipeline([('tfidf', TfidfVectorizer(min_df=1,
                                                       max_df=0.9,
                                                       ngram_range=(1, 2))),
                             ('clf', clf)])
        start_time = time.time()
        clf_text.fit(x_train, y_train)
        predictions = clf_text.predict(x_test)
        run_time = time.time()-start_time

        # Output for each model
        class_models[clf.__class__.__name__] = {
            'F1-Macro': metrics.f1_score(y_test,
                                         predictions,
                                         average='macro'),
            'F1-Accuracy': metrics.f1_score(y_test, predictions,
                                            average='micro'),
            'F1-Weighted': metrics.f1_score(y_test,
                                            predictions,
                                            average='weighted'),
            'Execution Time': run_time}

    return pd.DataFrame.from_dict(class_models, orient='index')


In [113]:
class_df = models_building(classifiers, x_train, y_train, x_test, y_test)
sort_df = class_df.sort_values('F1-Macro', ascending=False)
sort_df

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted,Execution Time
MultinomialNB,0.998716,0.998788,0.998787,5.887033
ComplementNB,0.998114,0.998182,0.998181,5.761574
LinearSVC,0.997776,0.997879,0.997883,9.496264


# Building the Models

In [39]:
cv= CountVectorizer()
x_train_cv= cv.fit_transform(x_train)
x_test_cv= cv.transform(x_test)

In [391]:
rf=RandomForestClassifier()
rf.fit(x_train_cv,y_train)

RandomForestClassifier()

In [392]:
rf.score(x_test_cv,y_test)

0.9854545454545455

In [393]:
sv=SVC(kernel='linear')
sv.fit(x_train_cv,y_train)

SVC(kernel='linear')

In [394]:
sv.score(x_test_cv,y_test)

0.9948484848484849

In [40]:
nv=MultinomialNB()
nv.fit(x_train_cv,y_train)

MultinomialNB()

In [41]:
nv.score(x_test_cv,y_test)

0.9990909090909091

# Paramters/Tuning

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01)

In [80]:
#text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
#                     ('clf', nv),
#])
###########################################################
param_grid = {'alpha': [0.1, 1, 5, 10]}
#param_grid = [{'alpha': [0.1, 1, 10, 1000]}]
tuned_mnb = Pipeline([('tfidf', TfidfVectorizer(min_df=2,
                                                max_df=0.9,
                                                ngram_range=(1, 2))),
                      ('mnb', GridSearchCV(MultinomialNB(),
                                           param_grid=param_grid,
                                           cv=5,
                                           n_jobs=-1,
                                           scoring='f1_weighted')),
                      ])

tuned_mnb.fit(X_train, y_train)

y_pred = tuned_mnb.predict(X_test)

In [81]:
tuned_mnb.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.9, min_df=2, ngram_range=(1, 2))),
                ('mnb',
                 GridSearchCV(cv=5, estimator=MultinomialNB(), n_jobs=-1,
                              param_grid={'alpha': [0.1, 1, 5, 10]},
                              scoring='f1_weighted'))])

In [100]:
predictions = tuned_mnb.predict(X_test)

In [82]:
print(metrics.confusion_matrix(y_test,y_pred))

[[29  0  0  0  0  0  0  0  0  0  0]
 [ 0 30  0  0  0  0  0  0  0  0  0]
 [ 0  0 43  0  0  0  0  0  0  0  0]
 [ 0  0  0 33  0  0  0  0  0  0  0]
 [ 0  0  0  0 39  0  0  0  0  0  0]
 [ 0  0  0  0  0 23  0  0  0  0  0]
 [ 0  0  0  0  0  0 28  0  0  0  0]
 [ 0  0  0  0  0  0  0 34  0  0  0]
 [ 0  0  0  0  0  0  0  0 15  0  0]
 [ 0  0  0  0  0  0  0  0  0 22  0]
 [ 0  0  0  0  0  0  0  0  0  0 34]]


In [83]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00        29
         eng       1.00      1.00      1.00        30
         nbl       1.00      1.00      1.00        43
         nso       1.00      1.00      1.00        33
         sot       1.00      1.00      1.00        39
         ssw       1.00      1.00      1.00        23
         tsn       1.00      1.00      1.00        28
         tso       1.00      1.00      1.00        34
         ven       1.00      1.00      1.00        15
         xho       1.00      1.00      1.00        22
         zul       1.00      1.00      1.00        34

    accuracy                           1.00       330
   macro avg       1.00      1.00      1.00       330
weighted avg       1.00      1.00      1.00       330



In [84]:
print(metrics.accuracy_score(y_test,y_pred))

1.0


In [85]:
submission_df = pd.DataFrame(test['index'])
submission_df['lang_id'] = tuned_mnb.predict(test['text'])
submission_df.to_csv('Jason_Farrell_MultinomialNB.csv', index=False)

In [None]:
submission_df.head()

In [410]:
do_law_of_zipf(train)

NameError: name 'do_law_of_zipf' is not defined

## Randomd testing stuff below

In [9]:
# converting categorical variables to numerical

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [10]:
data_list = []
for text in X:
    text = re.sub(r'[!@#$(),\n"%^*?\:;~`0-9]', ' ', text)
    text = re.sub(r'[[]]', ' ', text)
    text = text.lower()
    data_list.append(text)

In [11]:
# creating bag of words using countvectorizer


cv = CountVectorizer()
X = cv.fit_transform(data_list).toarray()

In [12]:
X.shape

(33000, 141958)

In [14]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

MemoryError: Unable to allocate 31.4 GiB for an array with shape (29700, 141958) and data type int64