In [195]:
# imports for Natural Language  Processing
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.pipeline import Pipeline
import pickle

# Adding a comment


# feature extractioin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# classification models
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier


# Hyperparameter tunning methods
#import parfit.parfit as pf
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from imblearn.over_sampling import SMOTE

# metrics

from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer


In [165]:
# Importing the train & test data sets
train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')


In [166]:
#Checking if there are missing values in the Train dataset
train.isna().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   lang_id  33000 non-null  bool 
 1   text     33000 non-null  bool 
dtypes: bool(2)
memory usage: 64.6 KB


In [167]:
#Cheching if there are missing values in the Test dataset
test.isna().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5682 entries, 0 to 5681
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   index   5682 non-null   bool 
 1   text    5682 non-null   bool 
dtypes: bool(2)
memory usage: 11.2 KB


In [168]:
# Part of Speech for modeling
def POS(word):
    """
    This function gets the part of speech
    """
    pos_counts = Counter()
    probable_part_of_speech = wordnet.synsets(word)
    pos_counts["n"] = len([i for i in probable_part_of_speech if i.pos()=="n"])
    pos_counts["v"] = len([i for i in probable_part_of_speech if i.pos()=="v"])
    pos_counts["a"] = len([i for i in probable_part_of_speech if i.pos()=="a"])
    pos_counts["r"] = len([i for i in probable_part_of_speech if i.pos()=="r"])
    part_of_speech = pos_counts.most_common(1)[0][0]
    return part_of_speech

In [169]:
# Clean text
def text_lang(df):
    '''
    This function cleans the tweets by tokenizing, removing punctuation, 
    removing digits and removing 1 character tokens
    
    '''

    # tokenizing the tweets
    text_lang = df['text'].apply(TweetTokenizer().tokenize) ## first we tokenize

    # remove punctuation
    text_lang = text_lang.apply(lambda x : [token for token in x if token not in string.punctuation])

    # removing digits from the tweets
    text_lang = text_lang.apply(lambda x: [token for token in x if token not in list(string.digits)])

    # lastly we remove all one character tokens
    text_lang = text_lang.apply(lambda x: [token for token in x if len(token) > 1])
    
    df['cleaned_text'] = text_lang
    
    return df['cleaned_text']

In [170]:
text_lang(train)

0        [umgaqo-siseko, wenza, amalungiselelo, kumazik...
1        [i-dha, iya, kuba, nobulumko, bokubeka, umsebe...
2        [the, province, of, kwazulu-natal, department,...
3        [netefatša, gore, ba, file, dilo, ka, moka, tš...
4        [khomishini, ya, ndinganyiso, ya, mbeu, yo, ew...
                               ...                        
32995    [popo, ya, dipolateforomo, tse, ke, go, tlisa,...
32996    [modise, mosadi, na, ntse, sa, utlwe, hore, th...
32997    [closing, date, for, the, submission, of, comp...
32998    [nawuphina, umntu, ofunyenwe, enetyala, phants...
32999    [mafapha, mang, le, ona, lokela, ho, etsa, dit...
Name: cleaned_text, Length: 33000, dtype: object

In [171]:
train

Unnamed: 0,lang_id,text,cleaned_text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,"[umgaqo-siseko, wenza, amalungiselelo, kumazik..."
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,"[i-dha, iya, kuba, nobulumko, bokubeka, umsebe..."
2,eng,the province of kwazulu-natal department of tr...,"[the, province, of, kwazulu-natal, department,..."
3,nso,o netefatša gore o ba file dilo ka moka tše le...,"[netefatša, gore, ba, file, dilo, ka, moka, tš..."
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,"[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew..."
...,...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...,"[popo, ya, dipolateforomo, tse, ke, go, tlisa,..."
32996,sot,modise mosadi na o ntse o sa utlwe hore thaban...,"[modise, mosadi, na, ntse, sa, utlwe, hore, th..."
32997,eng,closing date for the submission of completed t...,"[closing, date, for, the, submission, of, comp..."
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...,"[nawuphina, umntu, ofunyenwe, enetyala, phants..."


In [183]:
y = train['lang_id'].values
X = train['text'].values

In [184]:
# vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=5,
# stop_words="english")
vectorizer = TfidfVectorizer(sublinear_tf=True,
                             smooth_idf=True,
                             max_df=0.3,
                            # min_df=1,
                             strip_accents='ascii',
                             ngram_range=(1, 2))
X_vectorized = vectorizer.fit_transform(X)

In [185]:
print('Shape of the vectorised data: {}'.format(X_vectorized.shape))

Shape of the vectorised data: (33000, 710950)


In [186]:
testx = test['text']
test_vect = vectorizer.transform(testx.values)
test_vect.data

array([0.334003  , 0.43226427, 0.23104263, ..., 0.57380339, 0.53319736,
       0.19925329])

In [188]:
# initializing a variable to the desired SMOTE
smote = SMOTE(random_state=2)#sampling_strategy='minority')

# fit SMOTE to training dataset
X_smote, y_smote = smote.fit_resample(X_vectorized, y)

In [189]:
X_smote.shape

(33000, 710950)

In [190]:
y_smote.shape

(33000,)

In [191]:
X_train, X_val, y_train, y_val = train_test_split(X_vectorized, y,
                                                  #X_smote,y_smote,
                                                  test_size=.1,
                                                  random_state=42
                                                  )

In [89]:
# Complement Naive Bayes
cnb = ComplementNB()
cnb.fit(X_train, y_train)
y_pred = cnb.predict(X_val)

print('Getting the Best Model Performance' + '\n')
print('Accuracy: {}'.format(accuracy_score(y_val, y_pred)))
print('F1: {}'.format(f1_score(y_val, y_pred, average='macro')))
print('\n' + classification_report(y_val, y_pred))

Getting the Best Model Performance

Accuracy: 0.996969696969697
F1: 0.9970141197848694

              precision    recall  f1-score   support

         afr       0.99      1.00      1.00       281
         eng       0.99      1.00      0.99       297
         nbl       0.99      0.99      0.99       327
         nso       1.00      1.00      1.00       322
         sot       1.00      1.00      1.00       307
         ssw       1.00      1.00      1.00       286
         tsn       1.00      1.00      1.00       297
         tso       1.00      1.00      1.00       253
         ven       1.00      1.00      1.00       322
         xho       1.00      1.00      1.00       313
         zul       1.00      0.99      0.99       295

    accuracy                           1.00      3300
   macro avg       1.00      1.00      1.00      3300
weighted avg       1.00      1.00      1.00      3300



In [90]:
from sklearn.neighbors import KNeighborsClassifier

In [138]:
# Carrying out cross-validation and checking F1 score for different classifiers
random_state = 42
kf = KFold(n_splits=10,
           random_state=random_state,
           shuffle=True)  # Define number of KFolds

In [142]:
params = {'alpha': [0.1, 0.5, 1, 10],
          'norm': [True, False]}

clf2 = GridSearchCV(ComplementNB(),
                    param_grid=params,
                    cv=kf,
                    scoring=make_scorer(f1_score,
                                        average='macro'))
# Fit the gridsearch on the dataset
clf2 = clf2.fit(X_train, y_train)

In [143]:
y_pred = clf2.predict(X_val)

print('Getting the Best Model Performance' + '\n')
print('Accuracy: {}'.format(accuracy_score(y_val, y_pred)))
print('F1: {}'.format(f1_score(y_val, y_pred, average='macro')))
print('\n' + classification_report(y_val, y_pred))

Getting the Best Model Performance

Accuracy: 0.9975757575757576
F1: 0.9976053957960019

              precision    recall  f1-score   support

         afr       0.99      1.00      1.00       281
         eng       1.00      1.00      1.00       297
         nbl       0.99      1.00      1.00       327
         nso       1.00      0.99      1.00       322
         sot       0.99      1.00      1.00       307
         ssw       1.00      1.00      1.00       286
         tsn       1.00      1.00      1.00       297
         tso       1.00      1.00      1.00       253
         ven       1.00      1.00      1.00       322
         xho       1.00      1.00      1.00       313
         zul       1.00      0.99      0.99       295

    accuracy                           1.00      3300
   macro avg       1.00      1.00      1.00      3300
weighted avg       1.00      1.00      1.00      3300



In [144]:
testx = test['text']
test_vect = vectorizer.transform(testx.values)
# Predict the sentiment using the test data
y_pred = clf2.predict(test_vect)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
test['lang_id'] = y_pred
# Look into the data that will be submitted on Kaggle as csv
test[['index', 'lang_id']].head()
# save the csv file and submit it.
test[['index', 'lang_id']].to_csv('test_ComplementNB_submission.csv', index=False)

In [96]:
cnb = ComplementNB(alpha=clf2.best_params_['alpha'],
                   norm=clf2.best_params_['norm'])
cnb.fit(X_train, y_train)
y_pred = cnb.predict(X_val)

cnb_tuned = ComplementNB()
cnb_tuned.fit(X_train, y_train)
y_pred_tuned = cnb_tuned.predict(X_val)

In [98]:
testx = test['text']
test_vect = vectorizer.transform(testx.values)
# Predict the sentiment using the test data
y_pred = cnb.predict(test_vect)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
test['lang_id'] = y_pred
# Look into the data that will be submitted on Kaggle as csv
test[['index', 'lang_id']].head()
# save the csv file and submit it.
test[['index', 'lang_id']].to_csv('test_ComplementNBtuned_submission.csv', index=False)

In [139]:
# Specify the range of 'C' parameters for LinearSVC
params = {'C': [0.1, 0.5, 1, 5, 10]}

# Setting the GridSearch for the best parameters
clf = GridSearchCV(LinearSVC(max_iter=4000, multi_class='ovr'),
                   param_grid=params, cv=kf,
                   scoring=make_scorer(f1_score, average='macro'))

# Fit the gridsearch on the dataset
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print('Getting the Best Model Performance' + '\n')
print('Accuracy: {}'.format(accuracy_score(y_val, y_pred)))
print('F1: {}'.format(f1_score(y_val, y_pred, average='macro')))
print('\n' + classification_report(y_val, y_pred))

Getting the Best Model Performance

Accuracy: 0.9972727272727273
F1: 0.9973020448773352

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       281
         eng       1.00      1.00      1.00       297
         nbl       1.00      0.99      1.00       327
         nso       1.00      0.99      1.00       322
         sot       0.99      1.00      1.00       307
         ssw       1.00      1.00      1.00       286
         tsn       1.00      1.00      1.00       297
         tso       1.00      1.00      1.00       253
         ven       1.00      1.00      1.00       322
         xho       0.99      1.00      1.00       313
         zul       0.99      1.00      0.99       295

    accuracy                           1.00      3300
   macro avg       1.00      1.00      1.00      3300
weighted avg       1.00      1.00      1.00      3300



In [141]:
testx = test['text']
test_vect = vectorizer.transform(testx.values)
# Predict the sentiment using the test data
y_pred = clf.predict(test_vect)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
test['lang_id'] = y_pred
# Look into the data that will be submitted on Kaggle as csv
test[['index', 'lang_id']].head()
# save the csv file and submit it.
test[['index', 'lang_id']].to_csv('test_LinearSVC_submission.csv', index=False)