In [None]:
import csv
import re

import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, linear_model, pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, log_loss, make_scorer, recall_score, precision_score, f1_score

import matplotlib.pyplot as plt

# Seed passed to the dataset shuffles further down so reruns give the same row order
RANDOM_STATE = 1

In [None]:
# Published Google Sheets exports (output=tsv) holding the tweet/label data.
# url_train_dev: sheet used for training and cross-validation.
url_train_dev = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTOZ2rC82rhNsJduoyKYTsVeH6ukd7Bpxvxn_afOibn3R-eadZGXu82eCU9IRpl4CK_gefEGsYrA_oM/pub?gid=1863430984&single=true&output=tsv'
# url_test: sheet used only for the final evaluation.
url_test = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vT-KNR9nuYatLkSbzSRgpz6Ku1n4TN4w6kKmFLkA6QJHTfQzmX0puBsLF7PAAQJQAxUpgruDd_RRgK7/pub?gid=417546901&single=true&output=tsv'

In [None]:
from io import StringIO
import requests

def load_dataset(url, timeout=30):
    """Download a tab-separated dataset and return it as a two-column DataFrame.

    Parameters
    ----------
    url : str
        Location of the TSV resource to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    pandas.DataFrame
        The parsed table with its columns renamed to ``['tweet', 'label']``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page
        is never parsed as if it were data.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of feeding an error body to read_csv
    r.raise_for_status()
    data = r.content.decode('utf8')
    # NOTE(review): read_csv treats the first line as a header row, which is
    # then replaced below; confirm the sheet really starts with a header.
    df = pd.read_csv(StringIO(data), sep='\t')
    df.columns = ['tweet', 'label']
    return df

In [None]:
# Download both sheets (train/dev first, then test) into DataFrames
df_train_dev, df_test = map(load_dataset, (url_train_dev, url_test))

In [None]:
# Print the frame summary: number of rows, column dtypes and non-null counts
df_train_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52675 entries, 0 to 52674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   52675 non-null  object
 1   label   52675 non-null  object
dtypes: object(2)
memory usage: 823.2+ KB


In [None]:
# Preview the first rows of the train/dev frame
df_train_dev.head()

Unnamed: 0,tweet,label
0,يا من أناديها ويخنقني البكاء ويكاد صمت الدمع ...,ar
1,فيه فرق بين اهل غزة اللى مطحونين من ناحيتين وب...,ar
2,ﻋﻦ ﺍﻟﻠﺤﻈﺔ اﻟﺤﻠﻮﺓﺓ ﺍﻟﻠﻲ ﺑﺘﻐﻤﺾ ﻓﻴﻬﺎ ﻋﻴﻨﻴﻚ ﺑﺘﻔﻜﺮ ...,ar
3,يا ابو سلو عرفتني,ar
4,ب50 ريال أكفل معتمر في رمضان ، ولك بإذن الله م...,ar


In [None]:
# Distinct language labels of the train/dev data, in order of first appearance
labels = df_train_dev['label'].unique().tolist()

In [None]:
# Clean data: remove URLs, @mentions and #hashtags from the tweet text.
# Raw strings keep `\S` a regex escape rather than an invalid Python string
# escape (which triggers a warning on recent Python versions).
_URL_RE = re.compile(r'https?://\S+')
_MENTION_RE = re.compile(r'@\S+')
_HASHTAG_RE = re.compile(r'#\S+')


def clean(s):
    """Return ``s`` with URLs, @mentions and #hashtags removed.

    Each match runs from its marker (``http://``/``https://``, ``@`` or ``#``)
    up to the next whitespace character and is replaced by the empty string;
    surrounding whitespace is left untouched.

    Parameters
    ----------
    s : str
        Raw tweet text.

    Returns
    -------
    str
        The text without URLs, mentions and hashtags.
    """
    # Order matters only for readability: URLs first, then mentions, then hashtags
    for pattern in (_URL_RE, _MENTION_RE, _HASHTAG_RE):
        s = pattern.sub('', s)
    return s
  
# Run the cleaner over the tweet column of both frames
for frame in (df_train_dev, df_test):
    frame['tweet'] = frame['tweet'].apply(clean)

In [None]:
# Build the feature matrices and targets used for training and evaluation.

# Shuffle both frames with the shared seed
df_train = shuffle(df_train_dev, random_state=RANDOM_STATE)
df_test = shuffle(df_test, random_state=RANDOM_STATE)

# Targets: test labels stay as strings, training labels are integer-encoded
y_test = df_test['label']
y_train = df_train['label']
lbl_enc = preprocessing.LabelEncoder()
lbl_enc.fit(y_train.values)
y_train = lbl_enc.transform(y_train.values)

# Bag-of-words features; the vocabulary is learned from the training tweets only
vectorizer = CountVectorizer()
vectorizer.fit(df_train['tweet'])
X_train = vectorizer.transform(df_train['tweet'])
X_test = vectorizer.transform(df_test['tweet'])



In [None]:
# Macro-averaged F1 scorer handed to both grid searches
f1 = make_scorer(score_func=f1_score, average='macro')

In [None]:
# Search the hyperparameter space and pick the best combination for the SGDClassifier

# Initiate SGDClassifier; seeded so the grid-search results are reproducible
model = linear_model.SGDClassifier(random_state=RANDOM_STATE)

# Single-step pipeline; the step name 'sgdc' is the prefix of the grid keys below
clf = pipeline.Pipeline([('sgdc', model)])

# Hyperparameter space.
# 'log_loss' is the current scikit-learn name of the logistic loss
# (formerly 'log', which recent releases no longer accept).
param_grid = {
    'sgdc__loss': ['hinge', 'log_loss'],
    'sgdc__penalty': ['l2', 'l1'],
    'sgdc__alpha': [0.0001, 0.1, 100],
    'sgdc__early_stopping': [True, False],
}

# Initiate grid search: 3-fold CV scored with macro F1.
# The former `iid` argument is gone: scikit-learn removed it and passing it
# raises a TypeError on current releases.
grid_search_sgdc = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=f1,
                                verbose=10, n_jobs=-1, refit=True, cv=3)

# Fit on the training set; refit=True retrains the best candidate on all of it
grid_search_sgdc.fit(X_train, y_train)

print("Best score: %0.3f" % grid_search_sgdc.best_score_)
print("Best parameters set:", grid_search_sgdc.best_params_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   35.1s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   51.7s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  2.8min finished


Best score: 0.291
Best parameters set: {'sgdc__alpha': 0.0001, 'sgdc__early_stopping': False, 'sgdc__loss': 'hinge', 'sgdc__penalty': 'l2'}


In [None]:
# Train a Multinomial Naive Bayes classifier; the smoothing strength is chosen by grid search

# Initiate MNB
nb_model = MultinomialNB()

# Single-step pipeline; the step name 'nb' is the prefix of the grid key below
clf = pipeline.Pipeline([('nb', nb_model)])

# Hyperparameter space: additive smoothing parameter alpha
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initiate grid search: 3-fold CV scored with macro F1.
# The former `iid` argument is gone: scikit-learn removed it and passing it
# raises a TypeError on current releases.
grid_search_nb = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=f1,
                              verbose=10, n_jobs=-1, refit=True, cv=3)

# Fit the grid search; with refit=True the best estimator is retrained on the whole training data
grid_search_nb.fit(X_train, y_train)

print("Best f1 score: %.3f" % grid_search_nb.best_score_)
print("Best parameters set: %s" % grid_search_nb.best_params_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    5.1s finished


Best f1 score: 0.349
Best parameters set: {'nb__alpha': 0.1}


In [None]:
# Label the test set with the refitted winner of each grid search

# Best estimators found for SGDClassifier and Multinomial Naive Bayes
model_sgdc = grid_search_sgdc.best_estimator_
model_nb = grid_search_nb.best_estimator_

# Predictions come back as encoded integers
y_test_pred_sgdc = model_sgdc.predict(X_test)
y_test_pred_nb = model_nb.predict(X_test)

# Map the integer predictions back to the original string labels
label_pred_sgdc = lbl_enc.inverse_transform(y_test_pred_sgdc)
label_pred_nb = lbl_enc.inverse_transform(y_test_pred_nb)


In [None]:
# Compare the test accuracy of the two best models
accur_sgdc, accur_nb = (
    accuracy_score(y_test, predicted)
    for predicted in (label_pred_sgdc, label_pred_nb)
)
print("Accuracy of best SGDClassifier: %0.3f \nAccuracy of best Multinomial Naive Bayes classifier: %0.3f" %(accur_sgdc, accur_nb))

Accuracy of best SGDClassifier: 0.821 
Accuracy of best Multinomial Naive Bayes classifier: 0.731


In [None]:
# Confusion matrices over the labels that occur in the test set
test_labels = np.unique(y_test)

# SGDClassifier: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, label_pred_sgdc, labels=test_labels)
cm_sgdc = pd.DataFrame(data=cm, index=test_labels, columns=test_labels)

# Multinomial Naive Bayes, same layout
cm = confusion_matrix(y_test, label_pred_nb, labels=test_labels)
cm_nb = pd.DataFrame(data=cm, index=test_labels, columns=test_labels)

# Write both matrices to CSV (with row and column labels) for easier reading
cm_sgdc.to_csv("confusion matrix_SGDC.csv")
cm_nb.to_csv("confusion matrix_NB.csv")

In [None]:
# Macro-averaged precision and recall of the two models over the test-set labels
macro_kwargs = dict(labels=test_labels, average='macro')

ave_precision_sgdc = precision_score(y_test, label_pred_sgdc, **macro_kwargs)
ave_recall_sgdc = recall_score(y_test, label_pred_sgdc, **macro_kwargs)

ave_precision_nb = precision_score(y_test, label_pred_nb, **macro_kwargs)
ave_recall_nb = recall_score(y_test, label_pred_nb, **macro_kwargs)

print("average macro (precision, recall) of SGDClassifier: (%.3f, %.3f) " % (ave_precision_sgdc, ave_recall_sgdc))
print("average macro (precision, recall) of MNB: (%.3f, %.3f) " % (ave_precision_nb, ave_recall_nb))

  _warn_prf(average, modifier, msg_start, len(result))


average macro (precision, recall) of SGDClassifier: (0.377, 0.227) 
average macro (precision, recall) of MNB: (0.397, 0.264) 


In [None]:
# Per-label precision and recall, one row per label seen in the train/dev data
per_label = dict(labels=labels, average=None)

# Column vectors (n_labels, 1) so they can be placed side by side
precision_sgdc = precision_score(y_test, label_pred_sgdc, **per_label).reshape(-1, 1)
recall_sgdc = recall_score(y_test, label_pred_sgdc, **per_label).reshape(-1, 1)
precision_nb = precision_score(y_test, label_pred_nb, **per_label).reshape(-1, 1)
recall_nb = recall_score(y_test, label_pred_nb, **per_label).reshape(-1, 1)

# One table per model: rows indexed by label, columns precision / recall
metric_columns = ['precision', 'recall']
metrics_sgdc = pd.DataFrame(np.hstack((precision_sgdc, recall_sgdc)),
                            index=labels, columns=metric_columns)
metrics_nb = pd.DataFrame(np.hstack((precision_nb, recall_nb)),
                          index=labels, columns=metric_columns)

print('Metrics of SGDClassifer:')
print(metrics_sgdc)
print('\nMetrics of Multinomial Naive Bayes: \n', metrics_nb)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Metrics of SGDClassifer:
         precision    recall
ar        0.986333  0.818526
ar_LATN   0.000000  0.000000
az        0.000000  0.000000
bg        0.000000  0.000000
bn        0.000000  0.000000
...            ...       ...
vi        0.000000  0.000000
wo        0.000000  0.000000
xh        0.000000  0.000000
zh-CN     0.000000  0.000000
zh-TW     0.000000  0.000000

[69 rows x 2 columns]

Metrics of Multinomial Naive Bayes: 
          precision   recall
ar        0.984252  0.94518
ar_LATN   0.000000  0.00000
az        0.000000  0.00000
bg        0.000000  0.00000
bn        0.000000  0.00000
...            ...      ...
vi        1.000000  0.20000
wo        0.000000  0.00000
xh        0.000000  0.00000
zh-CN     0.000000  0.00000
zh-TW     0.000000  0.00000

[69 rows x 2 columns]


In [None]:
# Number of tweets per label in the training set; shows how unevenly the
# labels are represented
df_train.groupby(['label']).count()

Unnamed: 0_level_0,tweet
label,Unnamed: 1_level_1
ar,2199
ar_LATN,12
az,1
bg,2
bn,8
...,...
vi,16
wo,1
xh,1
zh-CN,25
