In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from matplotlib import pyplot as plt
import seaborn as sns

from scipy.sparse import vstack, hstack
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.dummy import DummyClassifier
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder

In [2]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
path = '/content/drive/My Drive/BT4222 Notebooks/final_data.csv'
df = pd.read_csv(path)

# Examine first five rows to check if the csv file is read properly into pandas
df.head(5)

Unnamed: 0.1,Unnamed: 0,keyword,location,text,target
0,0,no_keyword,no_location,Our Deeds are the Reason of this earthquake Ma...,1
1,1,no_keyword,no_location,Forest fire near La Ronge Sask Canada,1
2,2,no_keyword,no_location,All residents asked to shelter in place are be...,1
3,3,no_keyword,no_location,people receive wildfires evacuation orders in ...,1
4,4,no_keyword,no_location,Just got sent this photo from Ruby Alaska as s...,1


In [4]:
df.shape

(22624, 5)

In [5]:
df = df.dropna()

In [6]:
df.shape

(22616, 5)

In [7]:
X = df.text
y = df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)

In [8]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(13569,)
(4523,)
(4524,)
(13569,)
(4523,)
(4524,)


In [9]:
vect = CountVectorizer()
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)

**Multinomial Naive Bayes Model**

In [10]:
nb = MultinomialNB()
%time nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_val_dtm)
from sklearn import metrics

CPU times: user 8.84 ms, sys: 2.91 ms, total: 11.7 ms
Wall time: 14.4 ms


In [11]:
y_pred_prob = nb.predict_proba(X_val_dtm)[:, 1]


**Initial Validation AUC score and accuracy score**

In [None]:
print('auc score: ', metrics.roc_auc_score(y_val, y_pred_prob))
print('accuracy score: ', metrics.accuracy_score(y_val, y_pred_class))

auc score:  0.8380485723843927
accuracy score:  0.8461198319699315


**Tuning with lowercase = false for countvectorizer** <br>
**Result: failed to increase validation accuracy** 

In [None]:
vect = CountVectorizer(lowercase=False)
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, nb.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, nb.predict(X_test_dtm)))


validation accuracy: 0.8416979880610215
training accuracy: 0.9165008475200825
test accuracy: 0.8541114058355438


**Tuning with stopword='english' for countvectorizer** <br>
**Result: failed to increase validation accuracy** 

In [None]:
vect = CountVectorizer(stop_words='english')
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, nb.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, nb.predict(X_test_dtm)))

validation accuracy: 0.8432456334291399
training accuracy: 0.9263026015181665
test accuracy: 0.8430592396109637


**Tuning with ngram(1,2) (1,3) (2,2) (2,3)** <br>
**Result: ngram(1,2) successfully increased validation accuracy by the highest margin** 

In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, nb.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, nb.predict(X_test_dtm)))

validation accuracy: 0.851204952465178
training accuracy: 0.9702999484118211
test accuracy: 0.8541114058355438


In [None]:
vect = CountVectorizer(ngram_range=(1,3))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, nb.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, nb.predict(X_test_dtm)))

validation accuracy: 0.8505416758788414
training accuracy: 0.9903456407988798
test accuracy: 0.8499115826702034


In [None]:
vect = CountVectorizer(ngram_range=(2,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, nb.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, nb.predict(X_test_dtm)))

validation accuracy: 0.8310855626796374
training accuracy: 0.9900508512049525
test accuracy: 0.8344385499557914


In [None]:
vect = CountVectorizer(ngram_range=(2,3))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, nb.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, nb.predict(X_test_dtm)))

validation accuracy: 0.8308644704841919
training accuracy: 0.994693787309308
test accuracy: 0.8320070733863837


**So we finalised the countvectorizer hyperparameters to be ngram(1,2), and the rest are default** <br>
**First, tune the hyperparameter of fit_prior = False** <br>
**Result: failed to increase validation accuracy, because the accuracy remained the same** 


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB(fit_prior=False)
nb.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, nb.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, nb.predict(X_test_dtm)))

validation accuracy: 0.851204952465178
training accuracy: 0.9691944874345936
test accuracy: 0.8552166224580018


**Secondly, tune the hyperparameter of alpha with varying values. ** <br>
**alpha is the smoothing parameter.** <br>
**Result: alpha = 0.33 gives the highest validation accuracy of 0.8536**

In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB(alpha=0.1)
nb.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, nb.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, nb.predict(X_test_dtm)))

validation accuracy: 0.848994030510723
training accuracy: 0.9922617731594074
test accuracy: 0.8428381962864722


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB(alpha = 0.2)
nb.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, nb.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, nb.predict(X_test_dtm)))

validation accuracy: 0.851204952465178
training accuracy: 0.9899771538064707
test accuracy: 0.8477011494252874


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB(alpha = 0.25)
nb.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, nb.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, nb.predict(X_test_dtm)))

validation accuracy: 0.85208932124696
training accuracy: 0.9890190876262068
test accuracy: 0.8488063660477454


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB(alpha = 0.3)
nb.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, nb.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, nb.predict(X_test_dtm)))

validation accuracy: 0.853415874419633
training accuracy: 0.988061021445943
test accuracy: 0.8505747126436781


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB(alpha = 0.33)
nb.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, nb.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, nb.predict(X_test_dtm)))

validation accuracy: 0.8536369666150785
training accuracy: 0.9873977448596064
test accuracy: 0.8521220159151194


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB(alpha = 0.34)
nb.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, nb.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, nb.predict(X_test_dtm)))

validation accuracy: 0.8527525978332965
training accuracy: 0.9870292578671973
test accuracy: 0.8521220159151194


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB(alpha = 0.35)
nb.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, nb.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, nb.predict(X_test_dtm)))

validation accuracy: 0.852531505637851
training accuracy: 0.9866607708747881
test accuracy: 0.8521220159151194


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB(alpha = 0.4)
nb.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, nb.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, nb.predict(X_test_dtm)))

validation accuracy: 0.852531505637851
training accuracy: 0.9851868229051515
test accuracy: 0.8538903625110522


**Finalised Model for NaiveBayes Classifier:** <br>
**For countvectorizer - ngram(1,2)**<br>
**For nb model - alpha = 0.33**<br>
**Rationale: based on highest validation accuracy**

In [13]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB(alpha = 0.33)
nb.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, nb.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, nb.predict(X_test_dtm)))
print('auc score: ', metrics.roc_auc_score(y_test, nb.predict(X_test_dtm)))
print('f1 score: ', metrics.f1_score(y_test, nb.predict(X_test_dtm)))

validation accuracy: 0.8536369666150785
training accuracy: 0.9873977448596064
test accuracy: 0.8521220159151194
auc score:  0.777458862389304
f1 score:  0.6842850401132609


**Training Accuracy: 0.9873977448596064**

**Validation Accuracy: 0.8536369666150785**

**Test Accuracy: 0.8521220159151194**

**AUC: 0.777458862389304**

**F1 Score: 0.6842850401132609**