In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from matplotlib import pyplot as plt
import seaborn as sns

from scipy.sparse import vstack, hstack
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.dummy import DummyClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

In [2]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
path = '/content/drive/My Drive/BT4222 Notebooks/final_data.csv'
df = pd.read_csv(path)

# Examine first five rows to check if the csv file is read properly into pandas
df.head(5)

Unnamed: 0.1,Unnamed: 0,keyword,location,text,target
0,0,no_keyword,no_location,Our Deeds are the Reason of this earthquake Ma...,1
1,1,no_keyword,no_location,Forest fire near La Ronge Sask Canada,1
2,2,no_keyword,no_location,All residents asked to shelter in place are be...,1
3,3,no_keyword,no_location,people receive wildfires evacuation orders in ...,1
4,4,no_keyword,no_location,Just got sent this photo from Ruby Alaska as s...,1


In [4]:
df.shape

(22624, 5)

In [5]:
df = df.dropna()

In [6]:
df.shape

(22616, 5)

In [7]:
X = df.text
y = df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)

In [8]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(13569,)
(4523,)
(4524,)
(13569,)
(4523,)
(4524,)


In [9]:
vect = CountVectorizer()
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)

**Logistic Regression Model- set max_iter to 700 since default value is insufficient**

In [10]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter = 700)
%time logreg.fit(X_train_dtm, y_train)
y_pred_class = logreg.predict(X_val_dtm)
y_pred_prob = logreg.predict_proba(X_val_dtm)[:, 1]

CPU times: user 1.51 s, sys: 2.23 s, total: 3.74 s
Wall time: 1.97 s


In [11]:
print('auc score: ', metrics.roc_auc_score(y_val, y_pred_prob))
print('accuracy score: ', metrics.accuracy_score(y_val, y_pred_class))

auc score:  0.8563847037604262
accuracy score:  0.843687817820031


**Base Validation Accuracy: 0.843687817820031**

**Tuning with lowercase = false for countvectorizer** <br>
**Result: failed to increase validation accuracy** 

In [12]:
vect = CountVectorizer(lowercase=False)
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.8423612646473579
training accuracy: 0.9728793573586852
test accuracy: 0.8463748894783377


**Tuning with stopword='english' for countvectorizer** <br>
**Result: failed to increase validation accuracy** 

In [None]:
vect = CountVectorizer(stop_words='english')
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.8366128675657749
training accuracy: 0.9607192866091827
test accuracy: 0.8390804597701149


**Tuning with ngram(1,2) (1,3) (2,2) (2,3)** <br>
**Result: ngram(1,2) successfully increased validation accuracy by the highest margin** 

In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.8500994914879505
training accuracy: 0.996904709263763
test accuracy: 0.8465959328028294


In [None]:
vect = CountVectorizer(ngram_range=(1,3))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.849436214901614
training accuracy: 0.9980838676394723
test accuracy: 0.8448275862068966


In [None]:
vect = CountVectorizer(ngram_range=(2,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.8173778465620164
training accuracy: 0.993367234136635
test accuracy: 0.8176392572944297


In [None]:
vect = CountVectorizer(ngram_range=(2,3))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.8085341587441963
training accuracy: 0.9966099196698357
test accuracy: 0.8101237842617153


**So we finalised the countvectorizer hyperparameters to be ngram(1,2), and the rest are default** <br>


**For the logreg model, first we tune the hyperparameter penalty = 'none'** <br>
**Result: failed to increase validation accuracy** <br>
**We can observe that the training accuracy reached 99.93%, which suggests that the model is severely overfitting the training data**

In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, penalty='none')
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.839708158302012
training accuracy: 0.9992630260151817
test accuracy: 0.8368700265251989


**Next, we can try to tune the tol parameter with varying values, since it is the tolerance for stopping criteria of the model** <br>
**Result: different values of tol failed to increase validation accuracy**


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, tol=1e-3)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.8500994914879505
training accuracy: 0.996904709263763
test accuracy: 0.8465959328028294


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, tol=1e-2)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.8500994914879505
training accuracy: 0.996904709263763
test accuracy: 0.8465959328028294


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, tol=1e-1)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.8500994914879505
training accuracy: 0.996904709263763
test accuracy: 0.8465959328028294


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, tol=2e-1)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.8500994914879505
training accuracy: 0.996904709263763
test accuracy: 0.8465959328028294


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, tol=5e-1)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.8500994914879505
training accuracy: 0.996904709263763
test accuracy: 0.846816976127321


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, tol=1e-5)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.8500994914879505
training accuracy: 0.996904709263763
test accuracy: 0.8465959328028294


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, tol=1e-6)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.8500994914879505
training accuracy: 0.996904709263763
test accuracy: 0.8465959328028294


**Now, we can tune the C parameter, which is the inverse of regularization strength of our model.** <br>
**Smaller values specify stronger regularization.**

**Results: values of C = 0.9, 0.87, 0.83 yields us the highest validation accuracy**<br>
**However, we hope for the model to have higher regularization strength, therefore we should choose C=0.83**<br>
**New validation accuracy: 0.851204952465178**

In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=1.2)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.849436214901614
training accuracy: 0.9974942884516177
test accuracy: 0.8465959328028294


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=1.1)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.849878399292505
training accuracy: 0.9972731962561722
test accuracy: 0.8470380194518126


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.95)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.8505416758788414
training accuracy: 0.9966099196698357
test accuracy: 0.846816976127321


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.9)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.851204952465178
training accuracy: 0.996020340481981
test accuracy: 0.846816976127321


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.87)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.851204952465178
training accuracy: 0.9957255508880537
test accuracy: 0.8465959328028294


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.83)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.851204952465178
training accuracy: 0.9955044586926082
test accuracy: 0.8470380194518126


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.8)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.8509838602697325
training accuracy: 0.9954307612941263
test accuracy: 0.8470380194518126


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.7)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.850762768074287
training accuracy: 0.9940305107229714
test accuracy: 0.8474801061007957


**Now, we can tune fit_intercept to False to determine if a constant should be added into the decision function of our model**<br>
**Result: failed to increase validation accuracy**

In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.83, fit_intercept=False)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.848994030510723
training accuracy: 0.9969784066622448
test accuracy: 0.8428381962864722


**We can tune the parameter of intercept_scaling, to see if we need to lessen the effect of regularization **<br>
**Result: various values of scaling failed to increase validation accuracy**

In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.83, intercept_scaling=1.1)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.851204952465178
training accuracy: 0.9955044586926082
test accuracy: 0.8470380194518126


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.83, intercept_scaling=1.2)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.851204952465178
training accuracy: 0.9955044586926082
test accuracy: 0.8470380194518126


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.83, intercept_scaling=1.9)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.851204952465178
training accuracy: 0.9955044586926082
test accuracy: 0.8470380194518126


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.83, intercept_scaling=2.5)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.851204952465178
training accuracy: 0.9955044586926082
test accuracy: 0.8470380194518126


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.83, intercept_scaling=5)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.851204952465178
training accuracy: 0.9955044586926082
test accuracy: 0.8470380194518126


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.83, intercept_scaling=12)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.851204952465178
training accuracy: 0.9955044586926082
test accuracy: 0.8470380194518126


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.83, intercept_scaling=0.6)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.851204952465178
training accuracy: 0.9955044586926082
test accuracy: 0.8470380194518126


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.83, intercept_scaling=0.2)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))

validation accuracy: 0.851204952465178
training accuracy: 0.9955044586926082
test accuracy: 0.8470380194518126


**Now we tune the hyperparameter of class_weight, to see if the model should have a more balanced mode.**<br>
**Result: failed to increase validation accuracy**


In [None]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.83, class_weight='balanced')
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))


validation accuracy: 0.8461198319699315
training accuracy: 0.9966836170683175
test accuracy: 0.8439434129089302


**Therefore, the finalised model is:**<br>
**countvectorizer with ngram(1,2) and logistic regression model with c=0.83**<br>
**Criterion to choose is based on the highest validation accuracy**

**Training Accuracy: 0.9955044586926082**<br>
**Validation Accuracy: 0.851204952465178** <br>
**Testing Accuracy: 0.8470380194518126** <br>
**AUC: 0.7495415502410765** <br>
**F1: 0.6483739837398373**

In [15]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)
logreg = LogisticRegression(max_iter = 700, C=0.83)
logreg.fit(X_train_dtm, y_train)
print('validation accuracy:', metrics.accuracy_score(y_val, logreg.predict(X_val_dtm)))
print('training accuracy:', metrics.accuracy_score(y_train, logreg.predict(X_train_dtm)))
print('test accuracy:', metrics.accuracy_score(y_test, logreg.predict(X_test_dtm)))
print('auc score: ', metrics.roc_auc_score(y_test, logreg.predict(X_test_dtm)))
print('f1 score: ', metrics.f1_score(y_test, logreg.predict(X_test_dtm)))



validation accuracy: 0.851204952465178
training accuracy: 0.9955044586926082
test accuracy: 0.8470380194518126
auc score:  0.7495415502410765
f1 score:  0.6483739837398373


In [None]:
# false positive
pd.set_option('display.max_colwidth', -1)

print(X_test[logreg.predict(X_test_dtm) > y_test].sample(10))
# has some disaster related words but the context is different, discussion with friends,
# not obvious in story books and science books
# some negative meaning phrases in front, e.g. protect from ...
# disasters used to describe people but not quite informative about disasters msgs, e.g. people reporting ..., charges on ...
# ... years past the disasters

8592     Dont forget my friend When there is crisis in India RSS always comes for help Bridge collapse floods                                                                                                                                                                         
11691    Injury Units treat broken bones sprains burns and other nonlife threatening injuries Charges have recently been reduced an                                                                                                                                                   
12211    Road Fatalities An Emerging Public Health Crisis In India                                                                                                                                                                                                                    
14713    HongKong reporters covering Wuhan pneumonia outbreak were briefly detained and questioned by Chinese authorities on Tuesda                                

  


In [None]:
# false negative
pd.set_option('display.max_colwidth', -1)

# msgs with implicit rhetoric skills, no explict disaster-related words. long sents, fewer disatster-related words,
# weights not that salient to predict positive
print(X_test[logreg.predict(X_test_dtm) < y_test].sample(15))

21076    RT KenyaSafi As the rainy season approaches there is need to fix Nairobis broken drainage systthem to end floodingPedestrians have been splashed and soaked during the rainy seasons due to poor drainage systthem Proper waste disposal is need since it clogs the drainages KenyaSafi
10835    Government set to revise total number of hectares destroyed during bushfire season to million after millions of hectar                                                                                                                                                                 
1104     sir just only wanted to make point about sureshpprabhu you made and said he is lying about bridge collapse                                                                                                                                                                             
6894     Australia News RT janeenorman High probability aircraft wreckage is from according to Deputy Prime                          

  


In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, logreg.predict(X_test_dtm))

array([[3194,  168],
       [ 524,  638]])