# Data Exploration and Cleaning

In [1]:
import pandas as pd
# Load the dataset from a CSV file located in the notebook's working directory.
# NOTE(review): the loaded frame contains an 'Unnamed: 0' column — presumably an
# index written out by an earlier to_csv call; confirm before relying on it.
df = pd.read_csv('./final_data.csv')

In [2]:
# Discard every row that has at least one missing field, then display the
# number of nulls left in each column.
df = df.dropna(axis=0, how='any')
df.isna().sum()

Unnamed: 0    0
keyword       0
location      0
text          0
target        0
dtype: int64

In [3]:
# (rows, columns) of the frame after rows with missing values were dropped.
df.shape

(22616, 5)

In [4]:
# Preview the first rows to check the column layout and a few sample values.
df.head()

Unnamed: 0.1,Unnamed: 0,keyword,location,text,target
0,0,no_keyword,no_location,Our Deeds are the Reason of this earthquake Ma...,1
1,1,no_keyword,no_location,Forest fire near La Ronge Sask Canada,1
2,2,no_keyword,no_location,All residents asked to shelter in place are be...,1
3,3,no_keyword,no_location,people receive wildfires evacuation orders in ...,1
4,4,no_keyword,no_location,Just got sent this photo from Ruby Alaska as s...,1


# Train, Validation, Test Split

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.svm import SVC

# Model input is the raw text column; the label is the `target` column.
X = df['text']
y = df['target']

# First carve off 20% of the rows as the test split, then take 25% of the
# remaining 80% for validation: a 60/20/20 train/validation/test partition.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=1
)

In [6]:
# Report the size of every split: the three feature splits first, then the
# three matching label splits.
for split in (X_train, X_val, X_test, y_train, y_val, y_test):
    print(split.shape)

(13569,)
(4523,)
(4524,)
(13569,)
(4523,)
(4524,)


# Default CountVectorizer()

In [7]:
# Baseline: default bag-of-words counts feeding a default SVC.
# The vocabulary is learned from the training split only and then reused to
# encode the validation and test splits.
vect = CountVectorizer().fit(X_train)
X_train_dtm, X_val_dtm, X_test_dtm = (
    vect.transform(split) for split in (X_train, X_val, X_test)
)
svc = SVC().fit(X_train_dtm, y_train)

# Accuracy on each split, in the order validation / training / test.
for label, features, expected in (
    ('validation accuracy:', X_val_dtm, y_val),
    ('training accuracy:', X_train_dtm, y_train),
    ('test accuracy:', X_test_dtm, y_test),
):
    print(label, metrics.accuracy_score(expected, svc.predict(features)))

validation accuracy: 0.8432456334291399
training accuracy: 0.9372098164934778
test accuracy: 0.8437223695844386


### Tuning min_df and max_df

In [9]:
import numpy as np
# Sweep min_df given as a fraction of training documents (float values are
# interpreted by CountVectorizer as a proportion of documents).
# The grid is spelled out explicitly instead of using a float-step np.arange,
# whose length and values depend on floating-point rounding.
# Only the training and validation splits are vectorized here: the test split
# is not used for tuning, so encoding it on every iteration was wasted work.
for min_df in (0.0, 0.2, 0.4):
    vect = CountVectorizer(min_df=min_df)
    X_train_dtm = vect.fit_transform(X_train)
    X_val_dtm = vect.transform(X_val)
    svc = SVC()
    svc.fit(X_train_dtm, y_train)
    val_accuracy = metrics.accuracy_score(y_val, svc.predict(X_val_dtm))
    print('min_df:', min_df, 'validation accuracy:', val_accuracy)

min_df: 0.0 validation accuracy: 0.8432456334291399
min_df: 0.2 validation accuracy: 0.7441963298695556
min_df: 0.4 validation accuracy: 0.7426486845014371


In [10]:
# Sweep min_df given as an absolute document count (1 through 9).
# Only the training and validation splits are vectorized: the test split is
# not used for tuning, so encoding it on every iteration was wasted work.
for min_df in range(1, 10):
    vect = CountVectorizer(min_df=min_df)
    X_train_dtm = vect.fit_transform(X_train)
    X_val_dtm = vect.transform(X_val)
    svc = SVC()
    svc.fit(X_train_dtm, y_train)
    val_accuracy = metrics.accuracy_score(y_val, svc.predict(X_val_dtm))
    print('min_df:', min_df, 'validation accuracy:', val_accuracy)

min_df: 1 validation accuracy: 0.8432456334291399
min_df: 2 validation accuracy: 0.8416979880610215
min_df: 3 validation accuracy: 0.8423612646473579
min_df: 4 validation accuracy: 0.8432456334291399
min_df: 5 validation accuracy: 0.842803449038249
min_df: 6 validation accuracy: 0.841476895865576
min_df: 7 validation accuracy: 0.8425823568428035
min_df: 8 validation accuracy: 0.841476895865576
min_df: 9 validation accuracy: 0.8408136192792395


In [11]:
# Sweep max_df given as a fraction of training documents, from 1.0 down to 0.2.
# The grid is written out explicitly: stepping with np.arange(1.0, 0.0, -0.2)
# accumulates floating-point error and yields values such as
# 0.40000000000000013 instead of 0.4.
# Only the training and validation splits are vectorized: the test split is
# not used for tuning, so encoding it on every iteration was wasted work.
for max_df in (1.0, 0.8, 0.6, 0.4, 0.2):
    vect = CountVectorizer(max_df=max_df)
    X_train_dtm = vect.fit_transform(X_train)
    X_val_dtm = vect.transform(X_val)
    svc = SVC()
    svc.fit(X_train_dtm, y_train)
    val_accuracy = metrics.accuracy_score(y_val, svc.predict(X_val_dtm))
    print('max_df:', max_df, 'validation accuracy:', val_accuracy)

max_df: 1.0 validation accuracy: 0.8432456334291399
max_df: 0.8 validation accuracy: 0.8432456334291399
max_df: 0.6000000000000001 validation accuracy: 0.8432456334291399
max_df: 0.40000000000000013 validation accuracy: 0.8439089100154764
max_df: 0.20000000000000018 validation accuracy: 0.8439089100154764


The highest validation accuracy (0.8439) is reached with max_df=0.4 (max_df=0.2 gives the same score); tuning min_df did not improve validation accuracy.

# Lowercase = False

In [12]:
# Case-sensitive bag-of-words: tokens are kept as written instead of being
# lowercased before counting. Classifier stays a default SVC.
vect = CountVectorizer(lowercase=False).fit(X_train)
X_train_dtm, X_val_dtm, X_test_dtm = (
    vect.transform(split) for split in (X_train, X_val, X_test)
)
svc = SVC().fit(X_train_dtm, y_train)

# Accuracy on each split, in the order validation / training / test.
for label, features, expected in (
    ('validation accuracy:', X_val_dtm, y_val),
    ('training accuracy:', X_train_dtm, y_train),
    ('test accuracy:', X_test_dtm, y_test),
):
    print(label, metrics.accuracy_score(expected, svc.predict(features)))

validation accuracy: 0.8377183285430024
training accuracy: 0.9382415800722235
test accuracy: 0.8370910698496905


### Tuning min_df and max_df

In [13]:
# Sweep min_df (absolute document count, 1 through 9) for the case-sensitive
# CountVectorizer.
# Only the training and validation splits are vectorized: the test split is
# not used for tuning, so encoding it on every iteration was wasted work.
for min_df in range(1, 10):
    vect = CountVectorizer(lowercase=False, min_df=min_df)
    X_train_dtm = vect.fit_transform(X_train)
    X_val_dtm = vect.transform(X_val)
    svc = SVC()
    svc.fit(X_train_dtm, y_train)
    val_accuracy = metrics.accuracy_score(y_val, svc.predict(X_val_dtm))
    print('min_df:', min_df, 'validation accuracy:', val_accuracy)

min_df: 1 validation accuracy: 0.8377183285430024
min_df: 2 validation accuracy: 0.837055051956666
min_df: 3 validation accuracy: 0.8368339597612204
min_df: 4 validation accuracy: 0.8363917753703294
min_df: 5 validation accuracy: 0.8368339597612204
min_df: 6 validation accuracy: 0.8341808534158744
min_df: 7 validation accuracy: 0.8330753924386469
min_df: 8 validation accuracy: 0.8324121158523105
min_df: 9 validation accuracy: 0.8319699314614194


In [14]:
# Sweep max_df (fraction of training documents, 1.0 down to 0.2) for the
# case-sensitive CountVectorizer.
# The grid is written out explicitly: stepping with np.arange(1.0, 0.0, -0.2)
# accumulates floating-point error and yields values such as
# 0.40000000000000013 instead of 0.4.
# Only the training and validation splits are vectorized: the test split is
# not used for tuning, so encoding it on every iteration was wasted work.
for max_df in (1.0, 0.8, 0.6, 0.4, 0.2):
    vect = CountVectorizer(lowercase=False, max_df=max_df)
    X_train_dtm = vect.fit_transform(X_train)
    X_val_dtm = vect.transform(X_val)
    svc = SVC()
    svc.fit(X_train_dtm, y_train)
    val_accuracy = metrics.accuracy_score(y_val, svc.predict(X_val_dtm))
    print('max_df:', max_df, 'validation accuracy:', val_accuracy)

max_df: 1.0 validation accuracy: 0.8377183285430024
max_df: 0.8 validation accuracy: 0.8377183285430024
max_df: 0.6000000000000001 validation accuracy: 0.8377183285430024
max_df: 0.40000000000000013 validation accuracy: 0.8377183285430024
max_df: 0.20000000000000018 validation accuracy: 0.8368339597612204


Tuning min_df and max_df did not improve validation accuracy for CountVectorizer(lowercase=False). Taking CountVectorizer() and CountVectorizer(lowercase=False) together, tuning min_df and max_df does little to improve validation accuracy: even for CountVectorizer(), setting max_df=0.4 increases the validation accuracy by less than 0.1% (from 0.8432 to 0.8439). So far, the highest validation accuracy is achieved with CountVectorizer(max_df=0.4). From here on, if a configuration's validation accuracy is not higher than this, we will not tune min_df and max_df for it.

# stopword = english

In [15]:
# Bag-of-words with scikit-learn's built-in English stop-word list removed
# from the vocabulary. Classifier stays a default SVC.
vect = CountVectorizer(stop_words='english').fit(X_train)
X_train_dtm, X_val_dtm, X_test_dtm = (
    vect.transform(split) for split in (X_train, X_val, X_test)
)
svc = SVC().fit(X_train_dtm, y_train)

# Accuracy on each split, in the order validation / training / test.
for label, features, expected in (
    ('validation accuracy:', X_val_dtm, y_val),
    ('training accuracy:', X_train_dtm, y_train),
    ('test accuracy:', X_test_dtm, y_test),
):
    print(label, metrics.accuracy_score(expected, svc.predict(features)))

validation accuracy: 0.83882378952023
training accuracy: 0.9520966909868082
test accuracy: 0.8384173297966402


# Lowercase=False, stopwords=english

In [16]:
# Combine both variations: English stop words removed and original casing
# preserved. Classifier stays a default SVC.
vect = CountVectorizer(stop_words='english', lowercase=False).fit(X_train)
X_train_dtm, X_val_dtm, X_test_dtm = (
    vect.transform(split) for split in (X_train, X_val, X_test)
)
svc = SVC().fit(X_train_dtm, y_train)

# Accuracy on each split, in the order validation / training / test.
for label, features, expected in (
    ('validation accuracy:', X_val_dtm, y_val),
    ('training accuracy:', X_train_dtm, y_train),
    ('test accuracy:', X_test_dtm, y_test),
):
    print(label, metrics.accuracy_score(expected, svc.predict(features)))

validation accuracy: 0.8372761441521114
training accuracy: 0.9522440857837718
test accuracy: 0.8348806366047745


# ngram=(1,2)

In [17]:
# Count unigrams and bigrams (ngram_range=(1, 2)) instead of single words only.
# Classifier stays a default SVC.
vect = CountVectorizer(ngram_range=(1,2)).fit(X_train)
X_train_dtm, X_val_dtm, X_test_dtm = (
    vect.transform(split) for split in (X_train, X_val, X_test)
)
svc = SVC().fit(X_train_dtm, y_train)

# Accuracy on each split, in the order validation / training / test.
for label, features, expected in (
    ('validation accuracy:', X_val_dtm, y_val),
    ('training accuracy:', X_train_dtm, y_train),
    ('test accuracy:', X_test_dtm, y_test),
):
    print(label, metrics.accuracy_score(expected, svc.predict(features)))

validation accuracy: 0.8410347114746849
training accuracy: 0.9523914805807355
test accuracy: 0.8408488063660478


# Default TfidfVectorizer()

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Switch the text encoding from raw counts to default TF-IDF weights.
# The vocabulary and IDF weights are learned from the training split only.
vect = TfidfVectorizer().fit(X_train)
X_train_dtm, X_val_dtm, X_test_dtm = (
    vect.transform(split) for split in (X_train, X_val, X_test)
)
svc = SVC().fit(X_train_dtm, y_train)

# Accuracy on each split, in the order validation / training / test.
for label, features, expected in (
    ('validation accuracy:', X_val_dtm, y_val),
    ('training accuracy:', X_train_dtm, y_train),
    ('test accuracy:', X_test_dtm, y_test),
):
    print(label, metrics.accuracy_score(expected, svc.predict(features)))

validation accuracy: 0.851204952465178
training accuracy: 0.9691207900361117
test accuracy: 0.8516799292661361


### Tuning min_df and max_df

In [19]:
# Sweep min_df (absolute document count, 1 through 9) for TfidfVectorizer.
# Only the training and validation splits are vectorized: the test split is
# not used for tuning, so encoding it on every iteration was wasted work.
for min_df in range(1, 10):
    vect = TfidfVectorizer(min_df=min_df)
    X_train_dtm = vect.fit_transform(X_train)
    X_val_dtm = vect.transform(X_val)
    svc = SVC()
    svc.fit(X_train_dtm, y_train)
    val_accuracy = metrics.accuracy_score(y_val, svc.predict(X_val_dtm))
    print('min_df:', min_df, 'validation accuracy:', val_accuracy)

min_df: 1 validation accuracy: 0.851204952465178
min_df: 2 validation accuracy: 0.8518682290515145
min_df: 3 validation accuracy: 0.850320583683396
min_df: 4 validation accuracy: 0.849436214901614
min_df: 5 validation accuracy: 0.8485518461198319
min_df: 6 validation accuracy: 0.8456776475790405
min_df: 7 validation accuracy: 0.8476674773380499
min_df: 8 validation accuracy: 0.8485518461198319
min_df: 9 validation accuracy: 0.8485518461198319


In [20]:
# Sweep max_df (fraction of training documents, 1.0 down to 0.2) for
# TfidfVectorizer.
# The grid is written out explicitly: stepping with np.arange(1.0, 0.0, -0.2)
# accumulates floating-point error and yields values such as
# 0.40000000000000013 instead of 0.4.
# Only the training and validation splits are vectorized: the test split is
# not used for tuning, so encoding it on every iteration was wasted work.
for max_df in (1.0, 0.8, 0.6, 0.4, 0.2):
    vect = TfidfVectorizer(max_df=max_df)
    X_train_dtm = vect.fit_transform(X_train)
    X_val_dtm = vect.transform(X_val)
    svc = SVC()
    svc.fit(X_train_dtm, y_train)
    val_accuracy = metrics.accuracy_score(y_val, svc.predict(X_val_dtm))
    print('max_df:', max_df, 'validation accuracy:', val_accuracy)

max_df: 1.0 validation accuracy: 0.851204952465178
max_df: 0.8 validation accuracy: 0.851204952465178
max_df: 0.6000000000000001 validation accuracy: 0.851204952465178
max_df: 0.40000000000000013 validation accuracy: 0.850762768074287
max_df: 0.20000000000000018 validation accuracy: 0.8492151227061685


# lowercase=False

In [21]:
# TF-IDF features with original casing preserved (no lowercasing).
# Classifier stays a default SVC.
vect = TfidfVectorizer(lowercase=False).fit(X_train)
X_train_dtm, X_val_dtm, X_test_dtm = (
    vect.transform(split) for split in (X_train, X_val, X_test)
)
svc = SVC().fit(X_train_dtm, y_train)

# Accuracy on each split, in the order validation / training / test.
for label, features, expected in (
    ('validation accuracy:', X_val_dtm, y_val),
    ('training accuracy:', X_train_dtm, y_train),
    ('test accuracy:', X_test_dtm, y_test),
):
    print(label, metrics.accuracy_score(expected, svc.predict(features)))

validation accuracy: 0.8483307539243865
training accuracy: 0.9719212911784214
test accuracy: 0.8428381962864722


# stopwords=english

In [22]:
# TF-IDF features with scikit-learn's built-in English stop-word list removed.
# Classifier stays a default SVC.
vect = TfidfVectorizer(stop_words='english').fit(X_train)
X_train_dtm, X_val_dtm, X_test_dtm = (
    vect.transform(split) for split in (X_train, X_val, X_test)
)
svc = SVC().fit(X_train_dtm, y_train)

# Accuracy on each split, in the order validation / training / test.
for label, features, expected in (
    ('validation accuracy:', X_val_dtm, y_val),
    ('training accuracy:', X_train_dtm, y_train),
    ('test accuracy:', X_test_dtm, y_test),
):
    print(label, metrics.accuracy_score(expected, svc.predict(features)))

validation accuracy: 0.8432456334291399
training accuracy: 0.9708895275996757
test accuracy: 0.8448275862068966


# ngram=(1,2)

In [23]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)).
# Classifier stays a default SVC.
vect = TfidfVectorizer(ngram_range=(1,2)).fit(X_train)
X_train_dtm, X_val_dtm, X_test_dtm = (
    vect.transform(split) for split in (X_train, X_val, X_test)
)
svc = SVC().fit(X_train_dtm, y_train)

# Accuracy on each split, in the order validation / training / test.
for label, features, expected in (
    ('validation accuracy:', X_val_dtm, y_val),
    ('training accuracy:', X_train_dtm, y_train),
    ('test accuracy:', X_test_dtm, y_test),
):
    print(label, metrics.accuracy_score(expected, svc.predict(features)))

validation accuracy: 0.8461198319699315
training accuracy: 0.98231262436436
test accuracy: 0.8443854995579133


# Tuning SVC()

We will be using vect=TfidfVectorizer(), which gave us the highest validation accuracy in the tuning process (0.8512), apart from the marginally higher 0.8519 obtained with min_df=2 (a gain of under 0.1%).

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Rebuild the default TF-IDF features once; the SVC tuning cells that follow
# all reuse X_train_dtm, X_val_dtm and X_test_dtm from here.
vect = TfidfVectorizer().fit(X_train)
X_train_dtm, X_val_dtm, X_test_dtm = (
    vect.transform(split) for split in (X_train, X_val, X_test)
)

### Sigmoid kernel

In [25]:
# Replace the default kernel with a sigmoid kernel; features are unchanged.
svc = SVC(kernel='sigmoid').fit(X_train_dtm, y_train)

# Accuracy on each split, in the order validation / training / test.
for label, features, expected in (
    ('validation accuracy:', X_val_dtm, y_val),
    ('training accuracy:', X_train_dtm, y_train),
    ('test accuracy:', X_test_dtm, y_test),
):
    print(label, metrics.accuracy_score(expected, svc.predict(features)))

validation accuracy: 0.8467831085562679
training accuracy: 0.8974132213132876
test accuracy: 0.8474801061007957


### Regularization

In [26]:
# Stronger regularization than the default: C=0.1.
svc = SVC(C=0.1).fit(X_train_dtm, y_train)

# Accuracy on each split, in the order validation / training / test.
for label, features, expected in (
    ('validation accuracy:', X_val_dtm, y_val),
    ('training accuracy:', X_train_dtm, y_train),
    ('test accuracy:', X_test_dtm, y_test),
):
    print(label, metrics.accuracy_score(expected, svc.predict(features)))

validation accuracy: 0.7426486845014371
training accuracy: 0.7381531431940452
test accuracy: 0.7431476569407603


In [27]:
# Weaker regularization than the default: C=10.
svc = SVC(C=10).fit(X_train_dtm, y_train)

# Accuracy on each split, in the order validation / training / test.
for label, features, expected in (
    ('validation accuracy:', X_val_dtm, y_val),
    ('training accuracy:', X_train_dtm, y_train),
    ('test accuracy:', X_test_dtm, y_test),
):
    print(label, metrics.accuracy_score(expected, svc.predict(features)))

validation accuracy: 0.8562900729604245
training accuracy: 0.9991893286166998
test accuracy: 0.8510167992926614


In [28]:
# Coarse sweep of the SVC regularization parameter C over the integers 1..9,
# scored on the validation split.
for c_value in range(1, 10):
    svc = SVC(C=c_value).fit(X_train_dtm, y_train)
    val_accuracy = metrics.accuracy_score(y_val, svc.predict(X_val_dtm))
    print('regulariaztion:', c_value, 'validation accuracy:', val_accuracy)

regulariaztion: 1 validation accuracy: 0.851204952465178
regulariaztion: 2 validation accuracy: 0.857837718328543
regulariaztion: 3 validation accuracy: 0.8576166261330975
regulariaztion: 4 validation accuracy: 0.8562900729604245
regulariaztion: 5 validation accuracy: 0.85651116515587
regulariaztion: 6 validation accuracy: 0.8562900729604245
regulariaztion: 7 validation accuracy: 0.85651116515587
regulariaztion: 8 validation accuracy: 0.8567322573513155
regulariaztion: 9 validation accuracy: 0.8567322573513155


In [29]:
# Fine sweep of the SVC regularization parameter C over 1.0, 1.1, ..., 1.9,
# scored on the validation split.
# The grid is built from integers divided by 10 so every value is the exact
# decimal intended; stepping with np.arange(1.0, 2.0, 0.1) accumulates
# floating-point error and produces values such as 1.7000000000000006.
for c_value in (tenths / 10 for tenths in range(10, 20)):
    svc = SVC(C=c_value)
    svc.fit(X_train_dtm, y_train)
    val_accuracy = metrics.accuracy_score(y_val, svc.predict(X_val_dtm))
    print('regulariaztion:', c_value, 'validation accuracy:', val_accuracy)

regulariaztion: 1.0 validation accuracy: 0.851204952465178
regulariaztion: 1.1 validation accuracy: 0.8536369666150785
regulariaztion: 1.2000000000000002 validation accuracy: 0.855184611983197
regulariaztion: 1.3000000000000003 validation accuracy: 0.855184611983197
regulariaztion: 1.4000000000000004 validation accuracy: 0.8558478885695335
regulariaztion: 1.5000000000000004 validation accuracy: 0.857395533937652
regulariaztion: 1.6000000000000005 validation accuracy: 0.8585009949148795
regulariaztion: 1.7000000000000006 validation accuracy: 0.858722087110325
regulariaztion: 1.8000000000000007 validation accuracy: 0.857395533937652
regulariaztion: 1.9000000000000008 validation accuracy: 0.858279902719434


In [30]:
# Fine sweep of the SVC regularization parameter C over 0.1, 0.2, ..., 0.9,
# scored on the validation split.
# The grid is built from integers divided by 10 so every value is the exact
# decimal intended; stepping with np.arange(0.1, 1.0, 0.1) accumulates
# floating-point error and produces values such as 0.30000000000000004.
for c_value in (tenths / 10 for tenths in range(1, 10)):
    svc = SVC(C=c_value)
    svc.fit(X_train_dtm, y_train)
    val_accuracy = metrics.accuracy_score(y_val, svc.predict(X_val_dtm))
    print('regulariaztion:', c_value, 'validation accuracy:', val_accuracy)

regulariaztion: 0.1 validation accuracy: 0.7426486845014371
regulariaztion: 0.2 validation accuracy: 0.7548087552509396
regulariaztion: 0.30000000000000004 validation accuracy: 0.7879725845677648
regulariaztion: 0.4 validation accuracy: 0.8129560026531063
regulariaztion: 0.5 validation accuracy: 0.8260004421843908
regulariaztion: 0.6 validation accuracy: 0.8355074065885474
regulariaztion: 0.7000000000000001 validation accuracy: 0.8421401724519124
regulariaztion: 0.8 validation accuracy: 0.8470042007517135
regulariaztion: 0.9 validation accuracy: 0.8485518461198319


# Final Model 

In [31]:
# Final model: SVC with C=1.7, trained on the TfidfVectorizer() features.
svc = SVC(C=1.7).fit(X_train_dtm, y_train)

# Accuracy on each split, in the order validation / training / test.
for label, features, expected in (
    ('validation accuracy:', X_val_dtm, y_val),
    ('training accuracy:', X_train_dtm, y_train),
    ('test accuracy:', X_test_dtm, y_test),
):
    print(label, metrics.accuracy_score(expected, svc.predict(features)))

validation accuracy: 0.858722087110325
training accuracy: 0.9959466430834991
test accuracy: 0.8538903625110522


In [32]:
# F1 score of the current model on the test split. The predictions are kept
# in y_pred_class because the error-analysis cells reuse them.
y_pred_class = svc.predict(X_test_dtm)
metrics.f1_score(y_true=y_test, y_pred=y_pred_class)

0.6659929257200606

### Analyse FP and FN

In [33]:
# False positives: test rows whose true label is smaller than the predicted
# label (true 0, predicted 1 when labels are 0/1).
false_positive_mask = y_test.lt(y_pred_class)
X_test[false_positive_mask]

17635         donate the funds to the Australia wild fires
22252    MetropolitanPolice hunt man charged with sexua...
12462    We were called to food truck alight on Canvey ...
5066     Reddit Will Now Quarantine Offensive onlinecom...
3505                                            Im On Fire
                               ...                        
16349    BREAKING SUSPECTED IRANIAN SUICIDE BOMBER HEAD...
22227    katrinabhaydon Police said first words in the ...
22404    couple years ago in Ukraine there were well ov...
18098    RT drnickgreiner Were in headlong retreat and ...
20256    RT If you were silent when the same Myanmar mi...
Name: text, Length: 158, dtype: object

In [34]:
# Print a handful of false positives in full, looked up by index label.
# The trailing note on each label is the analyst's annotation for that text.
for row_label in (
    17635,  # fire
    5066,   # quarantine
    3505,   # fire
    18098,  # fire
    20256,  # Myanmar
):
    print(X_test.loc[row_label])

donate the funds to the Australia wild fires
Reddit Will Now Quarantine Offensive onlinecommunities reddit amageddon freespeech
Im On Fire
RT drnickgreiner Were in headlong retreat and the rearguard is getting annihilated by enthemy fire
RT If you were silent when the same Myanmar military committed Genocide against the Rohingya amp displaced over million of ththem under dthemocratically elected government then youre Hypocrite


In [35]:
# False negatives: test rows whose true label is larger than the predicted
# label (true 1, predicted 0 when labels are 0/1).
false_negative_mask = y_test.gt(y_pred_class)
X_test[false_negative_mask]

15432    What is that like times in as many months that...
14532    Hillary Clinton If Im President We Will Attack...
1830     GREAT CONDITION Easton Cyclone Softball Bat Fa...
13491    USA Given the vulnerabilities of PuertoRico el...
1541     xDescry was wrong to call it trusty actually c...
                               ...                        
5245     AHMazing story of the power animal rescuers ha...
1280     Bush Fires are scaryeven scarier when you go d...
2785     Believe it or not weve had too MUCH rain here ...
7608     Islamic Jihadist mob attacked Hindus looted am...
345      AP what violent country get the army involved ...
Name: text, Length: 503, dtype: object

In [36]:
# Print a handful of false negatives in full, looked up by index label.
# The trailing note, where present, is the analyst's annotation for that text.
for row_label in (
    15432,  # riot
    1830,   # mislabel
    13491,
    1541,
    5245,   # rescue, animal
    1280,
    2785,   # rain, drowning
    7608,
    345,
):
    print(X_test.loc[row_label])

What is that like times in as many months that massive rioting halted major repressive or austerity measure Leb
GREAT CONDITION Easton Cyclone Softball Bat Fastpitch
USA Given the vulnerabilities of PuertoRico electric grid after Hurricane Maria the EDF says there is an urge
xDescry was wrong to call it trusty actually considering it spontaneously collapsed on me thats not very trusty
AHMazing story of the power animal rescuers have starving homeless dog with no future was rescued by person
Bush Fires are scaryeven scarier when you go down and fight ththem
Believe it or not weve had too MUCH rain here Our newly planted maple trees are actually drowning
Islamic Jihadist mob attacked Hindus looted amp burnt Houses amp their property in Bhainsa town of Nirmal distTelangana la
AP what violent country get the army involved to help control the killings and bring back peace to the poor people


### Get AUC setting probability=True

In [37]:
# Refit the final model (C=1.7) with probability estimates enabled so that
# predict_proba is available for the AUC computation; random_state is fixed.
svc = SVC(C=1.7, probability=True, random_state=1).fit(X_train_dtm, y_train)

# Accuracy on each split, in the order validation / training / test.
for label, features, expected in (
    ('validation accuracy:', X_val_dtm, y_val),
    ('training accuracy:', X_train_dtm, y_train),
    ('test accuracy:', X_test_dtm, y_test),
):
    print(label, metrics.accuracy_score(expected, svc.predict(features)))

validation accuracy: 0.858722087110325
training accuracy: 0.9959466430834991
test accuracy: 0.8538903625110522


In [38]:
# ROC AUC on the test split, scored with the probability in column 1 of
# predict_proba (the second class in the classifier's class ordering).
class_probabilities = svc.predict_proba(X_test_dtm)
y_pred_prob = class_probabilities[:, 1]
metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob)

0.8696594826659405