In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [3]:
from sklearn import model_selection, preprocessing, decomposition
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import f1_score

#####

import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix
from keras.layers.normalization import BatchNormalization


from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

In [4]:
%config Completer.use_jedi = False

Let's first import our training data

In [5]:
tweets_train = pd.read_csv("train.csv")
tweets_test = pd.read_csv("test.csv")

In [6]:
tweets_train.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


Let's look at an example of each kind of tweets, as 1 means disaster and 0 means no disaster

In [7]:
print("No disaster tweet: ",tweets_train[tweets_train.target == 0].text.values[0])
print("Disaster tweet: ",tweets_train[tweets_train.target == 1].text.values[0])

No disaster tweet:  What's up man?
Disaster tweet:  Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all


In [8]:
tweets_train.dropna()

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...,...,...
7575,10826,wrecked,TN,On the bright side I wrecked http://t.co/uEa0t...,0
7577,10829,wrecked,#NewcastleuponTyne #UK,@widda16 ... He's gone. You can relax. I thoug...,0
7579,10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0
7580,10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0


In [9]:
tweets_train.shape

(7613, 5)

In [10]:
tweets_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [11]:
print("Number of tweets related to a disaster: ",tweets_train[tweets_train.target == 1].target.count())
print("Number of tweets non-related to a disaster: ",tweets_train[tweets_train.target == 0].target.count())

Number of tweets related to a disaster:  3271
Number of tweets non-related to a disaster:  4342


We have a total of 7613 tweets with an Id, the text of the tweets and the target variable (1 or 0). 7552/7613 involve key words and 5080/7613 involve a location for the tweet. We have also seen that 3271 tweets in our data set are related to a disaster while 4342 are not.

# TF-IDF

We are only going to work on the text(x) and target(y) variables. <br/>
First we are going to use TF-IDF.

In [12]:
#Data splitting
x_train, x_valid, y_train, y_valid = train_test_split(tweets_train.text.values,tweets_train.target.values , 
                                                  stratify=tweets_train.target.values, 
                                                  random_state=29, 
                                                  test_size=0.2, shuffle=True)

In [13]:
print (x_train.shape)
print (x_valid.shape)

(6090,)
(1523,)


Here, we are going to instanciate the TF-IDF vectorizer which will be later applied to our data. We are going to use those features, they work almost everytime.

In [14]:
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')


Now that our vectorizer is instanciated, we can fit TF-IDF to both our train and test data.


In [15]:
tfv.fit(list(x_train)+list(x_valid))

tfv_x_train = tfv.transform(x_train)
tfv_x_valid = tfv.transform(x_valid)

Our data is ready we are going to apply the following models:
* Logistic Regression
* Ridge Classifier
* Support Vector Machine
* Naive Bayes <br/>


The evaluation metric we are going to use is the **f1 score**.

PS: We are not going to focus on the hyperparameters.

## Logistic Regression

In [16]:
lr = LogisticRegression(C=1)
lr.fit(tfv_x_train,y_train)
predictions = lr.predict(tfv_x_valid)
f1_score(predictions,y_valid)

0.7392795883361921

## Ridge Classifier

In [17]:
rc = RidgeClassifier()
rc.fit(tfv_x_train,y_train)
predictions = rc.predict(tfv_x_valid)
f1_score(predictions,y_valid)

0.7457072771872445

## Support Vector Machine (SVM)

SVMs take a lot of time so it would be better to reduce the dimensionality of our data using decomposition.TruncatedSVD

In [18]:
svd = decomposition.TruncatedSVD(n_components=200)
svd.fit(tfv_x_train)
xtrain_svd = svd.transform(tfv_x_train)
xvalid_svd = svd.transform(tfv_x_valid)


Then, we will have to standardize our data before passing it into the SVM model since SVMs are influenced by the scale of the data, so having our data with a mean of 0 and a variance of 1 would remove all bias.

In [19]:
scaler = preprocessing.StandardScaler()
scaler.fit(xtrain_svd)
x_train_scaled = scaler.transform(xtrain_svd)
x_valid_scaled = scaler.transform(xvalid_svd)


In [20]:
svm = SVC()
svm.fit(x_train_scaled,y_train)
predictions = svm.predict(x_valid_scaled)
f1_score(predictions,y_valid)

0.7114210985178727

## Naive Bayes

In [21]:
nb = MultinomialNB()
nb.fit(tfv_x_train,y_train)
predictions = nb.predict(tfv_x_valid)
f1_score(predictions,y_valid)

0.7099099099099099

# Count Vectorizer<br/>
Now that we are done with the TF-IDF method, we are going to do the same thing with the Count Vectorizer instead.

In [22]:
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

In [23]:
ctv.fit(list(x_train)+list(x_valid))

ctv_x_train = ctv.transform(x_train)
ctv_x_valid = ctv.transform(x_valid)

In [24]:
lr.fit(ctv_x_train,y_train)
predictions = lr.predict(ctv_x_valid)
f1_score(predictions,y_valid)

0.7293103448275862

In [25]:
rc.fit(ctv_x_train,y_train)
predictions = rc.predict(ctv_x_valid)
f1_score(predictions,y_valid)

0.7271171941830625

In [26]:
nb.fit(ctv_x_train,y_train)
predictions = nb.predict(ctv_x_valid)
f1_score(predictions,y_valid)

0.7571318427139553

In [27]:
svd = decomposition.TruncatedSVD(n_components=200)
svd.fit(ctv_x_train)
xtrain_svd = svd.transform(ctv_x_train)
xvalid_svd = svd.transform(ctv_x_valid)

In [28]:
scaler.fit(xtrain_svd)
x_train_scaled = scaler.transform(xtrain_svd)
x_valid_scaled = scaler.transform(xvalid_svd)


In [29]:
svm.fit(x_train_scaled,y_train)
predictions = svm.predict(x_valid_scaled)
f1_score(predictions,y_valid)

0.6685446009389672

# Deep Learning

In [30]:
max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(x_train) # only fit on train

In [31]:
x_train_tkn = tokenize.texts_to_matrix(x_train)
x_valid_tkn = tokenize.texts_to_matrix(x_valid)

In [32]:
num_classes = np.max(y_train) + 1
y_train_tkn = utils.to_categorical(y_train, num_classes)
y_valid_tkn = utils.to_categorical(y_valid, num_classes)

In [33]:
batch_size = 32
epochs = 2

# Build the model
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [34]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [35]:
history = model.fit(x_train_tkn, y_train_tkn,
                    batch_size=batch_size,
                    epochs=5,
                    verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [36]:
score = model.evaluate(x_valid_tkn, y_valid_tkn,
                       batch_size=batch_size, verbose=1)
print('Test accuracy:', score[1])

Test accuracy: 0.8069599270820618


In [39]:
predictions = model.predict_classes(x_valid_tkn)
f1_score(predictions,y_valid)



0.7621359223300971