In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn import model_selection, preprocessing, decomposition
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import f1_score

#####

import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix
from keras.layers.normalization import BatchNormalization


from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

In [None]:
# IPython setting: turn off the Jedi backend of the tab completer.
%config Completer.use_jedi = False

Let's first load our training and test data.

In [None]:
# Read the training and test CSV files (paths are relative to the working directory).
train_csv, test_csv = "train.csv", "test.csv"
tweets_train = pd.read_csv(train_csv)
tweets_test = pd.read_csv(test_csv)

In [None]:
# Preview the first three rows of the training frame.
tweets_train.head(n=3)

Let's look at one example of each kind of tweet. A target of 1 means the tweet is about a disaster and 0 means it is not.

In [None]:
# Show the first tweet of each class (target 0 = no disaster, target 1 = disaster).
for label, caption in ((0, "No disaster tweet: "), (1, "Disaster tweet: ")):
    first_text = tweets_train.loc[tweets_train.target == label, "text"].iloc[0]
    print(caption, first_text)

In [None]:
# dropna() returns a new DataFrame without the rows that contain missing values.
# The result is not assigned, so tweets_train itself is left unchanged and this
# cell only displays the NaN-free view.
tweets_train.dropna()

In [None]:
# (number of rows, number of columns) of the training frame.
tweets_train.shape

In [None]:
# Print column names, non-null counts and dtypes of the training frame.
tweets_train.info()

In [None]:
# Count how many training tweets fall into each target class.
disaster_count = (tweets_train.target == 1).sum()
non_disaster_count = (tweets_train.target == 0).sum()
print("Number of tweets related to a disaster: ", disaster_count)
print("Number of tweets non-related to a disaster: ", non_disaster_count)

We have a total of 7613 tweets with an Id, the text of the tweets and the target variable (1 or 0). 7552/7613 involve key words and 5080/7613 involve a location for the tweet. We have also seen that 3271 tweets in our data set are related to a disaster while 4342 are not.

# TF-IDF

We are only going to work on the text(x) and target(y) variables. <br/>
First we are going to use TF-IDF.

In [None]:
# Data splitting: hold out 20% of the labelled tweets for validation,
# stratified on the target so both splits keep the same class balance.
tweet_texts = tweets_train.text.values
tweet_labels = tweets_train.target.values
x_train, x_valid, y_train, y_valid = train_test_split(
    tweet_texts,
    tweet_labels,
    test_size=0.2,
    shuffle=True,
    stratify=tweet_labels,
    random_state=29,
)

In [None]:
# Report the size of the training split, then of the validation split.
for split in (x_train, x_valid):
    print(split.shape)

Here, we instantiate the TF-IDF vectorizer that will later be applied to our data. We use the parameter values below because they are a solid default for most text-classification problems.

In [None]:
# TF-IDF over word 1- to 3-grams, with English stop words removed and accents
# stripped. The three IDF/TF switches are boolean parameters: they are passed as
# True rather than the integer 1 because newer scikit-learn releases validate
# parameter types and no longer accept integers here.
tfv = TfidfVectorizer(
    min_df=3,                 # ignore terms seen in fewer than 3 documents
    max_features=None,        # no cap on vocabulary size
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,        # use 1 + log(tf) instead of raw term frequency
    stop_words='english',
)


Now that our vectorizer is instantiated, we can fit TF-IDF on the combined training and validation texts and then transform each split.


In [None]:
# Learn the vocabulary and IDF weights from the training and validation texts
# together (no labels are involved), then vectorise each split separately.
fit_corpus = [*x_train, *x_valid]
tfv.fit(fit_corpus)

tfv_x_train, tfv_x_valid = tfv.transform(x_train), tfv.transform(x_valid)

Our data is ready, so we are going to apply the following models:
* Logistic Regression
* Ridge Classifier
* Support Vector Machine
* Naive Bayes <br/>


The evaluation metric we are going to use is the **f1 score**.

PS: We are not going to focus on the hyperparameters.

## Logistic Regression

In [None]:
# Fit a logistic regression on the TF-IDF features and score it on the validation split.
lr = LogisticRegression(C=1)
lr.fit(tfv_x_train, y_train)
predictions = lr.predict(tfv_x_valid)
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
f1_score(y_valid, predictions)

## Ridge Classifier

In [None]:
# Fit a ridge classifier on the TF-IDF features and score it on the validation split.
rc = RidgeClassifier()
rc.fit(tfv_x_train, y_train)
predictions = rc.predict(tfv_x_valid)
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
f1_score(y_valid, predictions)

## Support Vector Machine (SVM)

SVMs take a lot of time so it would be better to reduce the dimensionality of our data using decomposition.TruncatedSVD

In [None]:
# Reduce the TF-IDF features to 200 components before training the SVM.
# random_state pins the randomized solver so that re-running the notebook
# produces the same components; 29 matches the seed used for the data split.
svd = decomposition.TruncatedSVD(n_components=200, random_state=29)
svd.fit(tfv_x_train)  # fitted on the training split only
xtrain_svd = svd.transform(tfv_x_train)
xvalid_svd = svd.transform(tfv_x_valid)


Then we standardize the data before passing it to the SVM. SVMs are sensitive to the scale of the features, so giving every feature a mean of 0 and a variance of 1 prevents features with large magnitudes from dominating the model.

In [None]:
# Standardise the SVD components: the mean and variance are estimated on the
# training split and then applied unchanged to both splits.
scaler = preprocessing.StandardScaler().fit(xtrain_svd)
x_train_scaled, x_valid_scaled = (
    scaler.transform(xtrain_svd),
    scaler.transform(xvalid_svd),
)


In [None]:
# Fit an SVM (default SVC settings) on the scaled SVD features and score it on
# the validation split.
svm = SVC()
svm.fit(x_train_scaled, y_train)
predictions = svm.predict(x_valid_scaled)
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
f1_score(y_valid, predictions)

## Naive Bayes

In [None]:
# Fit a multinomial naive Bayes model on the TF-IDF features and score it on
# the validation split.
nb = MultinomialNB()
nb.fit(tfv_x_train, y_train)
predictions = nb.predict(tfv_x_valid)
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
f1_score(y_valid, predictions)

# Count Vectorizer<br/>
Now that we are done with the TF-IDF method, we are going to do the same thing with the Count Vectorizer instead.

In [None]:
# Raw term counts over word 1- to 3-grams, with English stop words removed.
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

In [None]:
# Build the count vocabulary from the training and validation texts together
# (no labels are involved), then vectorise each split separately.
fit_corpus = [*x_train, *x_valid]
ctv.fit(fit_corpus)

ctv_x_train, ctv_x_valid = ctv.transform(x_train), ctv.transform(x_valid)

In [None]:
# Refit the logistic regression on the count features and score it on the validation split.
lr.fit(ctv_x_train, y_train)
predictions = lr.predict(ctv_x_valid)
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
f1_score(y_valid, predictions)

In [None]:
# Refit the ridge classifier on the count features and score it on the validation split.
rc.fit(ctv_x_train, y_train)
predictions = rc.predict(ctv_x_valid)
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
f1_score(y_valid, predictions)

In [None]:
# Refit the naive Bayes model on the count features and score it on the validation split.
nb.fit(ctv_x_train, y_train)
predictions = nb.predict(ctv_x_valid)
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
f1_score(y_valid, predictions)

In [None]:
# Reduce the count features to 200 components before training the SVM.
# random_state pins the randomized solver so that re-running the notebook
# produces the same components; 29 matches the seed used for the data split.
svd = decomposition.TruncatedSVD(n_components=200, random_state=29)
svd.fit(ctv_x_train)  # fitted on the training split only
xtrain_svd = svd.transform(ctv_x_train)
xvalid_svd = svd.transform(ctv_x_valid)

In [None]:
# Re-estimate the scaling statistics on the new training components and apply
# them to both splits.
x_train_scaled = scaler.fit(xtrain_svd).transform(xtrain_svd)
x_valid_scaled = scaler.transform(xvalid_svd)


In [None]:
# Refit the SVM on the scaled count-based components and score it on the validation split.
svm.fit(x_train_scaled, y_train)
predictions = svm.predict(x_valid_scaled)
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
f1_score(y_valid, predictions)

# Deep Learning

In [None]:
# Word-level Keras tokenizer whose vocabulary is capped by max_words.
max_words = 1000
tokenize = text.Tokenizer(char_level=False, num_words=max_words)
# The word index is learned from the training split only.
tokenize.fit_on_texts(x_train)

In [None]:
# Turn each tweet into a fixed-width row vector using the fitted tokenizer.
x_train_tkn, x_valid_tkn = (
    tokenize.texts_to_matrix(split) for split in (x_train, x_valid)
)

In [None]:
# One-hot encode the labels; the class count is the largest training label plus one.
num_classes = y_train.max() + 1
y_train_tkn, y_valid_tkn = (
    utils.to_categorical(labels, num_classes) for labels in (y_train, y_valid)
)

In [None]:
batch_size = 32
# NOTE: the training cell passes epochs=5 explicitly, so this value is not the
# one model.fit() actually receives.
epochs = 2

# Build the model: two hidden Dense(300) blocks, each followed by dropout and
# batch normalisation, then a softmax output with one unit per class.
# The Sequential container has to be created before layers can be added.
model = Sequential()

# The input width must equal the number of columns of the matrix produced by
# tokenize.texts_to_matrix, so it is read from the data instead of hard-coded.
model.add(Dense(300, input_dim=x_train_tkn.shape[1], activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(300, activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())

model.add(Dense(num_classes))
model.add(Activation('softmax'))

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

model.summary()

In [None]:
# (Re)compile with accuracy tracked in addition to the loss.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train on the tokenised training split for 5 epochs; the returned History
# object is kept in `history`.
fit_options = dict(batch_size=batch_size, epochs=5, verbose=1)
history = model.fit(x_train_tkn, y_train_tkn, **fit_options)

In [None]:
# Evaluate on the held-out validation split; score[1] is the accuracy metric
# the model was compiled with (score[0] is the loss).
score = model.evaluate(x_valid_tkn, y_valid_tkn,
                       batch_size=batch_size, verbose=1)
# The data scored here is the validation split, so label it as such.
print('Validation accuracy:', score[1])

In [None]:
# Sequential.predict_classes was removed from recent Keras/TensorFlow releases.
# Taking the argmax over the softmax outputs of predict() gives the same class indices.
class_probabilities = model.predict(x_valid_tkn)
predictions = np.argmax(class_probabilities, axis=-1)
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
f1_score(y_valid, predictions)