# Let's Import Our Libraries

In [1]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

# Plot
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import matplotlib as plt

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

#Sklearn
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Other
import re
import string
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from scipy import sparse
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


# Let's start by exploring the data.

In [2]:
df = pd.read_csv("socialmedia-disaster-tweets-DFE.csv", encoding='latin-1')

In [3]:
df.shape

(10876, 13)

In [4]:
df.head(5)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,choose_one,choose_one:confidence,choose_one_gold,keyword,location,text,tweetid,userid
0,778243823,True,golden,156,,Relevant,1.0,Relevant,,,Just happened a terrible car crash,1.0,
1,778243824,True,golden,152,,Relevant,1.0,Relevant,,,Our Deeds are the Reason of this #earthquake M...,13.0,
2,778243825,True,golden,137,,Relevant,1.0,Relevant,,,"Heard about #earthquake is different cities, s...",14.0,
3,778243826,True,golden,136,,Relevant,0.9603,Relevant,,,"there is a forest fire at spot pond, geese are...",15.0,
4,778243827,True,golden,138,,Relevant,1.0,Relevant,,,Forest fire near La Ronge Sask. Canada,16.0,


In [5]:
df.columns

Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at', 'choose_one', 'choose_one:confidence',
       'choose_one_gold', 'keyword', 'location', 'text', 'tweetid', 'userid'],
      dtype='object')

The 'tweet' column has tweets and 'choose_one' has the classification.

Let's determine the number of unique classifications.

In [6]:
df.choose_one.unique()

array(['Relevant', 'Not Relevant', "Can't Decide"], dtype=object)

# Let's work on cleaning up the data

In [7]:
df = df[["text", "choose_one"]]
df["choose_one"] = df.choose_one.replace({"Relevant": 1, "Not Relevant": 0})
df.rename(columns={"choose_one":"label"}, inplace=True)

In [8]:
df.label=pd.to_numeric(df.label, errors='coerce')
df.dropna(inplace=True)

Let's check and see how the data looks.

In [9]:
df.label.unique()

array([1., 0.])

In [10]:
df.head(5)

Unnamed: 0,text,label
0,Just happened a terrible car crash,1.0
1,Our Deeds are the Reason of this #earthquake M...,1.0
2,"Heard about #earthquake is different cities, s...",1.0
3,"there is a forest fire at spot pond, geese are...",1.0
4,Forest fire near La Ronge Sask. Canada,1.0


In [11]:
df["text"] = df["text"].str.replace(r"http\S+|http|@\S+|at", "")
df["text"] = df["text"].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
df["text"] = df["text"].str.lower()

In [12]:
df.head(5)

Unnamed: 0,text,label
0,just happened a terrible car crash,1.0
1,our deeds are the reason of this earthquake m...,1.0
2,"heard about earthquake is different cities, s...",1.0
3,"there is a forest fire spot pond, geese are f...",1.0
4,forest fire near la ronge sask canada,1.0


In [13]:
df.columns

Index(['text', 'label'], dtype='object')

# Let's Tokenzie: We'll Turn our Sentences into Lists of Words

In [14]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
df["tokens"] = df["text"].apply(tokenizer.tokenize)

Tokens will give us more insight into the data

In [15]:
all_words = [word for tokens in df["tokens"] for word in tokens]
sentence_lengths = [len(tokens) for tokens in df["tokens"]]
vocabulary = sorted(set(all_words))

In [16]:
print("%s words total, with a vocabulary size of %s." % (len(all_words), len(vocabulary)))

153824 words total, with a vocabulary size of 18078.


In [17]:
print("Max sentence length is %s." % max(sentence_lengths))

Max sentence length is 34.


# Let's Embed: Turning Words into Numbers

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

### TFIDF Tokenizer for Our Classical ML Models

In [19]:
text = df["text"].tolist()
labels = df["label"].tolist()
X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.2,random_state=40)

In [20]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [21]:
X_test_tfidf = tfidf_vectorizer.transform(X_test)
X_test_tfidf.shape

(2172, 16056)

### NN Specific Tokenizer

In [22]:
vocabulary_size = 40000
tokenizer = Tokenizer(num_words = vocabulary_size)
tokenizer.fit_on_texts(df['text'])

sequences = tokenizer.texts_to_sequences(X_train)
X_train_nn = pad_sequences(sequences, maxlen=28)

sequences = tokenizer.texts_to_sequences(X_test)
X_test_nn = pad_sequences(sequences, maxlen=28)

X_train_nn = pd.DataFrame(X_train_nn)
X_test_nn = pd.DataFrame(X_test_nn)

In [23]:
type(X_train_tfidf)

scipy.sparse.csr.csr_matrix

In [24]:
type(X_train_nn)

pandas.core.frame.DataFrame

# Here We'll Define a New Classifier

In [25]:
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        # Check that X and y have correct shape
        #y = y.values
        y = y
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self

## Let's get a baseline using Logisitc Regression

In [26]:
classifier = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg', multi_class='multinomial', n_jobs=-1, random_state=40)
classifier.fit(X_train_tfidf, y_train)
y_predicted_tfidf = classifier.predict(X_test_tfidf)

precision = precision_score(y_test, y_predicted_tfidf, pos_label=None,average='weighted')
print(precision)

0.7745952843343159


## Now We'll Utilize Our NBSVM Classifier

In [27]:
classifier = NbSvmClassifier(C=4, dual=True, n_jobs=-1).fit(X_train_tfidf, y_train)
classifier.fit(X_train_tfidf, y_train)
y_predicted_tfidf = classifier.predict(X_test_tfidf)

precision = precision_score(y_test, y_predicted_tfidf, pos_label=None,average='weighted')
print(precision)

0.8057456468641322


## Now Let's Apply Grid Search to the Model

In [28]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'C': [3.0, 3.2, 3.25, 3.3, 3.4, 3.5],
    'dual' : [True, False]
}

In [29]:
%%time
gs_classifier = GridSearchCV(NbSvmClassifier(), param_grid, n_jobs=-1)
gs_classifier = gs_classifier.fit(X_train_tfidf, y_train)

CPU times: user 125 ms, sys: 43.9 ms, total: 169 ms
Wall time: 733 ms


In [30]:
gs_classifier.best_score_

0.8054788213627992

In [31]:
gs_classifier.best_params_

{'C': 3.2, 'dual': True}

We don't seem to be getting much extra juice from applying grid search to this model.

## Let's Try Regular SVM

In [32]:
classifier = SGDClassifier().fit(X_train_tfidf, y_train)
classifier.fit(X_train_tfidf, y_train)
y_predicted_tfidf = classifier.predict(X_test_tfidf)
precision = precision_score(y_test, y_predicted_tfidf, pos_label=None,average='weighted')
print(precision)

0.7946891903618641


## Let's Try Multinomial Naive Bayes

In [33]:
classifier = SGDClassifier().fit(X_train_tfidf, y_train)
classifier.fit(X_train_tfidf, y_train)
y_predicted_tfidf = classifier.predict(X_test_tfidf)
precision = precision_score(y_test, y_predicted_tfidf, pos_label=None,average='weighted')
print(precision)

0.7960386107807245


The last three approaches yielded pretty similar results. Let's try a deep learning model.

## Build the network with LSTM

### Network Architecture

Our network is going to start with an embedding layer. This layer lets the system expand each token into a much larger vector space. By doing so we can represent each word in a more meaningful way. The layer takes 40K as its first argument, which is the size of our vocabulary. 100 is the second argument, which is the dimension of the embeddings. The third argument is 28 which is the max number of tokens we consider from each tweet.

In [34]:
def create_lstm():
    model = Sequential()
    model.add(Embedding(40000, 100, input_length=28))
    model.add(LSTM(100, dropout=0.9, recurrent_dropout=0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [35]:
classifier = KerasClassifier(build_fn=create_lstm, epochs=3, batch_size=5, verbose=0)

In [36]:
%%time
classifier.fit(X_train_nn, y_train)

CPU times: user 7min 21s, sys: 1min 24s, total: 8min 46s
Wall time: 2min 55s


<keras.callbacks.History at 0x7f92881fef98>

In [37]:
y_predicted_nn = classifier.predict(X_test_nn)
precision = precision_score(y_test, y_predicted_nn, pos_label=None,average='weighted')
print(precision)

0.8128093975364307
