In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

# WordCloud and matplotlib for visualization
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## TO DO

- assign keywords to rows with missing keyword labels - DONE
- remove symbols (#, ", etc.) - DONE
- remove links - DONE
- validation
- make slides - IN PROGRESS
- try TF-IDF, LSA, LSTMs / RNNs (the list is long)
- try different classifiers
- read the discussion board for inspiration


In [2]:
# Load the pre-cleaned tweets; the first CSV column holds the row index.
train = pd.read_csv("preprocessed_train_data.csv", index_col = 0)
test = pd.read_csv("preprocessed_test_data.csv", index_col = 0)

# Build one document per tweet by prefixing the text with its keyword.
# fillna('') first: a cell that is empty in the CSV is read back as NaN, and
# `str + NaN` would turn the whole document into NaN instead of a string.
# Rows where both columns are present are unchanged by this.
trained_tweets = train['keyword'].fillna('') + train['text'].fillna('')
test_tweets = test['keyword'].fillna('') + test['text'].fillna('')

In [3]:
# Preview the first five combined keyword + text training documents.
trained_tweets.head(n=5)

0    earthquak  deed reason earthquak may allah for...
1     forest fire  forest fire near la rang ask canada
2    evacu  resid ask shelter place notifi offic ev...
3    wildfir  peopl receiv wildfir evacu order cali...
4    wildfir  got sent photo rubi alaska smoke wild...
dtype: object

In [4]:
# Preview the first five combined keyword + text test documents.
test_tweets.head(n=5)

0                      crash  happen terribl car crash
1    earthquak  heard earthquak differ citi stay sa...
2    forest fire  forest fire spot pond gees flee a...
3            apocalyps  apocalyps light spokan wildfir
4          typhoon  typhoon soudelor kill china taiwan
dtype: object

# Encoding and Vectorizers

In [5]:
# Disabled experiment, kept for reference: target-encode the `keyword` column
# with category_encoders and join the encoded column (suffix `_target`) onto
# both `train` and `test`.
# NOTE(review): as written the encoder is fitted on all of `train`, and this
# cell sits before the train/validation split -- if it is re-enabled, confirm
# that fitting on the rows later used for validation is intended.
# import category_encoders as ce

# # Target encoding
# features = ['keyword']
# encoder = ce.TargetEncoder(cols=features)
# encoder.fit(train[features],train['target'])

# train = train.join(encoder.transform(train[features]).add_suffix('_target'))
# test = test.join(encoder.transform(test[features]).add_suffix('_target'))

In [6]:
# Hold out 20% of the labelled tweets for validation (80:20 split).
# The seed is fixed so the shuffled partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    trained_tweets,
    train['target'].values,
    train_size=0.80,
    test_size=0.20,
    random_state=12,
    shuffle=True,
)

# TF-IDF

In [7]:
# Precompute TF-IDF representations of the training and hold-out documents.

# Word-level features: uni- to tri-grams with English stop words removed.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 3),
    lowercase=True,
    min_df=5,
    max_features=30000)

# Character-level features: 3- to 6-character n-grams.
# `stop_words` is deliberately not passed here: scikit-learn only applies stop
# words when analyzer == 'word', so it had no effect on the char features.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 6),
    lowercase=True,
    min_df=5,
    max_features=50000)

# Concatenate the word and character feature matrices column-wise.
vectorizer = FeatureUnion([('word_vectorizer', word_vectorizer),  ('char_vectorizer', char_vectorizer)])

# Learn vocabulary and IDF weights from the training split only, then reuse
# the fitted vectorizer for the hold-out split.
# NOTE(review): .toarray() densifies the sparse matrices; at the shapes printed
# below that is a large allocation -- keep them sparse if the downstream
# estimators accept sparse input.
X_train_vectors = vectorizer.fit_transform(X_train).toarray()
X_test_vectors = vectorizer.transform(X_test).toarray()
print(X_train_vectors.shape, X_test_vectors.shape)

#X_train_text = X_train.tolist()
#X_test_text = X_test.tolist()

(6090, 48864) (1523, 48864)


In [8]:
# Use the Keras that ships with TensorFlow, consistent with the
# `tensorflow.keras` imports used for the models in this notebook, instead of
# mixing in the standalone `keras` package.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Tokenize the sentences
max_features=50000  # passed to Tokenizer as num_words (vocabulary cap)
maxlen=25           # length every sequence is padded/truncated to
tokenizer = Tokenizer(num_words=max_features)
# Fit the vocabulary on the training split only.
tokenizer.fit_on_texts(X_train.tolist())
# NOTE: from here on X_train / X_test hold integer sequences instead of raw
# text, so cells that expect text (including this one) cannot be re-run
# without re-running the train/validation split first.
X_train = tokenizer.texts_to_sequences(X_train.tolist())
X_test = tokenizer.texts_to_sequences(X_test.tolist())

## Pad the sentences
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)

Using TensorFlow backend.


In [9]:
# Inspect the padded integer sequences produced by the tokenizer cell
# (the leading zeros in each row are padding).
X_train

array([[   0,    0,    0, ...,  353,  716,  838],
       [   0,    0,    0, ...,   39,  283,  403],
       [   0,    0,    0, ..., 1614,    9, 2313],
       ...,
       [   0,    0,    0, ..., 1496,  947,  310],
       [   0,    0,    0, ...,  364,  513, 1019],
       [   0,    0,    0, ...,  213,   73,  729]], dtype=int32)

# Recurrent Neural Networks

In [10]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, GlobalAveragePooling1D, GlobalMaxPooling1D, Dropout, Embedding
from tensorflow.keras import backend as K

# Size the embedding table from the tokenizer instead of X_train.max():
# pad_sequences truncates long tweets, so the largest index left in the padded
# training array can be smaller than an index that still occurs in X_test,
# which would then be out of range for the Embedding layer.
# The tokenizer emits indices 1 .. min(num_words - 1, len(word_index));
# index 0 is the padding value, hence the +1.
vocab_dim = min(max_features, len(tokenizer.word_index) + 1)
# Embedding width (also reused as the recurrent layer size): integer square
# root of the vocabulary size.
embed_dim = int(np.sqrt(vocab_dim))
print(vocab_dim, embed_dim)

# Baseline: embedding -> SimpleRNN (final state only) -> sigmoid output.
model = Sequential([
    Embedding(input_dim=vocab_dim, output_dim=embed_dim),
    SimpleRNN(units=embed_dim, return_sequences=False),
    Dense(1, activation='sigmoid')
])
model.compile(
    loss='binary_crossentropy', 
    optimizer='adam', 
    metrics=['accuracy']
)

10401 101


In [11]:
# Train the SimpleRNN baseline for 5 epochs, evaluating on the hold-out split
# after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=64,
    shuffle=True,
    verbose=True,
)
# Reset the Keras backend state before the next model is built.
K.clear_session()

Train on 6090 samples, validate on 1523 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
# SimpleRNN variant: keep the output of every time step and max-pool over the
# sequence instead of using only the final state.
model = Sequential()
model.add(Embedding(input_dim=vocab_dim, output_dim=embed_dim))
model.add(SimpleRNN(units=embed_dim, return_sequences=True))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [13]:
# Train the max-pooled SimpleRNN for 5 epochs, evaluating on the hold-out
# split after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=64,
    shuffle=True,
    verbose=True,
)
# Reset the Keras backend state before the next model is built.
K.clear_session()

Train on 6090 samples, validate on 1523 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Bidirectional RNN

In [14]:
from tensorflow.keras.layers import Bidirectional

# Bidirectional SimpleRNN: the per-step outputs of the wrapped recurrent layer
# are max-pooled over the sequence before the sigmoid output.
model = Sequential()
model.add(Embedding(input_dim=vocab_dim, output_dim=embed_dim))
model.add(Bidirectional(SimpleRNN(units=embed_dim, return_sequences=True)))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [15]:
# Train the bidirectional RNN for 5 epochs, evaluating on the hold-out split
# after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=64,
    shuffle=True,
    verbose=True,
)
# Reset the Keras backend state before the next model is built.
K.clear_session()

Train on 6090 samples, validate on 1523 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# LSTM

In [16]:
from tensorflow.keras.layers import LSTM

# LSTM variant of the baseline: embedding -> LSTM (final state only) ->
# sigmoid output.
model = Sequential()
model.add(Embedding(input_dim=vocab_dim, output_dim=embed_dim))
model.add(LSTM(units=embed_dim, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [17]:
# Train the LSTM for 5 epochs, evaluating on the hold-out split after every
# epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=64,
    shuffle=True,
    verbose=True,
)
# Reset the Keras backend state once training has finished.
K.clear_session()

Train on 6090 samples, validate on 1523 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
