Basic Neural Network model to classify disaster tweets for Kaggle competition. 

1 Building the neural net

In [15]:
import numpy as np
import pandas as pd
import math

In [16]:
# supporting functions
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)).

    Works on scalars and, through NumPy broadcasting, element-wise on arrays.
    """
    denominator = 1 + np.exp(-x)
    return 1.0 / denominator

def sigmoid_derivative(x):
    """Slope of the sigmoid, written in terms of the sigmoid's *output*.

    ``x`` is expected to be an already-activated value s = sigmoid(z); the
    derivative with respect to z is then s * (1 - s). Passing a raw
    pre-activation here does not give the sigmoid's slope.
    """
    complement = 1.0 - x
    return x * complement

# 2 layer nn definition
class NeuralNetwork:
    """Tiny fully-connected network trained with full-batch gradient descent.

    Architecture: input -> one hidden layer of 4 sigmoid units -> 1 sigmoid
    output. There are no bias terms; a constant input column can play that
    role if one is needed.

    Parameters
    ----------
    x : array of shape (n_samples, n_features)
        Training inputs.
    y : array of shape (n_samples, 1)
        Training targets.
    learning_rate : float, optional
        Multiplier applied to every weight update. The default of 1.0 gives
        the plain, unscaled update. Because the gradient is summed over all
        samples, a smaller value may be needed on large datasets to keep the
        sigmoids from saturating.
    """

    def __init__(self, x, y, learning_rate=1.0):
        # an input layer, x
        self.input = x
        # weights are drawn uniformly from [0, 1); there are no biases
        self.weights1 = np.random.rand(self.input.shape[1], 4)
        self.weights2 = np.random.rand(4, 1)
        # the targets, y, and a placeholder for the predictions
        self.y = y
        self.output = np.zeros(y.shape)
        self.learning_rate = learning_rate

    def feedforward(self):
        """Compute the hidden activations and the prediction (y hat)."""
        self.layer1 = sigmoid(np.dot(self.input, self.weights1))
        self.output = sigmoid(np.dot(self.layer1, self.weights2))

    def backprop(self):
        """Take one gradient-descent step on the squared-error loss.

        Must be called after ``feedforward`` so that ``layer1`` and ``output``
        reflect the current weights.
        """
        # Chain rule, output layer first. (y - output) is the *negative* of the
        # loss gradient, which is why the updates below are added.
        output_delta = 2 * (self.y - self.output) * sigmoid_derivative(self.output)
        # Push the output error back through weights2 to the hidden layer.
        hidden_delta = np.dot(output_delta, self.weights2.T) * sigmoid_derivative(self.layer1)

        d_weights2 = np.dot(self.layer1.T, output_delta)
        d_weights1 = np.dot(self.input.T, hidden_delta)

        # Both deltas were computed from the old weights, so update order is irrelevant.
        self.weights1 += self.learning_rate * d_weights1
        self.weights2 += self.learning_rate * d_weights2

In [17]:
# example application
if __name__ == "__main__":
    # Toy problem: the label is the XOR of the first two columns; the third
    # column is a constant 1.
    X = np.array([
        [0, 0, 1],
        [0, 1, 1],
        [1, 0, 1],
        [1, 1, 1],
    ])
    y = np.array([[0], [1], [1], [0]])
    nn = NeuralNetwork(X, y)

    # One forward pass followed by one weight update, 1500 times over.
    for epoch in range(1500):
        nn.feedforward()
        nn.backprop()

    squared_error = (nn.output - y) ** 2
    print("predictions: ", nn.output)
    # The built-in sum adds the rows together, leaving a length-1 array.
    print("average error: ", sum(squared_error) / len(y))

predictions:  [[0.00928497]
 [0.97189493]
 [0.97234052]
 [0.03196776]]
average error:  [0.00066577]


2 Kaggle competition

In [18]:
# Load the cleaned, spell-checked training set. A forward slash is used as the
# path separator because it is accepted on Windows as well as on POSIX systems,
# whereas a backslash only resolves on Windows.
tweets = pd.read_csv('nlp-getting-started/train_clean_spellcheck.csv', usecols=['id', 'text', 'target'])
# Keep the ids aside before narrowing the frame to the label and text columns.
ids = tweets['id']
tweets = tweets[['target', 'text']]

In [19]:
# Show the full tweet text instead of truncating it. None means "no limit";
# the older -1 spelling has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)
# Some text cleaning looks necessary. Keep it naive for now; ideas for later:
#   - remove special characters, punctuation, hashtags
#   - fix spelling errors, remove extra letters
tweets[['target','text']].head(20)

Unnamed: 0,target,text
0,1,our deeds are the reason of this earthquake may Allah forgive us all
1,1,forest fire near la range asks Canada
2,1,all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected
3,1,13000 people receive wildfires evacuation orders in California
4,1,just got sent this photo from ruby Alaska as smoke from wildfires pours into a school
5,1,rocky fire update California hwy 20 closed in both directions due to lake county fire afire wildfires
6,1,flood disaster heavy rain causes flash flooding of streets in mangetout Colorado springs areas
7,1,mi on top of the hill and i can see a fire in the woods
8,1,theirs an emergency evacuation happening now in the building across the street
9,1,mi afraid that the tornado is coming to our area


In [20]:
# create our training data from the tweets
# Raw tweet strings, one array element per tweet.
train_x = np.asarray(tweets['text'])
# Wrap every label in its own single-element row so the result is a column
# (n_tweets x 1) rather than a flat vector.
train_y = np.asarray([[label] for label in tweets['target']])

In [21]:
# time to get serious
import json
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer

In [22]:
# only work with the 3000 most popular words found in our dataset
# Vocabulary size handed to the Tokenizer.
max_words = 3000

# Fit a fresh Tokenizer on the raw tweets.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_x)

# word -> integer id mapping built by the Tokenizer
dictionary = tokenizer.word_index

# Drop the first 30 entries of the mapping, on the assumption that the most
# common words carry no predictive power.
# NOTE(review): this relies on word_index being ordered most-frequent-first -
# confirm against the Keras Tokenizer documentation.
to_remove = list(dictionary)[:30]
dictionary = {word: index for word, index in dictionary.items() if word not in to_remove}

# Persist the filtered mapping so it can be reused later.
with open('dictionary.json', 'w') as dictionary_file:
    json.dump(dictionary, dictionary_file)


def convert_text_to_index_array(text):
    """Turn one piece of text into the list of its word ids.

    The text is split into words with ``kpt.text_to_word_sequence`` and every
    word is looked up in the module-level ``dictionary``. The result has one
    entry per kept word, so different texts give lists of different lengths.

    Words listed in ``to_remove`` are skipped. Words that are not in
    ``dictionary`` at all (for example vocabulary that only occurs in unseen
    text) are skipped as well instead of raising ``KeyError``.

    Parameters
    ----------
    text : str
        The raw text to convert.

    Returns
    -------
    list of int
        Ids of the kept words, in order of appearance.
    """
    return [
        dictionary[word]
        for word in kpt.text_to_word_sequence(text)
        if word not in to_remove and word in dictionary
    ]

# For each tweet, change each token to its id in the Tokenizer's word_index.
allWordIndices = [convert_text_to_index_array(text) for text in train_x]

# The per-tweet lists have different lengths, so they stay a plain list of
# lists: np.asarray on ragged input raises ValueError on NumPy >= 1.24, and
# sequences_to_matrix only needs to iterate over the sequences.

# Build a binary bag-of-words matrix: one row per tweet, with a 1 in every
# column whose word id occurs in that tweet.
# NOTE: this overwrites train_x (previously the raw tweet strings), so this
# cell cannot be re-run without first re-running the cell that creates train_x.
train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')

# create our first neural network!
if __name__ == "__main__":
    # Same training loop as the toy example, now on the tweet matrix.
    X, y = train_x, train_y
    nn = NeuralNetwork(X, y)

    for epoch in range(1500):
        nn.feedforward()
        nn.backprop()
    squared_error = (nn.output - y) ** 2
    # The built-in sum adds the rows together, leaving a length-1 array.
    print("average error: ", sum(squared_error) / len(y))

# Round the network outputs to hard labels. The (n, 1) output is flattened
# first: calling int() on each one-element row is deprecated since NumPy 1.25.
pred = [int(value) for value in nn.output.round().ravel()]
submission_1 = pd.DataFrame({"id": ids, "target": pred, 'real': tweets.target})
# Columns are read with brackets so the lookup can never be shadowed by a
# DataFrame attribute of the same name.
# Mean squared difference between true and predicted labels.
print(sum((submission_1['real'] - submission_1['target'])**2)/len(submission_1['real']))
# Sum of the predicted labels (the number of 1-predictions when labels are 0/1).
print(sum(submission_1['target']))

Looks like our model isn't working! It predicts 0 (i.e. not about a disaster) for every tweet.

There's lots of room for improvement, including:
    1 raising the max word limit to > 3000 (didn't work)
    2 cleaning the raw tweets, see above for ideas (didn't work)
    3 running more iterations (didn't work)

Time to try something a bit more advanced.

In [23]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

# treat the labels as categories
# One-hot encode the labels into two columns, matching the two-unit softmax
# output below.
# NOTE: this overwrites train_y, so re-running this cell on its own would
# encode the already-encoded labels a second time.
train_y = keras.utils.to_categorical(train_y, 2)

# Two hidden layers, each followed by 50% dropout, then a softmax over the
# two classes.
model = Sequential([
    Dense(512, input_shape=(max_words,), activation='relu'),
    Dropout(0.5),
    Dense(256, activation='sigmoid'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])

In [24]:
# Categorical cross-entropy pairs with the one-hot labels and softmax output;
# accuracy is tracked for reporting.
model.compile(
    optimizer='Adagrad',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [25]:
# Train for 5 epochs in mini-batches of 32, holding out 10% of the rows for
# validation. The call stays the last expression so the History object is
# shown as the cell output.
model.fit(
    train_x,
    train_y,
    epochs=5,
    batch_size=32,
    validation_split=0.1,
    shuffle=True,
    verbose=1,
)

Train on 6851 samples, validate on 762 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x284686dc188>

In [26]:
labels = [0, 1]
# Per-class scores for every row of train_x - the same matrix the model was
# fit on, so these are predictions on training data.
pred = model.predict(train_x)
# Pick, for each row, the label whose score is highest.
pred = [labels[best] for best in np.argmax(pred, axis=1)]

In [27]:
from sklearn.metrics import classification_report, f1_score , recall_score, precision_score
# Precision / recall / F1 per class. `pred` was produced from train_x, so this
# report describes the fit on the training data, not performance on unseen tweets.
print(classification_report(y_true=tweets['target'], y_pred=pred))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      4342
           1       0.94      0.92      0.93      3271

    accuracy                           0.94      7613
   macro avg       0.94      0.94      0.94      7613
weighted avg       0.94      0.94      0.94      7613



In [28]:
# Persist the model in two parts: the architecture as JSON and the trained
# weights as a separate HDF5 file.
model_json = model.to_json()
with open('model3.json', 'w') as architecture_file:
    architecture_file.write(model_json)

model.save_weights('model3.h5')