# Simple tf.keras model
Word embedding, LSTM, concatenation

**GloVe weights info here: https://nlp.stanford.edu/projects/glove/**

Score approximately 0.81 in current version - can easily be improved by playing around with the model...

**Import libraries needed**

In [13]:
import numpy as np
import pandas as pd
import tensorflow as tf
import csv
import re

**Define functions for converting the attributes and text to decimal numbers**

In [14]:
def convert_test_attributes_to_onehot(df, value_list):
    # Return a one-hit encoded numpy array for test data keyword and location (because that is easier)
    one_hots = np.zeros((len(df), len(value_list)))
    
    for index, row in df.iterrows():
        if row['keyword'] != '':
            try:
                one_hots[index, value_list.index(row['keyword'])] = 1
            except ValueError:
#                print('could not find keyword: [' + str(row['keyword']) + '] at index [' + str(index) + ']')
                continue

        if row['keyword'] != '':
            try:
                one_hots[index, value_list.index(row['location'])] = 1
            except ValueError:
#                print('could not find location: [' + str(row['keyword']) + '] at index [' + str(index) + ']')
                continue

    return one_hots
    

def convert_attributes_to_onehot(df):
    # Return a one-hit encoded dataframe for keyword and location
    one_hot_keywords = pd.get_dummies(train_data['keyword'])
    one_hot_locations = pd.get_dummies(train_data['location'])

    return pd.concat([one_hot_keywords, one_hot_locations.reindex(one_hot_keywords.index)], axis=1)
          
    
def vectorize_tweets(dataframe, df_glove):
    # First create a blank matrix for our word vectors
    vectorized_data = np.zeros((len(dataframe), MAX_TWEET_LENGTH, VECTORS_PER_WORD))
    
    # Now loop thorugh the words and add the corresponding vectors to the matrix
    for index, row in dataframe.iterrows():
        i = 0
        # Remove special characters and convert text to lower case, then split on space
        for word in re.sub(r'\W+', ' ', row['text'].lower()).split(' '):
            try:
                vectorized_data[index, i, :] = df_glove.loc[word]
            except KeyError:
                # print(word + ' not found')
                continue
                
            i += 1
            
        # Print a message for every 100 texts processed - just to check if we are still alive
        if index % 100 == 99:
            print('Processing text number ' + str(index + 1) + ' of ' + str(len(dataframe)))
        
    return vectorized_data

**Define function for creating the neural network**

In [15]:
def build_model(lstm_shape, dense_shape):
    dropout = 0.5
    
    # First part of the network for keyword and location
    dense_input = tf.keras.layers.Input(shape=(dense_shape))
    dense1 = tf.keras.layers.Dense(15)(dense_input)
    
    # Second part of the network for the text analysis
    lstm_input = tf.keras.layers.Input(shape=(lstm_shape))
    lstm1 = tf.keras.layers.LSTM(units=350, return_sequences=True)(lstm_input)
    lstm2 = tf.keras.layers.Dropout(dropout)(lstm1)
    lstm3 = tf.keras.layers.LSTM(units=50, return_sequences=True)(lstm2)
    lstm4 = tf.keras.layers.Flatten()(lstm3)
    lstm5 = tf.keras.layers.Dense(50)(lstm4)
    
    # Concatenate part for concatenating the two parts above
    concat1 = tf.keras.layers.Concatenate()([dense1, lstm5])
    concat2 = tf.keras.layers.Dropout(dropout)(concat1)
    concat3 = tf.keras.layers.Dense(50)(concat2)
    
    # Output of model
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(concat3)
    
    # Compile the model
    model = tf.keras.Model(inputs=[dense_input, lstm_input], outputs=outputs)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['acc'])
    
    return model

In [16]:
import tensorflow as tf
dropout = 0.5

# First part of the network for keyword and location
dense_input = tf.keras.layers.Input(shape=(3212))
dense1 = tf.keras.layers.Dense(15)(dense_input)

# Second part of the network for the text analysis
lstm_input = tf.keras.layers.Input(shape=(20,20))
lstm1 = tf.keras.layers.LSTM(units=350, return_sequences=True)(lstm_input)
lstm2 = tf.keras.layers.Dropout(dropout)(lstm1)
lstm3 = tf.keras.layers.LSTM(units=50, return_sequences=True)(lstm2)
lstm4 = tf.keras.layers.Flatten()(lstm3)
lstm5 = tf.keras.layers.Dense(50)(lstm4)

# Concatenate part for concatenating the two parts above
concat1 = tf.keras.layers.Concatenate()([dense1, lstm5])
concat2 = tf.keras.layers.Dropout(dropout)(concat1)
concat3 = tf.keras.layers.Dense(50)(concat2)

# Output of model
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(concat3)

# Compile the model
model = tf.keras.Model(inputs=[dense_input, lstm_input], outputs=outputs)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
          loss=tf.keras.losses.BinaryCrossentropy(),
          metrics=['acc'])

**Constants needed**

In [17]:
MAX_TWEET_LENGTH = 280 # Tweets can be a maximum of 280 characters
VECTORS_PER_WORD = 50  # The number of individual vector values that represents a word
BATCH_SIZE = 64
NUM_EPOCHS = 200

**Load the data and convert it to something that can be represented with decimal numbers**
This takes a while...

In [18]:
# Start out by loading the train and test files into Pandas
print('Loading test and training data')
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')

# Next load the glove file 
print('Loading GloVe file')
glove_data = pd.read_csv('glove.6B.50d.txt', sep=' ', index_col=0, header = None, quoting=csv.QUOTE_NONE, encoding='utf-8')


df_one_hot_attributes = convert_attributes_to_onehot(train_data)
df_test_one_hot_attributes = convert_test_attributes_to_onehot(test_data, df_one_hot_attributes.columns.to_list())
df_one_hot_attributes.head()
# Now, vectorize the test and training data so that it can be understood by our model
vectorized_training_data = vectorize_tweets(train_data, glove_data)
vectorized_testing_data = vectorize_tweets(test_data, glove_data)
vectorized_training_data

Loading test and training data
Loading GloVe file
Processing text number 100 of 7613
Processing text number 200 of 7613
Processing text number 300 of 7613
Processing text number 400 of 7613
Processing text number 500 of 7613
Processing text number 600 of 7613
Processing text number 700 of 7613
Processing text number 800 of 7613
Processing text number 900 of 7613
Processing text number 1000 of 7613
Processing text number 1100 of 7613
Processing text number 1200 of 7613
Processing text number 1300 of 7613
Processing text number 1400 of 7613
Processing text number 1500 of 7613
Processing text number 1600 of 7613
Processing text number 1700 of 7613
Processing text number 1800 of 7613
Processing text number 1900 of 7613
Processing text number 2000 of 7613
Processing text number 2100 of 7613
Processing text number 2200 of 7613
Processing text number 2300 of 7613
Processing text number 2400 of 7613
Processing text number 2500 of 7613
Processing text number 2600 of 7613
Processing text number 

array([[[ 0.3466   ,  0.40689  , -0.079036 , ...,  0.31382  ,
         -0.18248  ,  0.10831  ],
        [ 0.04911  , -0.2102   , -0.26752  , ..., -0.17214  ,
          0.7101   ,  0.0040847],
        [ 0.96193  ,  0.012516 ,  0.21733  , ...,  0.14032  ,
         -0.38468  , -0.38712  ],
        ...,
        [ 0.       ,  0.       ,  0.       , ...,  0.       ,
          0.       ,  0.       ],
        [ 0.       ,  0.       ,  0.       , ...,  0.       ,
          0.       ,  0.       ],
        [ 0.       ,  0.       ,  0.       , ...,  0.       ,
          0.       ,  0.       ]],

       [[-0.0033744,  0.48159  , -0.38963  , ..., -0.92896  ,
          0.4294   , -0.87843  ],
        [ 0.50905  , -0.36805  ,  0.41275  , ..., -0.03356  ,
          0.056012 , -0.43283  ],
        [ 1.2142   ,  0.56772  ,  0.10257  , ..., -0.95912  ,
          0.211    , -0.89709  ],
        ...,
        [ 0.       ,  0.       ,  0.       , ...,  0.       ,
          0.       ,  0.       ],
        [ 0.

**Build the model, fit with data, make predictions and save to submit file**

In [22]:
# Build the model, show a summary and fit the model
model = build_model([MAX_TWEET_LENGTH, VECTORS_PER_WORD], len(df_one_hot_attributes.columns))
model.summary()

# Save the epoch with the lowest validation loss
checkpoint_save = tf.keras.callbacks.ModelCheckpoint('saved_model.h5', save_best_only=True, monitor='val_loss', mode='min')

# Use this to train on all training data - no validation
#model.fit(x=[df_one_hot_attributes, vectorized_training_data], y=train_data['target'], batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, verbose=1, shuffle=True, callbacks=[checkpoint_save])
# Use this to split 0.9:0.1 between train and validation
model.fit(x=[df_one_hot_attributes, vectorized_training_data], y=train_data['target'], batch_size=BATCH_SIZE,
          epochs=4, verbose=1, shuffle=True, validation_split=0.01, callbacks=[checkpoint_save])

# Reload the best model for predictions
tf.keras.models.load_model('saved_model.h5')

# Create the submission file
print('Training complete. Commencing creation of submission file')
df_submission = pd.DataFrame()
df_submission['id'] = test_data['id']

predictions = model.predict([df_test_one_hot_attributes, vectorized_testing_data])
prediction_list = []
for prediction in predictions:
    if prediction < 0.5:
        prediction_list += [0]
    else:
        prediction_list += [1]
        
df_submission['target'] = prediction_list

df_submission.head(5)
df_submission.to_csv('submission.csv', index=False)

print('All done.')

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            [(None, 280, 50)]    0                                            
__________________________________________________________________________________________________
lstm_6 (LSTM)                   (None, 280, 350)     561400      input_8[0][0]                    
__________________________________________________________________________________________________
dropout_6 (Dropout)             (None, 280, 350)     0           lstm_6[0][0]                     
__________________________________________________________________________________________________
lstm_7 (LSTM)                   (None, 280, 50)      80200       dropout_6[0][0]                  
____________________________________________________________________________________________

In [1]:
vectorized_training_data.shape

NameError: name 'vectorized_training_data' is not defined

In [7]:
from keras.models import load_model

Using TensorFlow backend.


In [11]:
model = tf.keras.models.load_model('saved_model.h5')

In [21]:
predictions = model.predict([df_test_one_hot_attributes, vectorized_testing_data])
prediction_list = []
for prediction in predictions:
    if prediction < 0.5:
        prediction_list += [0]
    else:
        prediction_list += [1]
        
df_submission['target'] = prediction_list

df_submission.head(5)
df_submission.to_csv('submission-other.csv', index=False)