In [132]:
#Authors: Adam Lewandowski, Ivan Sladkov, Patrick English
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.utils import shuffle
from tensorflow import keras

In [2]:
def str_list_to_list(tweets):
    """Return a copy of ``tweets`` whose "tweets" column holds parsed Python objects.

    Every cell of the "tweets" column is passed through ``ast.literal_eval``,
    so a cell such as ``"['a', 'b']"`` becomes the list ``['a', 'b']``.
    The frame passed in is not modified.
    """
    import ast

    parsed_column = tweets["tweets"].apply(ast.literal_eval)
    result = tweets.copy()
    result["tweets"] = parsed_column
    return result
# Before running this cell, download the dataset from
# https://drive.google.com/open?id=1-3lesjyVd1gGnjJGz_cipqO8CNeiTiPx
# and put it into the data folder.
tweets = str_list_to_list(
    pd.read_csv("data/processed_tweets.csv", encoding="ISO-8859-1")
)

In [3]:
# Preview the loaded frame: the 'target' column and the parsed 'tweets' token lists.
print(tweets)

         target                                             tweets
0             0  [@switchfoot, http://twitpic.com/2y1zl, awww, ...
1             0  [upset, can't, update, facebook, texting, ...,...
2             0  [@kenichan, dive, many, time, ball, manage, sa...
3             0             [whole, body, feel, itchy, like, fire]
4             0   [@nationwideclass, behave, i'm, mad, can't, see]
...         ...                                                ...
1599995       4                [woke, school, best, feeling, ever]
1599996       4  [thewdb.com, cool, hear, old, walt, interview,...
1599997       4               [ready, mojo, makeover, ask, detail]
1599998       4  [happy, 38th, birthday, boo, alll, time, tupac...
1599999       4  [happy, #charitytuesday, @thenspcc, @sparkscha...

[1600000 rows x 2 columns]


In [71]:
def prepare_input_data(df, dictionary_size):
    """Build the vocabulary and convert every tweet into a list of word indexes.

    Args:
        df: DataFrame with a 'tweets' column (each cell an iterable of tokens)
            and a 'target' column.
        dictionary_size: Maximum number of distinct tokens to keep, most
            frequent first. When the corpus has fewer unique tokens than
            this, all of them are kept.

    Returns:
        A tuple ``(input_data, word_dictionary, reverse_dictionary)`` where
        ``input_data`` is a DataFrame with the columns 'data' (the result of
        ``parse_tweet`` for each tweet) and 'targets' (``df['target']``),
        ``word_dictionary`` maps token -> index, and ``reverse_dictionary``
        maps index -> token.
    """
    # Pre-processing for word embeddings: count each unique word in the frame
    # that was passed in (not in a notebook-level global).
    word_count = Counter(word for tweet in df['tweets'] for word in tweet)

    # Create dictionaries to convert tokens to integers and integers to tokens
    # (needed for the embedding layer).
    # <pad> will be used for padding the data because the NN inputs have to be
    # the same size and tweets are of varying length.
    # <unk> will be used to replace tokens that were too uncommon to add to
    # the dictionary.
    word_dictionary = {'<pad>': 0, '<unk>': 1}
    # most_common() yields tokens by descending count (ties in first-seen
    # order) and returns at most dictionary_size entries, so a small corpus
    # no longer runs past the end of the ranking.
    ranked_words = word_count.most_common(dictionary_size)
    for index, (word, _count) in enumerate(ranked_words, start=2):
        word_dictionary[word] = index
    reverse_dictionary = {index: word for word, index in word_dictionary.items()}

    # Convert each tweet into its list of dictionary indexes.
    # NOTE(review): parse_tweet is not defined in this function; it has to be
    # defined elsewhere in the notebook. Presumably it maps out-of-dictionary
    # tokens to <unk> - confirm against its definition.
    data = [parse_tweet(tweet, word_dictionary) for tweet in df['tweets']]

    # New dataframe with the index lists and their corresponding targets.
    input_data = pd.DataFrame({
        'data': data,
        'targets': df['target']
    })

    return input_data, word_dictionary, reverse_dictionary

In [127]:
def split_padded_data(data, seed):
    """Shuffle ``data``, split it 80/20 into train/test and pad the sequences.

    Args:
        data: DataFrame with a 'data' column (index lists of varying length)
            and a 'targets' column.
        seed: random_state used for both the shuffle and the 80% sample.

    Returns:
        ``(train_data, train_targets, test_data, test_targets,
        max_tweet_length)``. The data entries are the padded sequences, the
        target entries are the 'targets' columns with label 4 replaced by 1,
        and ``max_tweet_length`` is the length of the longest tweet.
    """
    # Every sequence is padded out to the length of the longest tweet.
    max_tweet_length = max(len(tweet) for tweet in data['data'])

    def pad(frame):
        # Append <pad> (index 0) after each tweet so all rows share one length.
        return keras.preprocessing.sequence.pad_sequences(
            list(frame['data']),
            value=0,
            padding='post',
            maxlen=max_tweet_length,
        )

    def relabel(frame):
        # Map label 4 to 1; other labels are left as they are.
        return frame['targets'].replace(4, 1)

    # 80% train data (a subset is later selected as validation data),
    # the remaining 20% is test data.
    shuffled = shuffle(data, random_state=seed)
    train_part = shuffled.sample(frac=0.8, random_state=seed)
    test_part = shuffled.drop(train_part.index)

    return (
        pad(train_part),
        relabel(train_part),
        pad(test_part),
        relabel(test_part),
        max_tweet_length,
    )
    

In [156]:
def create_model(train_data, train_targets, word_dictionary, max_tweet_length, seed,
                 embedding_dim=150, lstm_units=80, dropout_rate=0.2):
    """Build and compile the Embedding -> LSTM -> Dropout -> Dense classifier.

    Args:
        train_data: Not used by this function; kept for interface compatibility.
        train_targets: Not used by this function; kept for interface compatibility.
        word_dictionary: Token -> index mapping; its length is the embedding
            layer's input dimension.
        max_tweet_length: Length of each input sequence.
        seed: Not used by this function; kept for interface compatibility.
        embedding_dim: Size of each embedding vector.
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Dropout fraction applied after the LSTM layer.

    Returns:
        The compiled (not yet fitted) ``keras.models.Sequential`` model.
    """
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(len(word_dictionary),
                                     output_dim=embedding_dim,
                                     input_length=max_tweet_length,
                                     trainable=True))
    model.add(keras.layers.LSTM(lstm_units))
    model.add(keras.layers.Dropout(dropout_rate))
    # Two softmax outputs paired with the sparse loss: targets are integer class ids.
    model.add(keras.layers.Dense(2, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    model.summary()
    return model

In [149]:
# Settings used for pre-processing the data.
dictionary_size = 25000  # passed to prepare_input_data as the vocabulary cap
seed = 2020  # passed to split_padded_data and create_model

In [150]:
# Pre-process the data: build the dictionaries and index-encode the tweets,
# then split and pad them, and finally build the compiled (unfitted) model.
(input_data,
 word_dictionary,
 reverse_dictionary) = prepare_input_data(tweets, dictionary_size)

(train_data,
 train_targets,
 test_data,
 test_targets,
 max_tweet_length) = split_padded_data(input_data, seed)

model = create_model(train_data, train_targets, word_dictionary, max_tweet_length, seed)

In [155]:
# Train for 3 epochs, holding out 10% of the training arrays for validation.
model.fit(
    train_data,
    train_targets,
    batch_size=50,
    epochs=3,
    validation_split=0.1,
    verbose=1,
)

Train on 1152000 samples, validate on 128000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x158927ae4c8>

In [157]:
# Evaluate on the held-out test split; the result is [loss, accuracy].
model.evaluate(
    test_data,
    test_targets,
    batch_size=50,
)



[0.43850039470940827, 0.7947969]