# Multi-layer Perceptron (with Embeddings)

This notebook is an example of a multi-layer perceptron with Keras (https://keras.io/)

In [1]:
# Import some needed packages
import matplotlib.pyplot as plt
%matplotlib notebook
from keras.models import Sequential
from keras.layers import Dense
import pandas as pd

  from ._conv import register_converters as _register_converters
Using CNTK backend


## Load data

We are going to load the tweets from SemEval 2018...

In [14]:
import pandas as pd
import glob

fpattern = '../Exercise_2-TwitterSentimentAnalysis/data/twitter-20*train-*.tsv'
filenames = [filename for filename in sorted(glob.glob(fpattern))]
# print(filenames)


In [15]:
# Load all files into a big data frame...
column_names = ['id', 'tag', 'tweet']
df = pd.concat([pd.read_csv(f, sep="\t", quoting=3, names=column_names) for f in filenames], ignore_index=True, sort=True)
# df.info()
# df.head()

In [16]:
# Drop rows having 'Not Available'...
df = df[df.tweet != 'Not Available']
# df.info()
# df.head()

## A function to convert a tweet into a set of words

In [17]:
import string
import re
#import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords

# turn a document into a list of clean tokens
def clean_doc(doc):
    # Remove links...
    doc = re.sub("\w+:\/\/\S+", " ", doc)
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

## Process all tweets, and save results in the dataframe...

In [18]:
import numpy as np

df['tokens'] = np.array([ clean_doc(tweet) for tweet in df.tweet ])
# df.info()  
# df.head()

## Perform all actions also for dev/test data...

In [19]:
fpattern = '../Exercise_2-TwitterSentimentAnalysis/data/twitter-20*dev-*.tsv'
devfs    = [filename for filename in sorted(glob.glob(fpattern))]
fpattern = '../Exercise_2-TwitterSentimentAnalysis/data/twitter-20*test-*.tsv'
testfs   = [filename for filename in sorted(glob.glob(fpattern))]
df_dev   = pd.concat([pd.read_csv(f, sep="\t", quoting=3, names=column_names) for f in devfs],  ignore_index=True, sort=True)
df_test  = pd.concat([pd.read_csv(f, sep="\t", quoting=3, names=column_names) for f in testfs], ignore_index=True, sort=True)
df_dev   = df_dev[df_dev.tweet != 'Not Available']
df_test  = df_test[df_test.tweet != 'Not Available']
df_dev['tokens']  = np.array([ clean_doc(tweet) for tweet in df_dev.tweet ])
df_test['tokens'] = np.array([ clean_doc(tweet) for tweet in df_test.tweet ])

## Extract our vocabulary...

In [21]:
from collections import Counter
import itertools

vocabulary = Counter()
for tweet_tokens in itertools.chain(df.tokens, df_dev.tokens, df_test.tokens):
    vocabulary.update(tweet_tokens)

print('Total tweets: ', sum(1 for _ in itertools.chain(df.tokens, df_dev.tokens, df_test.tokens)))
# vocabulary.most_common(10)

Total tweets:  30790


## Filter words using the vocabulary...

In [22]:
def token_to_vector_words(tokens, vocabulary):
    tokens = [w for w in tokens if w in vocabulary]
    return ' '.join(tokens)

# print(df.tweet[0])
# token_to_vector_words(df.tokens[0], vocabulary)

In [23]:
df['vector_tokens']      = np.array([ token_to_vector_words(tweet, vocabulary) for tweet in df.tokens ])
df_dev['vector_tokens']  = np.array([ token_to_vector_words(tweet, vocabulary) for tweet in df_dev.tokens ])
df_test['vector_tokens'] = np.array([ token_to_vector_words(tweet, vocabulary) for tweet in df_test.tokens ])
# df.info()
# df.head()

In [24]:
# Map tag from class (positive, negative) to numbers...
df['btag']      = df.tag.astype('category').cat.codes
df_dev['btag']  = df_dev.tag.astype('category').cat.codes
df_test['btag'] = df_test.tag.astype('category').cat.codes
# df_dev.head(6)

## Using pre-trained embeddings...

Pre-trained embeddings can be found:

GloVe: http://nlp.stanford.edu/projects/glove/

Word2Vec: https://code.google.com/archive/p/word2vec/

In [25]:
# Load embeddings into a dict...
embeddings_index = {}
glove_data = '../data/embeddings/glove.twitter.27B.50d.txt'
f = open(glove_data)
for line in f:
    values = line.split()
    word = values[0]
    value = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = value
f.close()
 
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 1193514 word vectors.


## Lets make our vectors...

In [26]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.vector_tokens)

word_index = tokenizer.word_index
embedding_dimension = 50
# The embedding_matrix matrix maps words to vectors in the specified embedding dimension (here 50):
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dimension))
for word, i in word_index.items():
    # print(word, i) <= i starts from 1
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector;#[:embedding_dimension]
print(embedding_matrix.shape)
print(embedding_matrix)

Xtrain = tokenizer.texts_to_sequences(df.vector_tokens)
Ytrain = df.btag
Xtest  = tokenizer.texts_to_sequences(df_test.vector_tokens)
Ytest  = df_test.btag
print(Xtrain[0])

## Get the longest tweet...
longest = max(df.tokens,key=len)
print(longest)
longest = max(Xtrain,key=len)
print(longest)

(23740, 50)
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.51880002  0.039331    0.080883   ... -0.81913    -0.28933999
   0.87558001]
 [ 0.43026     0.0081207  -0.0090224  ... -0.24276    -0.51657999
   1.24720001]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.086368    1.26409996  0.18991999 ... -0.43009001 -0.30998001
   0.047121  ]
 [-1.40139997  1.06110001 -0.14475    ... -0.16698    -0.26725999
  -0.70081002]]
[2625, 141, 358, 7, 6, 2340, 1249, 49]
['Make', 'Sure', 'To', 'Come', 'To', 'The', 'Bob', 'Jones', 'Game', 'Friday', 'Free', 'Hot', 'Dogs', 'Hamburgers', 'amp', 'Food', 'outside', 'gate', 'amp', 'watch', 'Bob', 'Jones', 'take', 'Austin', 'High']
[34, 119, 327, 29, 327, 5, 192, 1542, 15, 10, 96, 470, 1564, 14202, 11, 635, 1020, 3845, 11, 24, 192, 1542, 85, 1321, 465]


## A simple MLP model...

In [None]:
n_words = Xtest.shape[1]
# print(n_words)

In [None]:
from keras.utils.vis_utils import plot_model

# define network
model = Sequential()
model.add(Dense(units=64, activation='relu', input_shape=(n_words,)))
model.add(Dense(units=3, activation='softmax'))
# compile network
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['accuracy'])
# summarize defined model
model.summary()

from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

SVG(model_to_dot(model).create(prog='dot', format='svg'))

## Fit our network...

In [None]:
# fit network
from keras import backend as K
model.fit(K.cast_to_floatx(Xtrain), K.cast_to_floatx(Ytrain_one_hot), batch_size=10, epochs=30, verbose=2)

## Evaluate our fit network...


In [None]:
# evaluate
loss, acc = model.evaluate(K.cast_to_floatx(Xtest), K.cast_to_floatx(Ytest_one_hot), verbose=2)
print('Test Accuracy: %f' % (acc*100))

## Comparing Word Scoring Methods

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras import backend as K

def prepare_data(train_df, test_df, mode):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_df.vector_tokens)
    Xtrain = tokenizer.texts_to_matrix(train_df.vector_tokens, mode=mode)
    Ytrain = np_utils.to_categorical(train_df.btag)
    Xtest  = tokenizer.texts_to_matrix(test_df.vector_tokens, mode=mode)
    Ytest  = np_utils.to_categorical(test_df.btag)
    return K.cast_to_floatx(Xtrain), K.cast_to_floatx(Ytrain), K.cast_to_floatx(Xtest), K.cast_to_floatx(Ytest)


In [None]:
# evaluate a neural network model
def evaluate_mode(Xtrain, Ytrain, Xtest, Ytest, mode):
    scores = list()
    n_repeats = 10
    n_words = Xtest.shape[1]
    for i in range(n_repeats):
        # define network
        model = Sequential()
        model.add(Dense(units=64, input_shape=(n_words,), activation='relu'))
        model.add(Dense(units=3, activation='softmax'))
        # compile network
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # fit network
        model.fit(Xtrain, Ytrain, epochs=10, verbose=2)
        # evaluate
        loss, acc = model.evaluate(Xtest, Ytest, verbose=0)
        scores.append(acc)
        print('%s %d accuracy: %s' % (mode, (i+1), acc))
    return scores

In [None]:
# run experiment
modes = ['binary', 'count', 'tfidf', 'freq']
results = pd.DataFrame()
for mode in modes:
    # prepare data for mode
    Xtrain, Ytrain, Xtest, Ytest = prepare_data(df, df_test, mode)
    # evaluate model on data for mode
    results[mode] = evaluate_mode(Xtrain, Ytrain, Xtest, Ytest, mode)



In [None]:
from matplotlib import pyplot
# summarise results
print(results.describe())
# plot results
results.boxplot()
pyplot.show()