In [None]:
import re

import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Fetch the tweets

Load a collection of tweets. These tweets were downloaded from Twitter's streaming API over a number of hours and include any tweet containing the words *happy*, *sad*, *joy*, *anger* or *angry*.

In [None]:
# Download the gzipped CSV of tweets and discard every row that has a missing value.
tweets_url = "https://github.com/IBMDeveloperUK/ML-For-Everyone/blob/master/20200609-Analysing-Tweet-Sentiment-with-Keras/tweets.csv.gz?raw=true"
df = pd.read_csv(tweets_url, compression='gzip').dropna()

In [None]:
# Preview the first five rows (head() shows five rows by default).
df.head()

Unnamed: 0,Tweet,Matched Keywords,Date,User,Source,Tweet ID,Tweet URL
1,"@VanessaMorgan Feeling hopeless, angry and afr...",angry,2020-05-31 21:53:48,davidsSon85,Twitter for iPhone,1267212712121176066,https://twitter.com/statuses/1267212712121176066
2,@Byune_hyune happy birthday ya. 🤗💕,happy,2020-05-31 21:53:48,PutriRa2721,Twitter for Android,1267212712204922885,https://twitter.com/statuses/1267212712204922885
3,"@LeaKThompson Happy birthday, Ms Thompson.",happy,2020-05-31 21:53:49,CyberWolf_CC,Twitter for Android,1267212713119399937,https://twitter.com/statuses/1267212713119399937
4,@KillMyCastaway happy birthday tom holland,happy,2020-05-31 21:53:49,tombaeholland,Twitter for iPhone,1267212713387835395,https://twitter.com/statuses/1267212713387835395
5,The way some humans behave to others based on ...,sad,2020-05-31 21:53:49,djzinc,Tweetbot for iΟS,1267212713958178816,https://twitter.com/statuses/1267212713958178816


## Pre-process the tweets

We need to tokenise the tweets and categorise them into the two classes we will be predicting: *joy* and *anger*.

In [None]:
# Working frame: the raw tweet text, its token ids, and two 0/1 label columns.
tkn = Tokenizer()
df2 = pd.DataFrame({'tweet': df['Tweet']})

# Learn the vocabulary from every tweet. vocab_size is one more than the number
# of distinct words, which leaves room for index 0 (the padding value used later).
tkn.fit_on_texts(df2['tweet'])
vocab_size = 1 + len(tkn.word_index)
print(vocab_size)

# One list of integer token ids per tweet.
df2['tkns'] = tkn.texts_to_sequences(df2['tweet'])

# Keyword patterns (case-insensitive). The negative lookbehinds stop a negated
# keyword ("not happy", "no joy", ...) from counting for its own class; the
# negated phrases are listed explicitly under the opposite class instead.
joy_re = re.compile(r"\b((?<!no )joy|(?<!not )happy|not sad|not angry|no anger)\b", re.I)
anger_re = re.compile(r"\b((?<!not )sad|(?<!not )angry|(?<!no )anger|not happy|no joy)\b", re.I)

# The two flags are computed independently, so a single tweet can end up with
# both set or with neither set.
df2['joy'] = df2['tweet'].apply(lambda text: int(joy_re.search(text) is not None))
df2['anger'] = df2['tweet'].apply(lambda text: int(anger_re.search(text) is not None))


df2.head()

472607


Unnamed: 0,tweet,tkns,joy,anger
1,"@VanessaMorgan Feeling hopeless, angry and afr...","[13315, 301, 3441, 36, 4, 823, 89, 34, 546, 10...",0,1
2,@Byune_hyune happy birthday ya. 🤗💕,"[3358, 3359, 1, 16, 251, 14567]",1,0
3,"@LeaKThompson Happy birthday, Ms Thompson.","[11964, 1, 16, 2414, 21397]",1,0
4,@KillMyCastaway happy birthday tom holland,"[5357, 1, 16, 1050, 2349]",1,0
5,The way some humans behave to others based on ...,"[3, 136, 106, 1219, 4347, 2, 326, 1605, 35, 70...",0,1


Calculate the length of each tweet (in tokens), then take the 99% quantile of those lengths and use it as the maximum length of the tweets we will process.

In [None]:
# Number of tokens in each tweet, stored next to the token lists.
df2['len'] = df2['tkns'].apply(len)
df2.head()

Unnamed: 0,tweet,tkns,joy,anger,len
1,"@VanessaMorgan Feeling hopeless, angry and afr...","[13315, 301, 3441, 36, 4, 823, 89, 34, 546, 10...",0,1,15
2,@Byune_hyune happy birthday ya. 🤗💕,"[3358, 3359, 1, 16, 251, 14567]",1,0,6
3,"@LeaKThompson Happy birthday, Ms Thompson.","[11964, 1, 16, 2414, 21397]",1,0,5
4,@KillMyCastaway happy birthday tom holland,"[5357, 1, 16, 1050, 2349]",1,0,5
5,The way some humans behave to others based on ...,"[3, 136, 106, 1219, 4347, 2, 326, 1605, 35, 70...",0,1,42


In [None]:
# Sequence-length cap: the 0.99 quantile of the per-tweet token counts,
# truncated to an integer.
len_q99 = df2['len'].quantile(0.99)
max_len = int(len_q99)
max_len

56

## Pre-trained word vectors

We will be using the GloVe pre-trained word vectors from:

https://nlp.stanford.edu/projects/glove/

There is a specific collection collated from Twitter. This is downloadable as a zip file of CSV data. For use below, the 100-dimensional vector set has been converted to a Python dictionary mapping each word to a NumPy array of shape (100,) and then pickled.

In [None]:

# Open the pickled GloVe dictionary ('glove.pkl') stored in IBM Cloud Object
# Storage and expose it as `streaming_body_1` for the next cell to read.
# NOTE(review): pandas is already imported at the top of the notebook and is
# not used anywhere in this cell.
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

# Stand-in `__iter__` that simply returns 0. It is only bound onto the
# streaming body at the end of this cell, and only when that object has no
# `__iter__` attribute of its own.
def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
# The API key below is a placeholder and must be filled in before this cell can run.
client_32ba12f9aedd46c2a5230b8cb8fe2d3f = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='<fill in API Key>',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

# Your data file was loaded into a botocore.response.StreamingBody object.
# Please read the documentation of ibm_boto3 and pandas to learn more about the possibilities to load the data.
# ibm_boto3 documentation: https://ibm.github.io/ibm-cos-sdk-python/
# pandas documentation: http://pandas.pydata.org/
# Only the 'Body' entry of the get_object() response is kept.
streaming_body_1 = client_32ba12f9aedd46c2a5230b8cb8fe2d3f.get_object(Bucket='twitchdemos2-donotdelete-pr-8dngl3cqohbrop', Key='glove.pkl')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(streaming_body_1, "__iter__"): streaming_body_1.__iter__ = types.MethodType( __iter__, streaming_body_1 ) 


In [None]:
import pickle

# SECURITY: unpickling can execute arbitrary code. Only run this against an
# object you created yourself or otherwise fully trust.
# Read the whole object into memory, rebuild the word -> vector mapping from
# it, then release the raw bytes.
glove_bytes = streaming_body_1.read()
embedding_index = pickle.loads(glove_bytes)
del glove_bytes

In [None]:
# Sanity check: look up the vector stored for a common word, e.g. 'cat'.
# The cell output is whatever `embedding_index` holds under that key.
embedding_index['cat']


array([ 0.38446  , -0.45507  ,  0.45351  ,  0.4301   , -0.050908 ,
       -0.26414  ,  0.43253  , -0.3166   ,  0.32214  ,  0.0064333,
       -0.47066  ,  0.95335  , -3.2063   ,  0.010913 , -0.27565  ,
        1.1732   ,  0.52033  , -0.045973 ,  0.094254 , -0.53846  ,
        0.0035668,  0.11934  , -0.17815  , -0.58093  ,  0.65081  ,
       -0.48746  , -0.50961  ,  0.42771  , -0.30638  ,  0.32385  ,
        0.33687  , -0.1717   , -0.39104  , -0.19038  ,  0.37016  ,
       -0.50396  ,  0.041969 , -0.20517  ,  0.3223   ,  0.41217  ,
       -0.42191  , -0.26359  , -0.1773   , -0.35658  ,  0.52145  ,
        0.57282  ,  0.60204  ,  0.74369  ,  0.33377  , -0.45041  ,
        0.015978 , -0.12575  ,  0.29786  , -0.77635  ,  0.23759  ,
        0.63821  ,  0.63726  ,  1.0079   ,  0.13714  , -0.031928 ,
       -0.21299  ,  0.52348  ,  0.67934  , -0.1427   , -0.64236  ,
       -0.47996  , -0.87915  ,  0.17501  ,  0.64517  ,  0.3778   ,
        0.53493  , -0.29723  , -0.25206  , -0.757    ,  0.3364

## Create an Embedding Matrix
We create an embedding matrix in which row *i* holds the pre-trained vector for the word whose integer token value in the tokenizer's word index is *i*.

In [None]:
# One 100-wide float32 row per vocabulary index, initialised to zeros.
embedding_matrix = np.zeros(shape=(vocab_size, 100), dtype=np.float32)
for token, row in tkn.word_index.items():
    vector = embedding_index.get(token)
    if vector is None:
        # No pre-trained vector for this word: its row stays all zeros.
        continue
    embedding_matrix[row] = vector


In [None]:
# Pad the documents out to the same length, based on our max length calculated above.
# Padding goes at the end of each sequence ('post'); truncating='pre' is the
# pad_sequences default and is only spelled out here for the reader.
padded_docs = pad_sequences(
    df2['tkns'],
    maxlen=max_len,
    padding='post',
    truncating='pre',
)

In [None]:
# Label array with one row per tweet and two columns, in the order [joy, anger].
label_columns = ['joy', 'anger']
labels = df2[label_columns].to_numpy()

# Look at the first padded document as a quick sanity check.
padded_docs[0]

array([13315,   301,  3441,    36,     4,   823,    89,    34,   546,
          10,   189,    72,    92,     3,   136,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0], dtype=int32)

In [None]:
# Confirm the matrix dimensions: it was allocated as (vocab_size, 100).
embedding_matrix.shape

(472607, 100)

## Build and Train our model

We build a neural network model with Keras. The model is a two-layer LSTM model, with a dropout layer after each LSTM layer to prevent overfitting.

The model is then trained and evaluated on the tweets

In [None]:
from tensorflow.keras.models import Sequential
# NOTE(review): CuDNNLSTM is a GPU-only layer and is not exported from
# `tensorflow.keras.layers` in every TensorFlow release -- confirm the runtime
# before re-running this cell.
from tensorflow.keras.layers import Dense, CuDNNLSTM as LSTM, Dropout
from tensorflow.keras.layers import Embedding, Flatten

# Architecture: frozen pre-trained embedding -> LSTM (full sequence out) ->
# dropout -> LSTM (last state only) -> dropout -> 2-unit softmax ([joy, anger]).
model = Sequential()
# The embedding weights come from `embedding_matrix` and are not updated during training.
model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False))
# return_sequences=True so the second LSTM receives one vector per time step.
model.add(LSTM(100, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(2, activation='softmax'))
# compile the model
# NOTE(review): softmax + categorical_crossentropy assumes each label row is
# one-hot, but the `joy` and `anger` labels are computed earlier in the
# notebook as two independent 0/1 flags, so rows can be [1, 1] or [0, 0] --
# confirm this is intended.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summarize the model
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would emit a stray "None" after the table).
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 56, 100)           47260700  
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 56, 100)           80800     
_________________________________________________________________
dropout_2 (Dropout)          (None, 56, 100)           0         
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 100)               80800     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 202       
Total params: 47,422,502
Trainable params: 161,802
Non-trainable params: 47,260,700
____________________________________

In [None]:
# Token ids of the five words the labels were derived from; they are looked up
# by running the words through the fitted tokenizer as one short text.
keyword_text = "joy happy sad angry anger"
keywords = tkn.texts_to_sequences([keyword_text])[0]
keywords

[75, 1, 8, 36, 52]

In [None]:
# Positional 80/20 split: the first 80% of rows are used for training and the
# remaining rows are held back. No shuffling is applied.
thresh = int(len(padded_docs) * 0.8)

train_X, test_X = padded_docs[:thresh], padded_docs[thresh:]
train_y, test_y = labels[:thresh], labels[thresh:]

# Replace every keyword token in the training inputs with 0, so the words the
# labels were derived from are hidden from the model during training.
# A single np.isin mask replaces the former one-pass-per-keyword loop.
# np.where builds a new array, so `padded_docs` (which `train_X` was sliced
# from) is left untouched. `test_X` is not masked.
train_X = np.where(np.isin(train_X, keywords), 0, train_X)

In [None]:
# Spot-check one training row (eighth from the end) after keyword masking.
train_X[-8]

array([    25,    682,   1141,    657,      7,    258,    601,    119,
         1201,      9,   1218,    505,      9,    257,     24,    716,
          115,     29,      3,      0,      4,    260,     31,     24,
         1356,      3,     72,     62,   2325,    981,   7390,      3,
          119,    545,      4,     38,     24,    149,   5908,      3,
         1030,   1008,     45,    174,    421, 389067,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0],
      dtype=int32)

In [None]:
# Train for 20 epochs in batches of 128, holding back 10% of the training
# data for validation.
model.fit(
    x=train_X,
    y=train_y,
    batch_size=128,
    epochs=20,
    validation_split=0.1,
    verbose=1,
)

Train on 357867 samples, validate on 39764 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f943e3a12b0>

## Test our model

We test our model on a small set of sample statements to see what sentiment our model assigns to each. The predictions return an array of [joy, anger] probabilities for each statement.

In [None]:
# Hand-written sample statements to score with the trained model.
tweets = [
    "I love the world",
    "I hate the world",
    "I'm not happy about riots",
    "I like ice cream",
]

# Same preprocessing as the training data: tokenise with the fitted tokenizer,
# then post-pad to max_len.
tweet_docs = tkn.texts_to_sequences(tweets)
tweet_padded_docs = pad_sequences(
    tweet_docs,
    maxlen=max_len,
    padding='post',
)

# One output row per statement, with columns in the order [joy, anger].
model.predict(tweet_padded_docs)

array([[0.5283357 , 0.4716643 ],
       [0.21932708, 0.78067297],
       [0.45187068, 0.5481293 ],
       [0.6312392 , 0.3687608 ]], dtype=float32)