In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras

Using TensorFlow backend.


In [4]:
# Read the train and test tweet CSVs from the shared data folder, decoding both as ISO-8859-1.
train_set = pd.read_csv('../__DATA__/twitter-sentiment-analysis2/train.csv', encoding = 'ISO-8859-1' )
test_set = pd.read_csv('../__DATA__/twitter-sentiment-analysis2/test.csv', encoding = 'ISO-8859-1')

In [27]:
# (rows, columns) of the training frame
train_set.shape

(99989, 4)

In [28]:
# (rows, columns) of the test frame
test_set.shape

(299989, 3)

In [31]:
# Peek at the first five tweets in the test frame
test_set.SentimentText[:5]

0                         is so sad for my APL frie...
1                       I missed the New Moon trail...
2                              omg its already 7:30 :O
3              .. Omgaga. Im sooo  im gunna CRy. I'...
4             i think mi bf is cheating on me!!!   ...
Name: SentimentText, dtype: object

In [32]:
# Peek at the first five tweets in the training frame
train_set.SentimentText[:5]

0                         is so sad for my APL frie...
1                       I missed the New Moon trail...
2                              omg its already 7:30 :O
3              .. Omgaga. Im sooo  im gunna CRy. I'...
4             i think mi bf is cheating on me!!!   ...
Name: SentimentText, dtype: object

### Preparing Train Data

In [40]:
### Split data into train and test sets
# train_size=0.5 keeps half of the labelled tweets for fitting; the fixed
# random_state makes the shuffle repeatable.
from sklearn.model_selection import train_test_split

tweet_text = train_set.SentimentText
tweet_label = train_set.Sentiment

X_train, X_test, Y_train, Y_test = train_test_split(
    tweet_text,
    tweet_label,
    train_size=0.5,
    random_state=5,
)

In [41]:
import re

ndocs = 5000  # only the first `ndocs` training tweets are cleaned


def _scrub_tweet(raw):
    """Return `raw` with newlines flattened, non-letter characters removed and whitespace tidied."""
    flattened = re.sub(r"\n", " ", raw)                 # new lines to spaces
    letters = re.sub("[^a-zA-Z' ]+", '', flattened)     # keep only letters, apostrophes and spaces
    return re.sub(' +', ' ', letters).strip()           # collapse runs of spaces, trim both ends


X_train_cleaned = [_scrub_tweet(tweet) for tweet in X_train[:ndocs]]

print(len(X_train_cleaned))
X_train_cleaned[:2]

5000


['Bsimi thank you Brian',
 "cavsfanatic Fantastic now I want chocolate milk And there's no Nesquik in the house"]

In [42]:
# First ten cleaned training tweets
X_train_cleaned[:10]

['Bsimi thank you Brian',
 "cavsfanatic Fantastic now I want chocolate milk And there's no Nesquik in the house",
 'Suze I like the food angle to it Thanks',
 'CrazyCatLadie and cutiepie sorry had prior plans for today',
 "complianceweek CONGRATS I refollowed you several times as you got closer but guess I didn't win",
 "sigh ok where was I oh yesgoing to john's doughnut for coffee n light reading dty cleaners then the gymowww a sale",
 "AndreaDG oooh Marty's cracklins Yumo Haha I love that",
 'shoutout my mommy shes sick and it worries me please keep her in ur prayers',
 'bombDUH ugghhhh the worst',
 "TraceyMmm hahaha it's my idea my hashtag i stole it from a pub in Fortitude Valley NoUndiesSunday"]

In [43]:
# Last ten cleaned training tweets
X_train_cleaned[-10:]

['iremember how i was stunned by that txt msg gt still am sometimes ohhhh yea httptinyurlcomlmovyf',
 "NO MORE SICKNESS strike i say STRIKE ohhhh well 'nother day off school i guess",
 "ambermac well it's got to be better than twitterberry",
 'comefilljulia I feel some love But I have to go to work in a few minutes',
 'ConnieLeyva lol i dnt thnk so but el taquito sounds yum',
 'bubblythoughts Yup very happy But still have a thesis to finish writing SO excited to start working hows your morning going',
 'ClareBear I wonder if eckstavo will pee his pants in excitement Wanna take bets',
 "atchoo you go my girl that's the foremost reason why i love you re me being me",
 "caitlinlavergne they are going to destin I want to go to the beach so badbut I'd rather work",
 'none of the girls I know at Halmark are working My poor lunch will be warm by the time I eat it its gonna be warm']

In [44]:
import spacy
nlp = spacy.load('en')

# Run every cleaned tweet through the spaCy pipeline.
documents = [nlp(document) for document in X_train_cleaned]

# Rebuild each tweet as a space-joined string of lemmas, leaving out
# punctuation tokens and stop words.
tokenized_train_docs = [
    ' '.join(tok.lemma_ for tok in parsed if not (tok.is_punct or tok.is_stop))
    for parsed in documents
]

tokenized_train_docs[:2]

['Bsimi thank Brian',
 'cavsfanatic Fantastic want chocolate milk Nesquik house']

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level, unigram TF-IDF settings gathered in one place.
tfidf_options = dict(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
    analyzer=u'word',
    max_df=.5,
    min_df=1,
    max_features=None,
    vocabulary=None,
    binary=False,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary is learned from the training documents only; `vectorizer` is
# reused later to transform the test documents.
train_tfidf = vectorizer.fit_transform(tokenized_train_docs)
terms = vectorizer.get_feature_names()

### Preparing Test Data

In [49]:
# Clean the held-out tweets with the same regex steps applied to the training tweets.
# `re`, `ndocs`, `nlp` and `vectorizer` all come from the train-preparation cells above.
X_test_cleaned = []
for doc in X_test[:ndocs]:
    doc = re.sub(r"\n", " ", doc) # new lines to spaces
    doc = re.sub("[^a-zA-Z' ]+", '', doc) # rid of punctuation and numbers
    doc = re.sub(' +', ' ', doc) # stripping extra white space out
    doc = doc.strip() # stripping extra white space out
    X_test_cleaned.append(doc)

# Reuse the spaCy pipeline that was already loaded for the training documents
# instead of loading the model from disk a second time.
documents = [ nlp(document) for document in X_test_cleaned ]

# Lemmatise each tweet, leaving out punctuation tokens and stop words.
tokenized_test_docs = []
for text in documents:
    tokens = [token.lemma_
                    for token in text
                    if not token.is_punct
                    and not token.is_stop]
    doc_parsed = ' '.join(tokens)
    tokenized_test_docs.append(doc_parsed)

# tfidf transformation, based on train fitting
test_tfidf = vectorizer.transform(tokenized_test_docs)

In [50]:
# Both matrices come from the same fitted vectorizer, so their column counts should match.
print(train_tfidf.shape)
print(test_tfidf.shape)

(5000, 11732)
(5000, 11732)


### Setting Up a Keras Neural Network

In [51]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.layers import LSTM, Input, TimeDistributed
from keras.models import Model
from keras.optimizers import RMSprop

# Import the backend
from keras import backend as K

In [64]:
# Final feature matrices and one-hot label arrays fed to the network.

# Features: the TF-IDF matrices cast to float32.
X_Train = train_tfidf.astype('float32')
X_Test = test_tfidf.astype('float32')

# Labels: the same leading `ndocs` rows that were vectorized, one-hot encoded into 2 classes.
Y_Train = keras.utils.to_categorical(Y_train[:ndocs], 2)
Y_Test = keras.utils.to_categorical(Y_test[:ndocs], 2)

print(X_Train.shape[0], 'train samples')
print(X_Test.shape[0], 'test samples')

# On normalizing TF-IDF vectors: TF-IDF matrices are meant to be processed as is.
# No further normalization is needed, because two normalizations are already baked into the term itself.
# See: https://datascience.stackexchange.com/questions/33730/should-i-rescale-tfidf-features

5000 train samples
5000 test samples


In [56]:
# Number of TF-IDF feature columns; used as the width of the network's input layer.
input_shape = X_Train.shape[1]
input_shape

11732

In [67]:
# Fully connected classifier: TF-IDF features -> 512 -> 32 -> 2-way softmax,
# with 10% dropout after each hidden layer.
model = Sequential([
    Dense(512, activation='relu', input_shape=(input_shape,)),
    Dropout(0.1),
    Dense(32, activation='relu'),
    Dropout(0.1),
    Dense(2, activation='softmax'),
])

model.summary()

model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 512)               6007296   
_________________________________________________________________
dropout_9 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 32)                16416     
_________________________________________________________________
dropout_10 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 2)                 66        
Total params: 6,023,778
Trainable params: 6,023,778
Non-trainable params: 0
_________________________________________________________________


In [70]:
# Train for 10 epochs; the held-out set doubles as the per-epoch validation data
# and as the data for the final evaluation below.
history = model.fit(
    X_Train, Y_Train,
    validation_data=(X_Test, Y_Test),
    epochs=10,
    batch_size=128,
    verbose=1,
)

# evaluate() returns the loss followed by the compiled metric (accuracy).
score = model.evaluate(X_Test, Y_Test, verbose=0)
test_loss, test_accuracy = score
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Train on 5000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 3.1444558094024657
Test accuracy: 0.6417999863624573


### Recurrent Neural Network

We're starting from scratch! This time we use GloVe word embeddings to vectorize the text and run it through a recurrent neural network.

In [72]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
#from bs4 import BeautifulSoup
import sys
import os
# NOTE(review): keras was already imported in the first cell (which reported "Using TensorFlow backend."),
# so setting KERAS_BACKEND here presumably has no effect on this kernel — confirm, then remove it or
# move it before the first keras import.
os.environ['KERAS_BACKEND']='theano'
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
# NOTE(review): `%matplotlib inline` runs later in this same cell and presumably re-selects the inline
# backend, which would make this switch to 'agg' moot — confirm which backend is intended.
plt.switch_backend('agg')
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers
%matplotlib inline

In [73]:
# Re-read the training CSV so the embedding-based experiments start again from the raw frame.
train_set = pd.read_csv('../__DATA__/twitter-sentiment-analysis2/train.csv', encoding = 'ISO-8859-1' )

In [78]:
import re

ndocs = 10000  # only the first `ndocs` rows of the training frame are used

# The three clean-up substitutions, compiled once up front.
_newline_re = re.compile(r"\n")
_non_letter_re = re.compile("[^a-zA-Z' ]+")
_space_run_re = re.compile(' +')

docs_cleaned = []
for raw_tweet in train_set.SentimentText[:ndocs]:
    flattened = _newline_re.sub(" ", raw_tweet)          # new lines to spaces
    letters_only = _non_letter_re.sub('', flattened)     # rid of punctuation and numbers
    squeezed = _space_run_re.sub(' ', letters_only)      # collapse repeated spaces
    docs_cleaned.append(squeezed.strip())                # trim leading/trailing space

print(len(docs_cleaned))
docs_cleaned[:2]

10000


['is so sad for my APL friend', 'I missed the New Moon trailer']

In [99]:
# Predetermined Constants
MAX_SEQUENCE_LENGTH = 117
MAX_NB_WORDS = 100000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [80]:
# Learn a word -> integer index vocabulary from the cleaned tweets.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(docs_cleaned)
word_index = tokenizer.word_index

# Encode every tweet as a list of word indices.
sequences = tokenizer.texts_to_sequences(docs_cleaned)

print('Number of Unique Tokens',len(word_index))

Number of Unique Tokens 19620


In [83]:
max([len(seq) for seq in sequences]) # Longest encoded tweet; MAX_SEQUENCE_LENGTH is set to this value

117

In [100]:
# Pad every encoded tweet to MAX_SEQUENCE_LENGTH so the data forms one 2-D array.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# num_classes is pinned to 2 (as in the MLP section) so the label matrix keeps two
# columns even if the selected rows happened to contain a single sentiment value.
labels = to_categorical(np.asarray(train_set.Sentiment[:ndocs]), num_classes=2)
print('Shape of Data Tensor:', data.shape)
print('Shape of Label Tensor:', labels.shape)

# Shuffle rows with a private, seeded generator: the train/validation split is then
# identical on every run and the global NumPy random state is left untouched.
SPLIT_SEED = 5
indices = np.random.RandomState(SPLIT_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Split on an explicit boundary index. Slicing with `[:-nb_validation_samples]`
# would yield an EMPTY training set whenever the validation count rounds to 0.
split_at = data.shape[0] - nb_validation_samples

x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

Shape of Data Tensor: (10000, 117)
Shape of Label Tensor: (10000, 2)


In [87]:
# Build a word -> vector lookup from the pretrained GloVe Twitter file.
# Each line holds a token followed by its coefficients, separated by whitespace.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open('../__TRAINED_MODELS__/glove.twitter.27B/glove.twitter.27B.100d.txt',encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. a trailing newline at end of file): nothing to read.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Total %s word vectors in Glove 27B 100d.' % len(embeddings_index))

Total 1193514 word vectors in Glove 27B 100d.


In [101]:
# One row per tokenizer index plus one extra row, initialised with random values from np.random.random.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Overwrite the random row with the pretrained GloVe vector. Words missing from
        # the GloVe index keep their random initial values; they are NOT zeroed.
        embedding_matrix[i] = embedding_vector

In [102]:
# Embedding lookup sized to match embedding_matrix and initialised from it.
# trainable=True leaves the vectors free to be updated while the model is fitted.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

In [103]:
# Bidirectional LSTM classifier: padded word indices -> embeddings -> BiLSTM -> 2-way softmax.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# An LSTM with 100 units, wrapped so the sequence is read in both directions.
recurrent_encoder = Bidirectional(LSTM(100))
l_lstm = recurrent_encoder(embedded_sequences)

softmax_head = Dense(2, activation='softmax')
preds = softmax_head(l_lstm)

model = Model(inputs=sequence_input, outputs=preds)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

print("Bidirectional LSTM")
model.summary()

Bidirectional LSTM
Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 117)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 117, 100)          1962100   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200)               160800    
_________________________________________________________________
dense_17 (Dense)             (None, 2)                 402       
Total params: 2,123,302
Trainable params: 2,123,302
Non-trainable params: 0
_________________________________________________________________


In [104]:
# Save the model to model_rnn.hdf5 only on epochs where the monitored 'val_acc' improves.
cp = ModelCheckpoint('model_rnn.hdf5',
                     monitor='val_acc',
                     verbose=1
                     ,save_best_only=True)
# NOTE(review): batch_size=2 means one weight update per two samples — presumably slow and noisy;
# confirm that this is intentional.
history = model.fit(x_train, y_train,
                    validation_data=(x_val, y_val),
                    epochs=10, batch_size=2,
                    callbacks=[cp])

Train on 8000 samples, validate on 2000 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.78350, saving model to model_rnn.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.78350 to 0.78800, saving model to model_rnn.hdf5
Epoch 3/10

Epoch 00003: val_acc improved from 0.78800 to 0.79550, saving model to model_rnn.hdf5
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.79550
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.79550
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.79550
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.79550
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.79550
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.79550
Epoch 10/10

Epoch 00010: val_acc did not improve from 0.79550


Using a recurrent neural network and GloVe word embeddings, the best validation accuracy for predicting tweet sentiment increased to 79.6%. This can't be compared apples to apples with the MLP, though, because the recurrent neural net used a different train/test split from the one used for the MLP.

### Convolutional Neural Network
Using the same word embeddings and the same train/validation split as the recurrent network, we will train a convolutional neural network to create a classifier.

In [105]:
# A new Embedding layer for the CNN, built with the same arguments as the one used for the LSTM model.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

In [117]:
# 1-D convolutional classifier: embeddings -> three Conv1D/MaxPooling1D stages -> dense -> softmax.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Each stage is a 128-filter ReLU convolution followed by max pooling,
# listed as (kernel size, pool size).
conv_pool_stages = [(2, 2), (5, 5), (5, 5)]
feature_maps = embedded_sequences
for kernel_size, pool_size in conv_pool_stages:
    feature_maps = Conv1D(128, kernel_size, activation='relu')(feature_maps)
    feature_maps = MaxPooling1D(pool_size)(feature_maps)
# Note: the last stage is an ordinary MaxPooling1D with pool size 5, not a global max-pooling layer.

l_flat = Flatten()(feature_maps)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(2, activation='softmax')(l_dense)

model = Model(inputs=sequence_input, outputs=preds)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

print("Simplified convolutional neural network")
model.summary()

# Checkpoint that writes model_cnn.hdf5 only when 'val_acc' improves.
cp = ModelCheckpoint(
    'model_cnn.hdf5',
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
)

Simplified convolutional neural network
Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_14 (InputLayer)        (None, 117)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 117, 100)          1962100   
_________________________________________________________________
conv1d_34 (Conv1D)           (None, 116, 128)          25728     
_________________________________________________________________
max_pooling1d_29 (MaxPooling (None, 58, 128)           0         
_________________________________________________________________
conv1d_35 (Conv1D)           (None, 54, 128)           82048     
_________________________________________________________________
max_pooling1d_30 (MaxPooling (None, 10, 128)           0         
_________________________________________________________________
conv1d_36 (Conv1D) 

In [118]:
# Train the CNN for 5 epochs on the same x_train/x_val arrays, checkpointing through `cp`.
history=model.fit(x_train, y_train, 
                  validation_data=(x_val, y_val),
                  epochs=5, batch_size=2,callbacks=[cp])

Train on 8000 samples, validate on 2000 samples
Epoch 1/5

Epoch 00001: val_acc improved from -inf to 0.59000, saving model to model_cnn.hdf5
Epoch 2/5

Epoch 00002: val_acc improved from 0.59000 to 0.60200, saving model to model_cnn.hdf5
Epoch 3/5

Epoch 00003: val_acc did not improve from 0.60200
Epoch 4/5

Epoch 00004: val_acc did not improve from 0.60200
Epoch 5/5

Epoch 00005: val_acc did not improve from 0.60200


The convolutional neural network did not improve the accuracy (its best validation accuracy was 60.2%), most likely because I don't yet know how to use one properly or how to tune its parameters.