In [1]:
import pandas as pd

data = pd.read_csv('./data/data.csv', error_bad_lines=False)
data.head()

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...


In [4]:
import re
from nltk.tokenize import word_tokenize

def clean_tweet(tweet):
    tweet = ' '.join(re.sub('(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)', ' ', tweet).split())
    tweet = tweet.strip().lower()
    return ' '.join(word_tokenize(tweet))

data['CleanText'] = data['SentimentText'].map(lambda t: clean_tweet(t))
    
data.head()

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText,CleanText
0,1,0,Sentiment140,is so sad for my APL frie...,is so sad for my apl friend
1,2,0,Sentiment140,I missed the New Moon trail...,i missed the new moon trailer
2,3,1,Sentiment140,omg its already 7:30 :O,omg its already 7 30 o
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...,omgaga im sooo im gunna cry i ve been at this ...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...,i think mi bf is cheating on me t t


In [5]:
# Saving for future use
data.to_csv('./data/clean_data.csv', index=False)

In [11]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data['CleanText'], 
                                                    data['Sentiment'], 
                                                    test_size=0.1, 
                                                    random_state=42,
                                                    stratify=data['Sentiment'])

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(1420750,) (157862,) (1420750,) (157862,)


In [32]:
# to be used for later
pd.DataFrame(y_test).to_csv('./predictions/y_true.csv', index=False, encoding='utf-8')

### Keras

#### Preprocessing

In [9]:
from keras.preprocessing.text import Tokenizer

MAX_NB_WORDS = 80000
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

tokenizer.fit_on_texts(data['CleanText'])

Using TensorFlow backend.


In [15]:
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

In [16]:
from keras.preprocessing.sequence import pad_sequences

MAX_LENGTH = 35
padded_train_sequences = pad_sequences(train_sequences, maxlen=MAX_LENGTH)
padded_test_sequences = pad_sequences(test_sequences, maxlen=MAX_LENGTH)

In [17]:
padded_train_sequences.shape

(1420750, 35)

In [30]:
import numpy as np 
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, GRU, Bidirectional
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate

def get_simple_rnn_model():
    embedding_dim = 300
    embedding_matrix = np.random.random((MAX_NB_WORDS, embedding_dim))
    
    inp = Input(shape=(MAX_LENGTH, ))
    x = Embedding(input_dim=MAX_NB_WORDS, output_dim=embedding_dim, input_length=MAX_LENGTH, 
                  weights=[embedding_matrix], trainable=True)(inp)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(GRU(100, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(1, activation="sigmoid")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

rnn_simple_model = get_simple_rnn_model()

In [34]:
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

filepath = "./models/weights-improvement-{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

batch_size = 256
epochs = 2

history = rnn_simple_model.fit(x=padded_train_sequences, 
                    y=y_train, 
                    validation_data=(padded_test_sequences, y_test), 
                    batch_size=batch_size, 
                    callbacks=[checkpoint], 
                    epochs=epochs, 
                    verbose=1)


Train on 1420750 samples, validate on 157862 samples
Epoch 1/2

Epoch 00001: val_acc improved from -inf to 0.82929, saving model to weights-improvement-01-0.8293.hdf5
Epoch 2/2

Epoch 00002: val_acc did not improve from 0.82929


OSError: Unable to open file (unable to open file: name = 'weights-improvement-01-0.8262.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [36]:
best_rnn_simple_model = load_model('./models/weights-improvement-01-0.8293.hdf5')

y_pred_rnn_simple = best_rnn_simple_model.predict(padded_test_sequences, verbose=1, batch_size=2048)

y_pred_rnn_simple = pd.DataFrame(y_pred_rnn_simple, columns=['prediction'])
y_pred_rnn_simple['prediction'] = y_pred_rnn_simple['prediction'].map(lambda p: 1 if p >= 0.5 else 0)
y_pred_rnn_simple.to_csv('./predictions/y_pred_rnn_simple.csv', index=False)



In [38]:
from sklearn.metrics import accuracy_score

y_pred_rnn_simple = pd.read_csv('./predictions/y_pred_rnn_simple.csv')
print(accuracy_score(y_test, y_pred_rnn_simple))

0.829287605630234


In [42]:
data = pd.read_csv('./data/HydroOttawaAnnotatedData.csv')

# ho = hydro ottawa
ho_x_test = data['Tweet']
ho_y_test = data['Sentiment']

In [43]:
ho_test_sequences = tokenizer.texts_to_sequences(ho_x_test)
padded_ho_test_sequences = pad_sequences(ho_test_sequences, maxlen=MAX_LENGTH)

y_pred_ho = best_rnn_simple_model.predict(padded_ho_test_sequences, verbose=1, batch_size=2048)

y_pred_ho = pd.DataFrame(y_pred_ho, columns=['prediction'])
y_pred_ho['prediction'] = y_pred_ho['prediction'].map(lambda p: 1 if p >= 0.5 else 0)
print(accuracy_score(ho_y_test, y_pred_ho))

0.7310924369747899


In [44]:
# saving tokenizer
import pickle

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)