In [1]:
import nltk
# Make sure the NLTK 'punkt' tokenizer data is available locally. The call
# returns True and skips the download when the package is already up to date.
# NOTE(review): presumably required by the `word_tokenize` import further down,
# which is not actually called in the visible cells - confirm it is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/Dhrubo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
from tqdm import tqdm
import pickle
from keras.models import load_model
from sklearn.metrics import accuracy_score
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import re
from nltk.tokenize import word_tokenize

# Contraction -> expansion lookup table, keyed by lowercase tokens.
#
# Review fixes relative to the previous revision:
#   * "we'll" expanded to " will" (pronoun dropped); it is now "we will".
#   * "i've" expanded to "I have"; it is now lowercase like every other value.
#   * "i'd" was listed twice ("i would", then "i had"). In a dict literal the
#     last entry wins, so the effective value "i had" is kept and the dead
#     entry removed.
#   * "didn't" was listed twice with the same value; the duplicate is removed.
# NOTE(review): the "we'll" / "i've" fixes change the cleaned text. If the
# pickled tokenizer was fitted with the old table, apply the same table in the
# training pipeline to keep preprocessing consistent.
appos = {
  "aren't" : "are not", "can't" : "cannot", "couldn't" : "could not", "didn't" : "did not",
  "doesn't" : "does not", "don't" : "do not", "hadn't" : "had not", "hasn't" : "has not",
  "haven't" : "have not", "he'd" : "he would", "he'll" : "he will", "he's" : "he is",
  "i'd" : "i had", "i'll" : "i will", "i'm" : "i am", "isn't" : "is not", "it's" : "it is", "it'll" : "it will",
  "i've" : "i have", "let's" : "let us", "mightn't" : "might not", "mustn't" : "must not", "shan't" : "shall not",
  "she'd" : "she would", "she'll" : "she will", "she's" : "she is", "shouldn't" : "should not", "that's" : "that is",
  "there's" : "there is", "they'd" : "they would", "they'll" : "they will", "they're" : "they are", "they've" : "they have",
  "we'd" : "we would", "we're" : "we are", "weren't" : "were not", "we've" : "we have", "what'll" : "what will",
  "what're" : "what are", "what's" : "what is", "what've" : "what have", "where's" : "where is", "who'd" : "who would",
  "who'll" : "who will", "who're" : "who are", "who's" : "who is", "who've" : "who have", "won't" : "will not",
  "wouldn't" : "would not", "you'd" : "you would", "you'll" : "you will", "you're" : "you are", "you've" : "you have",
  "'re" : " are", "wasn't" : "was not", "we'll" : "we will",
}

def clean_text(text):
  """Normalize a raw tweet into lowercase, space-separated word tokens.

  Steps, in order:
    1. strip surrounding whitespace and lowercase the text;
    2. expand contractions by exact whole-token lookup in ``appos``;
    3. replace @mentions, #hashtags, every character that is neither a word
       character nor whitespace, ``scheme://...`` URLs and digit runs with a
       space;
    4. collapse all whitespace runs into single spaces.

  Args:
    text: the raw tweet as a ``str``.

  Returns:
    The cleaned string (possibly empty).
  """
  # Remove surrounding whitespace and make the string lowercase
  text = text.strip().lower()
  # Negation / contraction handling. Tokens come from a plain whitespace split,
  # so a contraction with punctuation attached (e.g. "don't!") is not a key of
  # `appos` and is left unexpanded; the regex below then turns it into "don t".
  text = " ".join(appos.get(word, word) for word in text.split())
  # Raw strings keep the regex escapes (\w, \d, \S) out of Python's own string
  # escape processing; the assembled pattern is unchanged.
  pattern = '|'.join([
    r'(@(\w+))',          # user mentions (@username)
    r'(#(\w+))',          # hashtags (#somehashtag)
    r'([^\w\s])',         # any non-word, non-space char (punctuation, symbols, emojis)
    r'(\w+:\/\/\S+)',     # urls (https://google.com)
    r'(\d+)',             # numbers
  ])
  # Substitute matches with a space, then squeeze whitespace back down.
  text = ' '.join(re.sub(pattern, ' ', text).split())
  return text


def get_accuracy(data_path, tokenizer_path, model_path,
                 maxlen=35, threshold=0.5, batch_size=2048):
  """Score a saved model on an annotated tweet CSV and return accuracy in percent.

  The CSV must have a 'Tweet' column (raw text) and a 'Sentiment' column
  (ground-truth labels). Tweets are cleaned with ``clean_text``, vectorized
  with the pickled tokenizer, padded, and fed to the loaded model.

  Args:
    data_path: path to the annotated CSV file.
    tokenizer_path: path to the pickled tokenizer used to vectorize the text.
    model_path: path to the saved Keras model.
    maxlen: sequence length passed to ``pad_sequences``.
      NOTE(review): presumably must equal the length used at training time.
    threshold: a predicted value >= threshold is labelled 1, otherwise 0.
    batch_size: batch size passed to ``model.predict``.

  Returns:
    ``accuracy_score(labels, predictions) * 100``.
  """
  data = pd.read_csv(data_path)
  ho_x_test = data['Tweet'].map(clean_text)
  ho_y_test = data['Sentiment']  # assumes 0/1 labels - TODO confirm against the CSVs

  # Load the tokenizer used to vectorize the text we want predictions for.
  # NOTE(security): pickle.load can execute arbitrary code; only pass tokenizer
  # files that come from a trusted source.
  with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

  # Load the saved model
  rnn_model = load_model(model_path)

  ho_test_sequences = tokenizer.texts_to_sequences(ho_x_test)
  padded_ho_test_sequences = pad_sequences(ho_test_sequences, maxlen=maxlen)
  ho_y_prob = rnn_model.predict(padded_ho_test_sequences, verbose=1, batch_size=batch_size)

  # Going through a one-column DataFrame keeps the original requirement that
  # the model emits exactly one value per row; the thresholding is vectorized.
  ho_y_prob = pd.DataFrame(ho_y_prob, columns=['prediction'])['prediction']
  ho_y_pred = (ho_y_prob >= threshold).astype(int)
  return accuracy_score(ho_y_test, ho_y_pred)*100



In [11]:
# RNN-CNN epoch-1: score the 01-0.8362 checkpoint on both annotated CSV files.
print(
    'Hydro Ottawa dataset accuracy: ',
    get_accuracy(
        data_path='../data/HydroOttawaAnnotatedData.csv',
        tokenizer_path='tokenizer-rnn-cnn-w2v.pickle',
        model_path='./models/rnn-cnn-w2v-model-01-0.8362.hdf5',
    ),
)

print(
    'Manually created dataset accuracy: ',
    get_accuracy(
        data_path='../data/CompiledTweets.csv',
        tokenizer_path='tokenizer-rnn-cnn-w2v.pickle',
        model_path='./models/rnn-cnn-w2v-model-01-0.8362.hdf5',
    ),
)

Hydro Ottawa dataset accuracy:  66.80672268907563
Manually created dataset accuracy:  88.71331828442437


In [12]:
# RNN-CNN epoch-2: score the 02-0.8399 checkpoint on both annotated CSV files.
print(
    'Hydro Ottawa dataset accuracy: ',
    get_accuracy(
        data_path='../data/HydroOttawaAnnotatedData.csv',
        tokenizer_path='tokenizer-rnn-cnn-w2v.pickle',
        model_path='./models/rnn-cnn-w2v-model-02-0.8399.hdf5',
    ),
)

print(
    'Manually created dataset accuracy: ',
    get_accuracy(
        data_path='../data/CompiledTweets.csv',
        tokenizer_path='tokenizer-rnn-cnn-w2v.pickle',
        model_path='./models/rnn-cnn-w2v-model-02-0.8399.hdf5',
    ),
)

Hydro Ottawa dataset accuracy:  71.42857142857143
Manually created dataset accuracy:  90.51918735891647


In [13]:
# RNN-CNN epoch-3: score the 03-0.8419 checkpoint on both annotated CSV files.
print(
    'Hydro Ottawa dataset accuracy: ',
    get_accuracy(
        data_path='../data/HydroOttawaAnnotatedData.csv',
        tokenizer_path='tokenizer-rnn-cnn-w2v.pickle',
        model_path='./models/rnn-cnn-w2v-model-03-0.8419.hdf5',
    ),
)

print(
    'Manually created dataset accuracy: ',
    get_accuracy(
        data_path='../data/CompiledTweets.csv',
        tokenizer_path='tokenizer-rnn-cnn-w2v.pickle',
        model_path='./models/rnn-cnn-w2v-model-03-0.8419.hdf5',
    ),
)

Hydro Ottawa dataset accuracy:  66.80672268907563
Manually created dataset accuracy:  91.87358916478556


In [10]:
# RNN-CNN epoch-4: score the 04-0.8421 checkpoint on both annotated CSV files.
print(
    'Hydro Ottawa dataset accuracy: ',
    get_accuracy(
        data_path='../data/HydroOttawaAnnotatedData.csv',
        tokenizer_path='tokenizer-rnn-cnn-w2v.pickle',
        model_path='./models/rnn-cnn-w2v-model-04-0.8421.hdf5',
    ),
)

print(
    'Manually created dataset accuracy: ',
    get_accuracy(
        data_path='../data/CompiledTweets.csv',
        tokenizer_path='tokenizer-rnn-cnn-w2v.pickle',
        model_path='./models/rnn-cnn-w2v-model-04-0.8421.hdf5',
    ),
)

Hydro Ottawa dataset accuracy:  71.00840336134453
Manually created dataset accuracy:  88.48758465011286
