In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [4]:
import pandas as pd

data = pd.read_csv('/content/drive/My Drive/SentimentAnalysis/rnn/data/28500Hersh-Stanford-Airline.csv', encoding = "ISO-8859-1", error_bad_lines=False,
                        usecols=[0,1])
data.columns = ['SentimentText', 'Sentiment']

# remove neutral for now
data = data[(data['Sentiment'] != 2)]

# Replace 4 with 1 (for positive value)
data['Sentiment'] = data['Sentiment'].map(lambda s: 1 if s == 4 else 0)

data.head(20)

Unnamed: 0,SentimentText,Sentiment
1,"Its a restricted area, and inhospitablein ce...",0
3,The M.E.K. had its beginnings as a Marxist-Isl...,0
5,"But, within a few years, the group was waging ...",0
8,The M.E.K.s ties with Western intelligence de...,0
9,Funds were covertly passed to a number of diss...,0
12,"Despite the growing ties, and a much-intensifi...",0
19,"He also was told, he said, that the men doing ...",0
21,It was the ad-hoc training that provoked the w...,0
22,I told one of the guys who called me that the...,0
23,"The Iranians are very, very good at counterint...",0


In [5]:
import pandas as pd
import re
from tqdm import tqdm

appos = {
  "aren't" : "are not", "can't" : "cannot", "couldn't" : "could not", "didn't" : "did not",
  "doesn't" : "does not", "don't" : "do not", "hadn't" : "had not", "hasn't" : "has not",
  "haven't" : "have not", "he'd" : "he would", "he'll" : "he will", "he's" : "he is", "i'd" : "i would",
  "i'd" : "i had", "i'll" : "i will", "i'm" : "i am", "isn't" : "is not", "it's" : "it is", "it'll":"it will",
  "i've" : "I have", "let's" : "let us", "mightn't" : "might not", "mustn't" : "must not", "shan't" : "shall not",
  "she'd" : "she would", "she'll" : "she will", "she's" : "she is", "shouldn't" : "should not", "that's" : "that is",
  "there's" : "there is", "they'd" : "they would", "they'll" : "they will", "they're" : "they are", "they've" : "they have",
  "we'd" : "we would", "we're" : "we are", "weren't" : "were not", "we've" : "we have", "what'll" : "what will",
  "what're" : "what are", "what's" : "what is", "what've" : "what have", "where's" : "where is", "who'd" : "who would",
  "who'll" : "who will", "who're" : "who are", "who's" : "who is", "who've" : "who have", "won't" : "will not",
  "wouldn't" : "would not", "you'd" : "you would", "you'll" : "you will", "you're" : "you are", "you've" : "you have",
  "'re": " are", "wasn't": "was not", "we'll":" will", "didn't": "did not"
}

def clean_text(text):
  # Remove whitespaces and make strings lowercase
  text = text.strip().lower()
  words = text.split()
  # Nagation handling
  reformed = [appos[word] if word in appos else word for word in words]
  text = " ".join(reformed)
  pattern = '(@(\w+))'                # usermention (@username)
  pattern += '|(#(\w+))'              # hashtags (#somehashtag)
  pattern += '|([^\w\s])'             # emojis 😀
  pattern += '|(\\w+:\\/\\/\\S+)'     # urls (https://google.com)
  pattern += '|(\d+)'                 # numbers
  text = ' '.join(re.sub(pattern, ' ', text).split())
  return text

tqdm.pandas()
#data = pd.read_csv('/content/drive/My Drive/SentimentAnalysis/rnn/data/data.csv', error_bad_lines=False)
data['CleanText'] = data['SentimentText'].progress_apply(lambda t: clean_text(t))
data.head()

100%|██████████| 19533/19533 [00:00<00:00, 22834.85it/s]


Unnamed: 0,SentimentText,Sentiment,CleanText
1,"Its a restricted area, and inhospitablein ce...",0,it s a restricted area and inhospitable in cer...
3,The M.E.K. had its beginnings as a Marxist-Isl...,0,the m e k had its beginnings as a marxist isla...
5,"But, within a few years, the group was waging ...",0,but within a few years the group was waging a ...
8,The M.E.K.s ties with Western intelligence de...,0,the m e k s ties with western intelligence dee...
9,Funds were covertly passed to a number of diss...,0,funds were covertly passed to a number of diss...


In [6]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data['CleanText'], 
                                                    data['Sentiment'], 
                                                    test_size=0.1, 
                                                    random_state=42,
                                                    stratify=data['Sentiment'])

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(17579,) (1954,) (17579,) (1954,)


In [0]:
import numpy as np


d = {'CleanText': x_test, 'Sentiment': y_test}
df = pd.DataFrame(d)
df.head(20)
df.to_csv('/content/drive/My Drive/SentimentAnalysis/rnn/data/28500HSA-test.csv', index=False);

In [0]:
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils

In [0]:
def labelize_tweets_ug(tweets, label):
  result = []
  prefix = label
  for i, t in zip(tweets.index, tweets):
    result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
  return result

all_x = pd.concat([x_train, x_test])
all_x_w2v = labelize_tweets_ug(all_x, 'all')

In [10]:
cores = multiprocessing.cpu_count()
# Continuous Bag Of Words
model_ug_cbow = Word2Vec(sg=0, size=100, negative=5, window=2, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_cbow.build_vocab([x.words for x in tqdm(all_x_w2v)])

100%|██████████| 19533/19533 [00:00<00:00, 1772129.96it/s]


In [11]:
%%time
for epoch in range(30):
  model_ug_cbow.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
  model_ug_cbow.alpha -= 0.002
  model_ug_cbow.min_alpha = model_ug_cbow.alpha

100%|██████████| 19533/19533 [00:00<00:00, 1688875.28it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1223435.23it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1531552.54it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1904356.94it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1255148.99it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1357401.75it/s]
100%|██████████| 19533/19533 [00:00<00:00, 2096347.07it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1182844.23it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1939797.33it/s]
100%|██████████| 19533/19533 [00:00<00:00, 2119998.45it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1269699.19it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1439493.62it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1921282.77it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1323393.80it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1295007.27it/s]
100%|██████████| 19533/19533 [00:00<00:00, 2104099.96it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1594537.56it/

CPU times: user 28.8 s, sys: 235 ms, total: 29 s
Wall time: 15.9 s


In [12]:
# Skip Gram
model_ug_sg = Word2Vec(sg=1, size=100, negative=5, window=2, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_sg.build_vocab([x.words for x in tqdm(all_x_w2v)])

100%|██████████| 19533/19533 [00:00<00:00, 1270742.96it/s]


In [13]:
%%time
for epoch in range(30):
  model_ug_sg.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
  model_ug_sg.alpha -= 0.002
  model_ug_sg.min_alpha = model_ug_sg.alpha

100%|██████████| 19533/19533 [00:00<00:00, 1153500.04it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1167288.92it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1977631.50it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1688005.36it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1271157.00it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1329708.67it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1251410.46it/s]
100%|██████████| 19533/19533 [00:00<00:00, 2149985.30it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1266401.93it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1428325.81it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1946664.92it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1310879.39it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1931519.71it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1924125.51it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1395054.06it/s]
100%|██████████| 19533/19533 [00:00<00:00, 1284006.83it/s]
100%|██████████| 19533/19533 [00:00<00:00, 2195501.66it/

CPU times: user 52.6 s, sys: 241 ms, total: 52.9 s
Wall time: 28 s


In [0]:
model_ug_cbow.save('/content/drive/My Drive/SentimentAnalysis/rnn/w2vmodels/w2v_model_ug_cbow.word2vec')
model_ug_sg.save('/content/drive/My Drive/SentimentAnalysis/rnn/w2vmodels/w2v_model_ug_sg.word2vec')

In [15]:
import numpy as np

embeddings_index = {}
for w in model_ug_cbow.wv.vocab.keys():
    embeddings_index[w] = np.append(model_ug_cbow.wv[w],model_ug_sg.wv[w])
print('Found %s word vectors.' % len(embeddings_index))

Found 8021 word vectors.


In [16]:
from keras.preprocessing.text import Tokenizer

MAX_NB_WORDS = 80000
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(data['CleanText'])

Using TensorFlow backend.


In [0]:
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

In [0]:
# saving tokenizer
import pickle

with open('/content/drive/My Drive/SentimentAnalysis/rnn/tokenizers/tokenizer-rnn-cnn-w2v-feb-27.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
from keras.preprocessing.sequence import pad_sequences

MAX_LENGTH = 35
padded_train_sequences = pad_sequences(train_sequences, maxlen=MAX_LENGTH)
padded_test_sequences = pad_sequences(test_sequences, maxlen=MAX_LENGTH)
padded_train_sequences.shape

(17579, 35)

In [0]:
embed_size = 200
# maximum number of words kept after tokenization based on their word frequency
MAX_NB_WORDS = 80000

num_words = MAX_NB_WORDS
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in tokenizer.word_index.items():
  if i >= num_words:
    continue
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

In [0]:
import numpy as np 
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, GRU, Bidirectional
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, Conv1D

def get_rnn_cnn_model():
  embedding_dim = 200
  inp = Input(shape=(MAX_LENGTH, ))
  x = Embedding(MAX_NB_WORDS, embedding_dim, weights=[embedding_matrix], input_length=MAX_LENGTH, trainable=True)(inp)
  x = SpatialDropout1D(0.3)(x)
  x = Bidirectional(GRU(100, return_sequences=True))(x)
  x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)
  avg_pool = GlobalAveragePooling1D()(x)
  max_pool = GlobalMaxPooling1D()(x)
  conc = concatenate([avg_pool, max_pool])
  outp = Dense(1, activation="sigmoid")(conc)

  model = Model(inputs=inp, outputs=outp)
  model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
  return model

In [22]:
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

rnn_cnn_model = get_rnn_cnn_model()

filepath="/content/drive/My Drive/SentimentAnalysis/rnn/models/rnn-cnn-w2v-model-feb-27-{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

batch_size = 256
epochs = 4

history = rnn_cnn_model.fit(x=padded_train_sequences, 
                    y=y_train, 
                    validation_data=(padded_test_sequences, y_test), 
                    batch_size=batch_size, 
                    callbacks=[checkpoint], 
                    epochs=epochs, 
                    verbose=1)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 17579 samples, validate on 1954 samples
Epoch 1/4

Epoch 00001: val_acc improved from -inf to 0.88229, saving model to /content/drive/My Drive/SentimentAnalysis/rnn/models/rnn-cnn-w2v-model-feb-27-01-0.8823.hdf5
Epoch 2/4

Epoch 00002: val_acc improved from 0.88229 to 0.94012, saving model to /content/drive/My Drive/SentimentAnalysis/rnn/models/rnn-cnn-w2v-model-feb-27-02-0.9401.hdf5
Epoch 3/4

Epoch 00003: val_acc improved from 0.94012 to 0.95599, saving model to /content/drive/My Drive/SentimentAnalysis/rnn/models/rnn-cnn-w2v-model-feb-27-03-0.9560.hdf5
Epoch 4/4

Epoch 00004: val_acc improved from 0.95599 to 0.95701, saving model to /content/drive/My Drive/SentimentA

In [23]:
data.head()

Unnamed: 0,SentimentText,Sentiment,CleanText
1,"Its a restricted area, and inhospitablein ce...",0,it s a restricted area and inhospitable in cer...
3,The M.E.K. had its beginnings as a Marxist-Isl...,0,the m e k had its beginnings as a marxist isla...
5,"But, within a few years, the group was waging ...",0,but within a few years the group was waging a ...
8,The M.E.K.s ties with Western intelligence de...,0,the m e k s ties with western intelligence dee...
9,Funds were covertly passed to a number of diss...,0,funds were covertly passed to a number of diss...
