In [1]:
from __future__ import division, print_function
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string

In [2]:
# os.chdir('C:\Leandro\L_Programming\L_Jupyter\RNN_ML_CLASS\IMBD.csv')

data = pd.read_csv('IMDB.csv')

In [3]:
data.columns
data.shape

(50000, 2)

In [4]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
data['sentiment'] = data['sentiment'].astype('category').cat.codes

In [6]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [7]:
pos = []
neg = []
for l in data.sentiment:
    if l == 0:
        pos.append(0)
        neg.append(1)
    elif l == 1:
        pos.append(1)
        neg.append(0)

In [8]:
data['Pos']= pos
data['Neg']= neg

In [9]:
data

Unnamed: 0,review,sentiment,Pos,Neg
0,One of the other reviewers has mentioned that ...,1,1,0
1,A wonderful little production. <br /><br />The...,1,1,0
2,I thought this was a wonderful way to spend ti...,1,1,0
3,Basically there's a family where a little boy ...,0,0,1
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,1,0
...,...,...,...,...
49995,I thought this movie did a down right good job...,1,1,0
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,0,1
49997,I am a Catholic taught in parochial elementary...,0,0,1
49998,I'm going to have to disagree with the previou...,0,0,1


In [10]:
def remove_html(text):
    cleantext=re.sub('<[^<]+?>', '', text)
    return cleantext

def remove_punct(text):
    text=text.lower()
    text=re.sub('\[.*?\]', '', text)
    text=re.sub('[%s]'%re.escape(string.punctuation), '', text)
    text=re.sub('\w*\d\w*', '', text)
    return text


def clean_text2(text):
    text=re.sub('[''"",,,]', '', text)
    text=re.sub('\n', '', text)
    return text

cleaned_html=lambda x:clean_html(x)
cleaned1=lambda x:clean_text1(x)
cleaned2=lambda x:clean_text2(x)

data['review_clean'] = data['review'].apply(lambda x: remove_html(x))
data['review_clean'] = data['review_clean'].apply(lambda x: remove_punct(x))
data['review_clean'] = data['review_clean'].apply(lambda x: clean_text2(x))

In [11]:
data

Unnamed: 0,review,sentiment,Pos,Neg,review_clean
0,One of the other reviewers has mentioned that ...,1,1,0,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,1,1,0,a wonderful little production the filming tech...
2,I thought this was a wonderful way to spend ti...,1,1,0,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,0,0,1,basically theres a family where a little boy j...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,1,0,petter matteis love in the time of money is a ...
...,...,...,...,...,...
49995,I thought this movie did a down right good job...,1,1,0,i thought this movie did a down right good job...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,0,1,bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,0,0,1,i am a catholic taught in parochial elementary...
49998,I'm going to have to disagree with the previou...,0,0,1,im going to have to disagree with the previous...


In [12]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shado\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
from nltk import word_tokenize , WordNetLemmatizer
tokens = [word_tokenize(sen) for sen in data.review_clean]
tokens

[['one',
  'of',
  'the',
  'other',
  'reviewers',
  'has',
  'mentioned',
  'that',
  'after',
  'watching',
  'just',
  'oz',
  'episode',
  'youll',
  'be',
  'hooked',
  'they',
  'are',
  'right',
  'as',
  'this',
  'is',
  'exactly',
  'what',
  'happened',
  'with',
  'methe',
  'first',
  'thing',
  'that',
  'struck',
  'me',
  'about',
  'oz',
  'was',
  'its',
  'brutality',
  'and',
  'unflinching',
  'scenes',
  'of',
  'violence',
  'which',
  'set',
  'in',
  'right',
  'from',
  'the',
  'word',
  'go',
  'trust',
  'me',
  'this',
  'is',
  'not',
  'a',
  'show',
  'for',
  'the',
  'faint',
  'hearted',
  'or',
  'timid',
  'this',
  'show',
  'pulls',
  'no',
  'punches',
  'with',
  'regards',
  'to',
  'drugs',
  'sex',
  'or',
  'violence',
  'its',
  'is',
  'hardcore',
  'in',
  'the',
  'classic',
  'use',
  'of',
  'the',
  'wordit',
  'is',
  'called',
  'oz',
  'as',
  'that',
  'is',
  'the',
  'nickname',
  'given',
  'to',
  'the',
  'oswald',
  'maxim

In [14]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shado\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
stoplist = stopwords.words('english')

In [16]:
def remove_stop_words(tokens):
    return [word for word in tokens if word not in stoplist]

In [17]:
filtered_words = [remove_stop_words(sen) for sen in tokens]
filtered_words

[['one',
  'reviewers',
  'mentioned',
  'watching',
  'oz',
  'episode',
  'youll',
  'hooked',
  'right',
  'exactly',
  'happened',
  'methe',
  'first',
  'thing',
  'struck',
  'oz',
  'brutality',
  'unflinching',
  'scenes',
  'violence',
  'set',
  'right',
  'word',
  'go',
  'trust',
  'show',
  'faint',
  'hearted',
  'timid',
  'show',
  'pulls',
  'punches',
  'regards',
  'drugs',
  'sex',
  'violence',
  'hardcore',
  'classic',
  'use',
  'wordit',
  'called',
  'oz',
  'nickname',
  'given',
  'oswald',
  'maximum',
  'security',
  'state',
  'penitentary',
  'focuses',
  'mainly',
  'emerald',
  'city',
  'experimental',
  'section',
  'prison',
  'cells',
  'glass',
  'fronts',
  'face',
  'inwards',
  'privacy',
  'high',
  'agenda',
  'em',
  'city',
  'home',
  'manyaryans',
  'muslims',
  'gangstas',
  'latinos',
  'christians',
  'italians',
  'irish',
  'moreso',
  'scuffles',
  'death',
  'stares',
  'dodgy',
  'dealings',
  'shady',
  'agreements',
  'never',

In [18]:
results = [' '.join(sen) for sen in filtered_words]

In [19]:
data['Text_Final'] = results

In [20]:
data['tokens'] = filtered_words

In [21]:
data.columns

Index(['review', 'sentiment', 'Pos', 'Neg', 'review_clean', 'Text_Final',
       'tokens'],
      dtype='object')

In [22]:
data = data[['Text_Final' , 'tokens' , 'sentiment' , 'Pos' , 'Neg']]

In [23]:
data[:4]

Unnamed: 0,Text_Final,tokens,sentiment,Pos,Neg
0,one reviewers mentioned watching oz episode yo...,"[one, reviewers, mentioned, watching, oz, epis...",1,1,0
1,wonderful little production filming technique ...,"[wonderful, little, production, filming, techn...",1,1,0
2,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su...",1,1,0
3,basically theres family little boy jake thinks...,"[basically, theres, family, little, boy, jake,...",0,0,1


In [24]:
#Start Training

In [25]:
data_train , data_test = train_test_split(data , test_size = 0.10 , random_state = 42)

In [26]:
all_training_words = [ word for tokens in data_train['tokens'] for word in tokens]
training_sentence_lenghts = [len(tokens) for tokens in data_train['tokens']]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total , with a vocabulary size of %s" % (len(all_training_words) , len(TRAINING_VOCAB)))
print("Max sentence lenght is %s" % max(training_sentence_lenghts))

5324121 words total , with a vocabulary size of 200251
Max sentence lenght is 1416


In [27]:
word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path , binary = True)

In [28]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                               generate_missing=generate_missing))
    return list(embeddings)

In [30]:
training_embeddings = get_word2vec_embeddings(word2vec , data_train , generate_missing = True)

In [49]:
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300

In [50]:
tokenizer = Tokenizer(num_words = len(TRAINING_VOCAB) , lower = True , char_level = False)
tokenizer.fit_on_texts(data_train["Text_Final"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["Text_Final"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 200251 unique tokens.


In [51]:
training_sequences[0]

[877,
 506,
 56,
 41,
 46,
 9,
 22,
 324,
 948,
 1914,
 1230,
 227,
 163,
 222,
 18,
 188,
 41,
 665,
 3554,
 248,
 630,
 82,
 70,
 1505,
 4054,
 589,
 248,
 53,
 43,
 1505,
 212,
 830,
 1509,
 41,
 284,
 226,
 4465,
 74,
 281,
 174,
 9,
 28,
 15,
 41,
 90,
 539,
 652,
 9,
 23,
 15,
 29,
 295,
 35,
 2437,
 28,
 1299,
 1499,
 125,
 438,
 114,
 41,
 78,
 104,
 1386,
 75,
 153,
 658,
 72,
 400,
 90,
 163,
 10477,
 289,
 97,
 460,
 238,
 520,
 394]

In [52]:
import pickle
# Save
with open('tokenizer.pickle' , 'wb') as handle:
    pickle.dump(tokenizer , handle , protocol = pickle.HIGHEST_PROTOCOL)

In [53]:
# Building Model

In [54]:
train_cnn_data = pad_sequences(training_sequences , maxlen = MAX_SEQUENCE_LENGHT)

In [55]:
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word, index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

(200252, 300)


In [56]:
text_sequences = tokenizer.texts_to_sequences (data_test["Text_Final"].tolist())
test_cnn_data = pad_sequences ( text_sequences , maxlen = MAX_SEQUENCE_LENGHT)

In [57]:
label_names = ['Pos' , 'Neg']

In [58]:
y_train = data_train[label_names].values

In [59]:
x_train = train_cnn_data
y_tr = y_train

In [60]:
def recurrent_nn(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    embedding_layer = Embedding(num_words,
                                embedding_dim,
                                weights=[embeddings],
                                input_length=max_sequence_length,
                                trainable=False)
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    
    lstm = LSTM(256)(embedded_sequences)
    
    X = Dense(128, activation='relu')(lstm)
    X = Dropout(0.2)(X)
    preds = Dense(labels_index, activation='sigmoid')(X)
    
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model


In [61]:
model = recurrent_nn(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, len(list(label_names)))

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding (Embedding)       (None, 50, 300)           60075600  
                                                                 
 lstm (LSTM)                 (None, 256)               570368    
                                                                 
 dense (Dense)               (None, 128)               32896     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 2)                 258       
                                                                 
Total params: 60,679,122
Trainable params: 603,522
Non-traina

In [62]:
epochs = 16
batch_size = 34

In [63]:
hist = model.fit(x_train, y_tr, epochs=16, validation_split=0.1, shuffle=True, batch_size=34)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [64]:
#Save Model
model.save("rcnn_lstm-model.h5")

In [65]:
predictions_data = model.predict(test_cnn_data , batch_size = 1024 , verbose = 1)



In [66]:
labels_data = [1 , 0]

In [71]:
prediction_labels = []
for p in predictions_data:
    prediction_labels.append(labels_data[np.argmax(p)])

In [73]:
sum(data_test.sentiment == prediction_labels) / len(prediction_labels)

0.8372

In [None]:
#Build webapp