In [None]:
# Sentiment Analysis of the IMDB Movie Review Dataset Using an LSTM - Positive / Negative
# (the dataset is available in Keras; its labels are binary, so there is no Neutral class)
# also try the same with datasets from UCI or Kaggle

In [None]:
# Dataset loader and vocabulary configuration
from keras.datasets import imdb
vocabulary_size = 5000  # passed to imdb.load_data(num_words=...): only this many of the most frequent words are kept
# The Keras IMDB dataset ships 25,000 training and 25,000 test reviews (not 2,500 each).

In [None]:
# Load the train / test splits, keeping only the `vocabulary_size` most frequent words.
# num_words: integer or None. Words are ranked by how often they occur (in the training set)
# and only the `num_words` most frequent ones are kept; any rarer word appears as the
# oov_char value in the sequence data. With None (the default) every word is kept.
# Notebook author's note: num_words=None led to an error in model.fit() -- presumably because
# word ids then exceed the Embedding layer's input_dim (= vocabulary_size); TODO confirm.
# xtrain / xtest hold the reviews as sequences of word ids; ytrain / ytest hold the labels.
train_split, test_split = imdb.load_data(num_words=vocabulary_size)
xtrain, ytrain = train_split
xtest, ytest = test_split

In [None]:
# Check that the data loaded.
# The reviews are stored as sequences of integer word ids (not text, and not yet embedded).
print(xtrain.shape)# number of reviews in the training split (25000 according to the original note)

In [None]:
# Look at one sample: the review at index 6, printed as its sequence of integer word ids
print(xtrain[6])

In [None]:
# word -> integer rank table that ships with the dataset
wordtoid = imdb.get_word_index()
# Invert it (rank -> word) so ids can be mapped back to text.
# NOTE: the ids inside sequences returned by imdb.load_data() are shifted by `index_from`
# (default 3) relative to these ranks, because 0, 1 and 2 are reserved for
# padding / start-of-sequence / out-of-vocabulary.
idtoword = dict(zip(wordtoid.values(), wordtoid.keys()))

In [None]:
print('---Reviews as Words---')
# imdb.load_data() shifts every word id by index_from (default 3) so that 0, 1 and 2 can stand
# for padding, start-of-sequence and out-of-vocabulary. idtoword is keyed by the unshifted
# ranks from get_word_index(), so the shift has to be undone before the lookup; without it
# every id decodes to the wrong word. The reserved ids have no entry and fall back to ''.
index_offset = 3  # default index_from of imdb.load_data(); the load call does not override it
print([idtoword.get(i - index_offset, '') for i in xtrain[6]]) # decode only the sample at index 6 => xtrain[6]

In [None]:
print(xtrain[6]) # the same review in integer-encoded form: each word replaced by its id (the embedding is learned later, inside the model)

In [None]:
# Show the label (Y) of the sample at index 6: 1 = positive, 0 = negative
print(ytrain[6])

In [None]:
# The model must be trained on inputs of the same size, so the review length has to be fixed.
# Start by looking at the maximum length. First attempt, kept for reference -- it fails because the
# closing parenthesis of len() is misplaced, so key=len is passed to len() instead of max():
#print('Maximum Review Length : {}'.format(len(max(xtrain+xtest),key=len)))

In [None]:
# Notes from fixing the first attempt:
#  - an "unexpected EOF" error means a closing parenthesis is missing
#  - "len() takes no keyword arguments" means key=len ended up inside len() instead of max()
# xtrain and xtest are NumPy object arrays of Python lists, so `xtrain + xtest` adds them
# element-wise (joining review i of train with review i of test) instead of concatenating the
# two splits; the reported "maximum" was the length of a merged pair of reviews.
# Measure every review of both splits individually instead.
longest_review = max(len(review) for split in (xtrain, xtest) for review in split)
print('Maximum Review Length : {}'.format(longest_review))

In [None]:
# Why check the extremes? During training all samples must have the same length. Knowing the
# maximum lets us pad the shorter samples up to it; alternatively we fix a length and truncate
# whatever exceeds it, and for that choice the minimum review length is useful too.
# Each review is measured on its own: `xtrain + xtest` on these NumPy object arrays would add
# them element-wise (merging pairs of reviews) and report the wrong length.
shortest_review = min(len(review) for split in (xtrain, xtest) for review in split)
print('Minimum Review Length : {}'.format(shortest_review))


# Padding Sequence


In [None]:
# Bring every training review to the same length so they can be fed to the model.
from keras.preprocessing import sequence  # provides pad_sequences
max_words = 500  # fixed review length, counted in word ids
# padding='pre' and truncating='pre' are pad_sequences' defaults, spelled out here:
# shorter reviews get zeros in front, longer reviews lose their beginning
# (i.e. the LAST `max_words` ids are kept, not the first ones).
xtrain = sequence.pad_sequences(
    xtrain,
    maxlen=max_words,
    padding='pre',
    truncating='pre',
)

In [None]:
# Apply the same fixed length to the test reviews.
# padding='pre' and truncating='pre' are pad_sequences' defaults, spelled out here:
# zeros are added in front of short reviews and long reviews keep their last `max_words` ids.
xtest = sequence.pad_sequences(
    xtest,
    maxlen=max_words,
    padding='pre',
    truncating='pre',
)

# Model

In [None]:
# Build the LSTM classifier as a linear (sequential) stack of layers.
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
embedding_size = 32  # second argument of the Embedding layer: size of the vector learned per word id
model = Sequential([
    # word id -> dense vector
    # background: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
    Embedding(vocabulary_size, embedding_size, input_length=max_words),
    # recurrent layer with 100 units reading the embedded sequence
    LSTM(100),
    # single sigmoid neuron: predicts positive (1) or negative (0), so one output is adequate
    Dense(1, activation='sigmoid'),
])
# summary() is a method and must be called; it prints the layer table itself
model.summary()

# Training and Evaluation

In [None]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is the tracked metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
batch_size = 64  # number of samples per training iteration
num_epochs = 3   # number of full passes over the training data
# Hold out the first `batch_size` (64) samples as a validation set and train on the rest.
# The validation set does not speed up learning; it is only used to monitor the model on
# data it is not trained on after each epoch.
# NOTE(review): 64 samples is a very small validation set and its size is tied to batch_size.
xvalid, xtrain2 = xtrain[:batch_size], xtrain[batch_size:]
yvalid, ytrain2 = ytrain[:batch_size], ytrain[batch_size:]

In [None]:
# Train on the reduced training set, validating on the held-out samples after every epoch.
model.fit(
    xtrain2,
    ytrain2,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(xvalid, yvalid),
)

In [None]:
# Evaluate on the test split; with metrics=['accuracy'] at compile time,
# scores[0] is the loss and scores[1] is the accuracy.
scores = model.evaluate(xtest, ytest)
test_accuracy = scores[1]
# author's runs: 86.167 reported for 1 epoch, 83.64 observed for 1 epoch
print('Test Accuracy : ', test_accuracy)