In [2]:
# Mounting the google drive to get the images

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Checking out the data

!ls drive/MyDrive/sentiment_data

labels.txt  reviews.txt


In [4]:
# Importing libraries and helper functions

import torch 
from torch import nn
import numpy as np
import matplotlib.pyplot as plt

from string import punctuation
from collections import Counter
from torch.utils.data import TensorDataset,DataLoader

In [5]:
# Opening reading and passing into variables both the review and label

with open('drive/MyDrive/sentiment_data/reviews.txt','r') as f:
  reviews = f.read()

with open('drive/MyDrive/sentiment_data/labels.txt','r') as f:
  labels = f.read()

In [6]:
# Checking out the label and review

print(reviews[:1000])
print('===================')
print(labels[:20])

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   
story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turn

In [7]:
# Creating a word list free from punctuations spaces or new lines

reviews = reviews.lower()
all_text = ''.join([c for c in reviews if c not in punctuation])
reviews_split = all_text.split('\n')
all_text = ''.join(reviews_split)

words = all_text.split()

In [8]:
# Checking out some words from our word list

words[10:20]

['same',
 'time',
 'as',
 'some',
 'other',
 'programs',
 'about',
 'school',
 'life',
 'such']

In [9]:
# Counter gives us each unique words and their number of occurance
# Vocab_to_int is a dictionary mapping the words to an intiger
# More occuring words are given a smaller intiger value

counts = Counter(words)
vocab = sorted(counts,key=counts.get,reverse=True)
vocab_to_int = { word : int_ for int_,word in enumerate(vocab,1)}

reviews_ints = []
for review in reviews_split:
  reviews_ints.append([vocab_to_int[word] for word in review.split()])



In [10]:
# Lets see number of unique words an also checkout our tokenized review

print('Unique words: ',len(vocab_to_int))
print('===================')
print('The first tokenized review: \n', reviews_ints[:1])

Unique words:  74072
The first tokenized review: 
 [[21025, 308, 6, 3, 1050, 207, 8, 2138, 32, 1, 171, 57, 15, 49, 81, 5785, 44, 382, 110, 140, 15, 5194, 60, 154, 9, 1, 4975, 5852, 475, 71, 5, 260, 12, 21025, 308, 13, 1978, 6, 74, 2395, 5, 613, 73, 6, 5194, 1, 24103, 5, 1983, 10166, 1, 5786, 1499, 36, 51, 66, 204, 145, 67, 1199, 5194, 19869, 1, 37442, 4, 1, 221, 883, 31, 2988, 71, 4, 1, 5787, 10, 686, 2, 67, 1499, 54, 10, 216, 1, 383, 9, 62, 3, 1406, 3686, 783, 5, 3483, 180, 1, 382, 10, 1212, 13583, 32, 308, 3, 349, 341, 2913, 10, 143, 127, 5, 7690, 30, 4, 129, 5194, 1406, 2326, 5, 21025, 308, 10, 528, 12, 109, 1448, 4, 60, 543, 102, 12, 21025, 308, 6, 227, 4146, 48, 3, 2211, 12, 8, 215, 23]]


In [11]:
# Lets split the label by new line and change positive to 1 and negative to 0

labels_split = labels.split('\n')
encoded_labels =  np.array([1 if label == 'positive' else 0 for label in labels_split])
encoded_labels[:10]

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0])

In [12]:
# Lets count number of zero reviews and also the maximum review length

review_len = Counter([len(x) for x in reviews_ints])
print(f'Zero length reviews: {review_len[0]}')
print(f'Maximum review length is: {max(review_len)}')

Zero length reviews: 1
Maximum review length is: 2514


In [13]:
# Lets remove the review with a zero length

print('Number of reviews before removing outliers: ' , len(reviews_ints))

non_zero_idx = [idx for idx,review in enumerate(reviews_ints) if len(review) != 0 ]
reviews_ints = [reviews_ints[idx] for idx in non_zero_idx]
encoded_labels = np.array([encoded_labels[idx] for idx in non_zero_idx])

print('Number of reviews after removing outliers: ' , len(reviews_ints))

Number of reviews before removing outliers:  25001
Number of reviews after removing outliers:  25000


In [14]:
''' 

 Our reviews are some with longer characters and others with a shorter
 So for our model to be robust and working effectively we need to pad it
 We will trim longer characters greater than seq_len and add lists of 0s
 to the left for smaller length of characters 
 
'''
def pad_features(reviews_ints,seq_len):

  features = np.zeros((len(reviews_ints),seq_len),dtype=int)
  for i , row in enumerate(reviews_ints):
    features[i,-len(row):] = np.array(row)[:seq_len]

  return features

In [15]:
# Creating a max_sequence length and verifying our function is working well

seq_len = 200
features = pad_features(reviews_ints,seq_len = seq_len)

assert len(features) == len(reviews_ints)
assert len(features[0]) == seq_len
assert len(features[10]) == seq_len
assert len(features[100]) == seq_len
assert len(features[10002]) == seq_len

''' 
From the print we can see that we added multiple zeros on the left for padding
'''
 
print(features[0,:])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
 21025   308     6     3  1050   207     8  2138    32     1   171    57
    15    49    81  5785    44   382   110   140    15  5194    60   154
     9     1  4975  5852   475    71     5   260    12 21025   308    13
  1978     6    74  2395     5   613    73     6  5194     1 24103     5
  1983 10166     1  5786  1499    36    51    66   204   145    67  1199
  5194 19869     1 37442     4     1   221   883    31  2988    71     4
     1  5787    10   686     2    67  1499    54    10   216     1   383
     9    62     3  1406  3686   783     5  3483   180     1   382    10
  1212 13583    32   308     3   349   341  2913   

In [16]:
'''

Train test split:
We used 80% of our data for training 
10% for test and the other 10% for validation

'''
split_frac = 0.8

split_idx = int(len(features) * split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:] 
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:] 

test_idx = int(len(remaining_x) * 0.5)
val_x,test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y,test_y = remaining_y[:test_idx], remaining_y[test_idx:]

print('\n Feature shapes: \n')
print(f'Train set: {train_x.shape}\nTest set: {test_x.shape}\nValidation set {val_x.shape}')


 Feature shapes: 

Train set: (20000, 200)
Test set: (2500, 200)
Validation set (2500, 200)


In [17]:
# Creating a train_loader,test_loader and val_loader for further modeling

train_data = TensorDataset(torch.from_numpy(train_x),torch.from_numpy(train_y))
val_data = TensorDataset(torch.from_numpy(val_x),torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x),torch.from_numpy(test_y))

batch_size = 50

train_loader = DataLoader(train_data,shuffle=True,batch_size = batch_size)
val_loader = DataLoader(val_data,shuffle=True,batch_size = batch_size)
test_loader = DataLoader(test_data,shuffle=True,batch_size = batch_size)

In [18]:
# Lets sample from our train data and see their shapes if they are correct

sample_x,sample_y = next(iter(train_loader))

print('Sample input size: ', sample_x.size())
print('Sample input: ', sample_x)
print('Sample label size: ', sample_y.size())
print('Sample label: ', sample_y)

Sample input size:  torch.Size([50, 200])
Sample input:  tensor([[   0,    0,    0,  ...,  115,    8,  150],
        [   0,    0,    0,  ..., 1456,    8,   56],
        [   0,    0,    0,  ...,   45,    4,  674],
        ...,
        [   0,    0,    0,  ...,    2,  442, 2956],
        [   0,    0,    0,  ...,  789,    8,   45],
        [  10,   55,  742,  ..., 2394,   22,  345]])
Sample label size:  torch.Size([50])
Sample label:  tensor([0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
        1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
        1, 0])


In [20]:
# Checking out if there is a GPU available 

if torch.cuda.is_available():
  print('Training on GPU....')
else: print('Training on CPU....')

Training on GPU....


In [39]:
# Creating the RNN model

class SentimentRNN(nn.Module):

  def __init__(self,vocab_size,output_size,embedding_dim,hidden_dim,
               n_layers,drop_prob=0.5):
    
    # Inheriting from the torch.nn model module

    super(SentimentRNN,self).__init__()

    # Initializing some parameters for our model

    self.output_size = output_size
    self.n_layers = n_layers
    self.hidden_dim = hidden_dim

    '''

    Our review has more than 70 thousand words so having a one hot encoding
    for these all words will be very time consuming and costy. So the torch
    nn method has a good module that will enable us to get a vectorized 
    representation of our words with simpler dimensions.

    Given the vocablary size and embedding dimension we will get an embedding
    vector for our vocublary. i.e. all words will be represented by our 
    embedding vector

    '''
    self.embedding = nn.Embedding(vocab_size,embedding_dim)
    self.lstm = nn.LSTM(embedding_dim,hidden_dim,n_layers,dropout=drop_prob,
                        batch_first=True)
    
    # We will have a dropout to decrease overfitting

    self.dropout = nn.Dropout(0.3)

    '''
    At last we will have a fully connected layer 
    followed by a sigmoid activation for our final
    prediction
    '''
    self.fc = nn.Linear(hidden_dim,output_size)
    self.sig = nn.Sigmoid()


  def forward(self,x,hidden):

    batch_size = x.size(0)

    # .long method changes our tensor into numpy

    x = x.long()

    embeds = self.embedding(x)
    lstm_out,hidden = self.lstm(embeds,hidden)

    # Taking only the last part of the lstm ouptput above

    lstm_out = lstm_out[:,-1,:]

    output = self.dropout(lstm_out)
    output = self.fc(output)

    sig_out = self.sig(output)

    return sig_out,hidden

  # A function used to initialize the hidden state for our RNN model
  
  def init_hidden(self,batch_size):
    weight = next(self.parameters()).data

    if torch.cuda.is_available():
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
    else:
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())

    return hidden
  
  

In [40]:
# Inistantiating the model passing the parameters

vocab_size = len(vocab_to_int) + 1

# Because it predicts either 1 or 0 for which 1 is postive and 0 negative
 
output_size = 1 
embedding_dim = 400
hidden_dim = 256
n_layers = 2

net = SentimentRNN(vocab_size,
                   output_size,
                   embedding_dim,
                   hidden_dim,
                   n_layers
                   )
print(net)

SentimentRNN(
  (embedding): Embedding(74073, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [41]:
# Initializing a loss function and optimizer

lr = 0.01 # Learning rate of 0.01
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(),lr=lr)


In [122]:
# Initializing parameters for trainging

epochs = 4
counter = 0
print_every = 100

''' 

Clip is a parameter that we will use later while we 
train our model. In our back propagation we may encounter
exploding gradient, for that purpose we will clip every 
gradients more than clip to be clip

'''
clip = 5 

# Check if we are using GPU and if so moving the model to the GPU

if torch.cuda.is_available():
  net.cuda()

# Entering into training mode

net.train()

for epoch in range(epochs):
  
  # Initializing the hidden state

  h = net.init_hidden(batch_size)

  # Getting inputs and labels from the train loader

  for inputs,labels in train_loader:

    counter+=1

    if torch.cuda.is_available():
      inputs,labels = inputs.cuda(),labels.cuda()

    h = tuple([each.data for each in h])

    # Clearing cache for our back propagation

    net.zero_grad()

    # Feeding forward our input and getting the lstm output

    output,h = net(inputs,h)

    # Calculating the loss based on our criterion we set before

    loss = criterion(output.squeeze(),labels.float())

    # Backward propagation 

    loss.backward()

    # Gradient clipping to elimnate exploding gradient issue

    nn.utils.clip_grad_norm_(net.parameters(),clip)

    # Weight updating 

    optimizer.step()

    # Printing validation result every (print_every) steps

    if counter % print_every == 0:

      val_h = net.init_hidden(batch_size)

      val_losses = []

      # Entering the evaluation mode and removing all the dropouts

      net.eval()

      for inputs,labels in val_loader:
        
        # Creating a validation hidden state

        val_h = tuple([each.data for each in val_h])

        if torch.cuda.is_available():

          inputs,labels = inputs.cuda(),labels.cuda()
      
        output,val_h = net(inputs,val_h)
        val_loss = criterion(output.squeeze(),labels.float())
        val_losses.append(val_loss.item())

      # Getting back to the training mode after finishing the validation

      net.train()

      # Printing validation results every (print_every) steps

      print('Epoch: {}/{}......'.format(epoch+1,epochs),
            'Step: {}.....'.format(counter),
            'Loss: {:.6f}'.format(loss.item()),
            'Validation loss: {:.6f}'.format(np.mean(val_losses)))




Epoch: 1/4...... Step: 100..... Loss: 0.249063 Validation loss: 0.641705
Epoch: 1/4...... Step: 200..... Loss: 0.546562 Validation loss: 0.673263
Epoch: 1/4...... Step: 300..... Loss: 0.315827 Validation loss: 0.616148
Epoch: 1/4...... Step: 400..... Loss: 0.523387 Validation loss: 0.870079
Epoch: 2/4...... Step: 500..... Loss: 0.461046 Validation loss: 0.735487
Epoch: 2/4...... Step: 600..... Loss: 0.602474 Validation loss: 0.615451
Epoch: 2/4...... Step: 700..... Loss: 0.430550 Validation loss: 0.671795
Epoch: 2/4...... Step: 800..... Loss: 0.510659 Validation loss: 0.569646
Epoch: 3/4...... Step: 900..... Loss: 0.478471 Validation loss: 0.618111
Epoch: 3/4...... Step: 1000..... Loss: 0.585657 Validation loss: 0.677233
Epoch: 3/4...... Step: 1100..... Loss: 0.555066 Validation loss: 0.649515
Epoch: 3/4...... Step: 1200..... Loss: 0.641896 Validation loss: 0.674005
Epoch: 4/4...... Step: 1300..... Loss: 0.576618 Validation loss: 0.690297
Epoch: 4/4...... Step: 1400..... Loss: 0.725539

In [123]:
# Testing our model on our test_data

test_losses = []
num_correct = 0

h = net.init_hidden(batch_size)

net.eval()

for inputs,labels in test_loader:

  h = tuple([each.data for each in h])

  if torch.cuda.is_available():
    inputs,labels = inputs.cuda(),labels.cuda()

  output,h = net(inputs,h)

  test_loss = criterion(output.squeeze(),labels.float())

  test_losses.append(test_loss.item())

  pred = torch.round(output.squeeze())

  # Checking out if the pred and labels match

  correct_tensor = pred.eq(labels.float().view_as(pred))

  if not torch.cuda.is_available():

    correct = np.squeeze(correct_tensor.numpy())

  else:

    correct = np.squeeze(correct_tensor.cpu().numpy())

  # Summing up the results of correct predictions

  num_correct += np.sum(correct)

'''
Calculating the test accuracy by deviding number of correct
predictions by the total data set number.
'''

test_accuracy = num_correct / (len(test_loader.dataset))

# Printing the test loss

print('Test loss: {:.4f}'.format(np.mean(test_losses)))

# Printing the test accuracy

print('Test accuracy: {:.4f}'.format(test_accuracy))

Test loss: 0.6393
Test accuracy: 0.6308


In [124]:
# A helper function for tokenizing an input review for testing

def tokenize_review(test_review):

  test_review = test_review.lower()
  test_text = ''.join([c for c in test_review if c not in punctuation])

  test_words = test_text.split()

  test_ints = []
  test_ints.append([vocab_to_int.get(word,0) for word in test_words])

  return test_ints

# Creating a negative test review for test
test_review_neg = 'The worst movie I have seen; acting was terrible and I want my money back.'
# Tokenizing the review
test_ints = tokenize_review(test_review_neg)
print(test_ints)

[[1, 247, 18, 10, 28, 108, 113, 14, 388, 2, 10, 181, 60, 273, 144]]


In [125]:
# Padding the review message

seq_len = 200
features = pad_features(test_ints,seq_len)
features

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [126]:
# Changing the feature into tensor since we feed tensor to our model

feature_tensor = torch.from_numpy(features)
print(feature_tensor.size())

torch.Size([1, 200])


In [127]:
# A helper function for predicting sentiment given a review message

def predict(net,test_review,seq_len=200):

  # Enetering into evaluation mode and removing all the dropouts

  net.eval()

  # Tokenize the input message

  test_ints = tokenize_review(test_review)

  # Padding the input message

  seq_len = 200
  features = pad_features(test_ints,seq_len)

  # Changing features into tensors

  feature_tensor = torch.from_numpy(features)

  # Creating a batch size form the number of messages

  batch_size = feature_tensor.size(0)

  h = net.init_hidden(batch_size)

  if torch.cuda.is_available():

    feature_tensor = feature_tensor.cuda()

  output,h = net(feature_tensor,h)

  pred = torch.round(output.squeeze())

  print('Prediction value before rounding: {:.6f}'.format(output.item()))

  # Printing the result of the prediction
  
  if(pred.item()==1): print('Positive review')
  else: print('Negative review')


In [153]:
# Creating lists of reviews for testing

positive_reviews = ['Thank you!!',
                    'Nice moview',
                    'This movie has good acting and best video quality. I loved it.',
                    'Good']

negative_reviews = ['The worst movie I have seen; acting was terrible and I want my money back. This movie had bad acting',
                    'Very bad movie!!',
                    'Worst movie!!',
                    'I hate this movie. It has very bad video quality']

In [154]:
# Testing the postive reviews

for review in positive_reviews:
  print('\n================')
  print('Result of review:',review)
  predict(net,review)


Result of review: Thank you!!
Prediction value before rounding: 0.578878
Positive review

Result of review: Nice moview
Prediction value before rounding: 0.502255
Positive review

Result of review: This movie has good acting and best video quality. I loved it.
Prediction value before rounding: 0.756788
Positive review

Result of review: Good
Prediction value before rounding: 0.516207
Positive review


In [155]:
# Testing the negative reviews

for review in negative_reviews:
  print('\n================')
  print('Result of review:',review)
  predict(net,review)


Result of review: The worst movie I have seen; acting was terrible and I want my money back. This movie had bad acting
Prediction value before rounding: 0.081830
Negative review

Result of review: Very bad movie!!
Prediction value before rounding: 0.428250
Negative review

Result of review: Worst movie!!
Prediction value before rounding: 0.340143
Negative review

Result of review: I hate this movie. It has very bad video quality
Prediction value before rounding: 0.252431
Negative review
