# <h1 align= 'center'>PyTorch RNN Modeling</h1>

The goal for this analysis is to predict if a review rates the movie positively or negatively.
<a href="https://imgur.com/FfdEBRz"><img src="https://i.imgur.com/FfdEBRzm.png" title="source: imgur.com" align="right"></a>
- IMDB movie reviews dataset
- http://ai.stanford.edu/~amaas/data/sentiment
- Contains 25000 positive and 25000 negative reviews
- Contains at most reviews per movie
- At least 7 stars out of 10 $\rightarrow$ positive (label = 1)
- At most 4 stars out of 10 $\rightarrow$ negative (label = 0)

> **Here, we use PyTorch to train and test recurrent neural network to classify sentiments of reviews by inputting word embeddings to an LSTM layer which is connected to a fully connected layer**

## <h2> <center>Dependencies</center></h2>

In [1]:
import re
import nltk
import torch
import unicodedata
import numpy as np
import pandas as pd
nltk.download('stopwords')
from bs4 import BeautifulSoup
from collections import Counter
from sklearn import preprocessing
from nltk.stem import SnowballStemmer
from torch.utils.data import TensorDataset, DataLoader

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
from google.colab import drive 
drive.mount('/content/drive')
movies = pd.read_csv('/content/drive/MyDrive/imdb_data.csv')
movies.sample(7)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,review,sentiment
33467,I went to see Ashura as 2005 Fantasia Festival...,positive
18094,An underrated addition to the Graham Greene ci...,positive
47594,I love Tudor Chirila and maybe that's why i en...,positive
11059,I agree with most of the critics above. More y...,negative
5964,"Contrary to most other commentators, I deeply ...",negative
14996,"And it's not because since her days on ""Claris...",positive
7266,Farrah Fawcett gives an award nominated perfor...,positive


## <h2> <center>Preprocessing</center></h2>

In [8]:
# Cateogrize positive and negative as 1 and 0 respectively
label_encoder = preprocessing.LabelEncoder()
movies['sentiment'] = label_encoder.fit_transform(movies['sentiment'])
movies.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [9]:
def review_to_wordlist(review, custom = True, stem_words = True):
    # Clean the text, with the option to remove stopwords and stem words.
    
    # Strip html
    soup = BeautifulSoup(review, "html.parser")
    [s.extract() for s in soup(['iframe', 'script'])]
    stripped_text = soup.get_text()
    review_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
    
    # replace accents
    review_text = unicodedata.normalize('NFKD', review_text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    review_text = re.sub(r"[^A-Za-z0-9!?\'\`]", " ", review_text) # remove special characters
    review_text = contractions.fix(review_text) # expand contractions
    
    review_text = review_text.lower()
    words = review_text.split()
    if custom:
        stop_words = set(nltk.corpus.stopwords.words('english'))
        stop_words.update(['movie', 'film', 'one', 'would', 'even', 
                           'movies', 'films', 'cinema',
                           'character', 'show', "'", "!", 'like'])
    else:
        stop_words = set(nltk.corpus.stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    review_text = " ".join(words)
        
    review_text = re.sub(r"!", " ! ", review_text)
    review_text = re.sub(r"\?", " ? ", review_text)
    review_text = re.sub(r"\s{2,}", " ", review_text)
    
    if stem_words:
        words = review_text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in words]
        review_text = " ".join(stemmed_words)
    
    # Return a list of words, with each word as its own string
    return review_text

In [10]:
!pip install contractions
import contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contrac

In [11]:
movies['review'] = movies['review'].apply(review_to_wordlist)

  soup = BeautifulSoup(review, "html.parser")


## <h2> <center>Features</center></h2>

In [12]:
all_text = ''
all_text = '\n'.join([review for review in movies['review']])

In [13]:
len(all_text)

34732415

In [14]:
# split by new lines and spaces
reviews_split = all_text.split('\n')
all_text = ' '.join(reviews_split)

# create a list of words
words = all_text.split()
len(words)

5723713

In [15]:
counts = Counter(words)
vocab = sorted(counts, key = counts.get, reverse = True)
word_id = {word: i for i, word in enumerate(vocab, 1)} # dictionary to words to ints

review_ids = []
for review in reviews_split:
    review_ids.append([word_id[word] for word in review.split()])

In [16]:
len(review_ids) # list of lists

50000

In [17]:
encoded_labels = np.array(movies['sentiment'])
len(encoded_labels)

50000

In [18]:
# stats about vocabulary
print('Unique words: ', len((word_id))) 
print()

Unique words:  74429



In [19]:
# outlier review stats
review_lens = Counter([len(x) for x in review_ids])
print("Maximum review length: {}".format(max(review_lens)))

Maximum review length: 1437


> **Padding the sequences**

In [20]:
def pad_features(review_ids, seq_length):
    """ Return features of review_ints, where each review is 
        padded with 0's or truncated to the input seq_length.
    """ 

    features = np.zeros((len(review_ids), seq_length), dtype=int)

    # for each review, truncate the review to seq_length
    for i, row in enumerate(review_ids):
        features[i, -len(row):] = np.array(row)[:seq_length]
    
    return features

In [21]:
seq_length = 200

features = pad_features(review_ids, seq_length=seq_length)

## test statements - do not change - ##
assert len(features)==len(review_ids), "Your features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."

# print first 10 values of the first 30 batches 
print(features[:30,:10])

[[    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [  112   116  3343    34    39    76 12258   202  3343   464]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0

## <h2> <center>Split the data</center></h2>

In [38]:
split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)
split_idx = int(len(features)*0.8)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))


			Feature Shapes:
Train set: 		(40000, 200) 
Validation set: 	(5000, 200) 
Test set: 		(5000, 200)


> **Dataloaders**

In [23]:
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

In [25]:
# dataloaders
batch_size = 50

# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [26]:
dataiter = iter(train_loader)
for sample_x, sample_y in dataiter:
    print('Sample input size: ', sample_x.size()) # batch_size, seq_length
    print('Sample input: \n', sample_x)
    print()
    print('Sample label size: ', sample_y.size()) # batch_size
    print('Sample label: \n', sample_y)
    break


Sample input size:  torch.Size([50, 200])
Sample input: 
 tensor([[   0,    0,    0,  ...,  501,   10,  428],
        [   0,    0,    0,  ...,  313,  691,   67],
        [   0,    0,    0,  ...,  649,  550,  428],
        ...,
        [   0,    0,    0,  ...,  568,  647,  321],
        [   0,    0,    0,  ..., 1040,   12, 1513],
        [   0,    0,    0,  ..., 3252,  239,    1]])

Sample label size:  torch.Size([50])
Sample label: 
 tensor([1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1,
        1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
        1, 0])


## <h2> <center>RNN with PyTorch</center></h2>
---
# Sentiment Network with PyTorch

The layers are as follows:
1. An [embedding layer](https://pytorch.org/docs/stable/nn.html#embedding) that converts our word tokens (integers) into embeddings of a specific size.
2. An [LSTM layer](https://pytorch.org/docs/stable/nn.html#lstm) defined by a hidden_state size and number of layers
3. A fully-connected output layer that maps the LSTM layer outputs to a desired output_size
4. A sigmoid activation layer which turns all outputs into a value 0-1; return **only the last sigmoid output** as the output of this network.


In [27]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

Training on GPU.


In [28]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden


> **Instantiate the network**

* `vocab_size`: Size of our vocabulary or the range of values for our input, word tokens.
* `output_size`: Size of our desired output; the number of class scores we want to output (pos/neg).
* `embedding_dim`: Number of columns in the embedding lookup table; size of our embeddings.
* `hidden_dim`: Number of units in the hidden layers of our LSTM cells. Usually larger is better performance wise. Common values are 128, 256, 512, etc.
* `n_layers`: Number of LSTM layers in the network. Typically between 1-3


In [29]:
# Instantiate the model w/ hyperparams
vocab_size = len(word_id)+1 # +1 for the 0 padding + our word tokens
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

SentimentRNN(
  (embedding): Embedding(74430, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


> **Training**

In [30]:
# loss and optimization functions
lr= 1e-2

criterion = nn.BCELoss() # binary classification
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [31]:
# training params

epochs = 4 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
print("Done.")

Epoch: 1/4... Step: 100... Loss: 0.314067... Val Loss: 0.452309
Epoch: 1/4... Step: 200... Loss: 0.502516... Val Loss: 0.402665
Epoch: 1/4... Step: 300... Loss: 0.359628... Val Loss: 0.359375
Epoch: 1/4... Step: 400... Loss: 0.355464... Val Loss: 0.347294
Epoch: 1/4... Step: 500... Loss: 0.667143... Val Loss: 0.340062
Epoch: 1/4... Step: 600... Loss: 0.297685... Val Loss: 0.343553
Epoch: 1/4... Step: 700... Loss: 0.338691... Val Loss: 0.365326
Epoch: 1/4... Step: 800... Loss: 0.402119... Val Loss: 0.340825
Epoch: 2/4... Step: 900... Loss: 0.207419... Val Loss: 0.329560
Epoch: 2/4... Step: 1000... Loss: 0.252502... Val Loss: 0.364728
Epoch: 2/4... Step: 1100... Loss: 0.320059... Val Loss: 0.337132
Epoch: 2/4... Step: 1200... Loss: 0.299975... Val Loss: 0.375009
Epoch: 2/4... Step: 1300... Loss: 0.349033... Val Loss: 0.432669
Epoch: 2/4... Step: 1400... Loss: 0.276925... Val Loss: 0.390780
Epoch: 2/4... Step: 1500... Loss: 0.460764... Val Loss: 0.345922
Epoch: 2/4... Step: 1600... Loss: 


> **Testing**

In [32]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    
    # get predicted outputs
    output, h = net(inputs, h)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)

In [33]:

# -- stats! -- #
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc*100))

Test loss: 0.426
Test accuracy: 80.800


In [34]:
train_losses = []  # track loss
num_correct_train = 0

# init hidden state
h = net.init_hidden(batch_size)

net.train()  # set the model to training mode
# iterate over training data
for inputs, labels in train_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if train_on_gpu:
        inputs, labels = inputs.cuda(), labels.cuda()

    # zero the gradients
    optimizer.zero_grad()

    # get predicted outputs
    output, h = net(inputs, h)

    # calculate loss
    loss = criterion(output.squeeze(), labels.float())
    train_losses.append(loss.item())

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer

    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct_train += np.sum(correct)

    # perform backpropagation and optimization
    loss.backward()
    optimizer.step()



In [35]:
# -- stats! -- #
# avg train loss
print("Train loss: {:.3f}".format(np.mean(train_losses)))

# accuracy over all train data
train_acc = num_correct_train / len(train_loader.dataset)
print("Train accuracy: {:.3f}".format(train_acc * 100))


Train loss: 0.439
Train accuracy: 79.517
