In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import nltk
import re
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset

from torch.utils.data import DataLoader

In [2]:
import collections
from collections import Counter

In [3]:
file_name = 'IMDB Dataset.csv'
df = pd.read_csv(file_name)
df.head(2)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive


In [4]:
x = df['review'].values
y = df['sentiment'].values
x_train,x_test,y_train,y_test = train_test_split(x,y,stratify=y)
#print(x_train[0])
#s = ''.join(x_train[0])
#print(s)

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [9]:
def textCleaning(s):
    s = re.sub('[^A-Za-z]+', '', s)
    return s

In [10]:
#print(textCleaning('PhdPassword@24'))

In [11]:
def preProcessing(x_train, y_train, x_test, y_test):
    vocab = []
    
    stop_words = set(stopwords.words('english'))
    
    for sent in x_train:
        temp = ''
        temp = ''.join(sent)
        for word in temp.split():
            word = word.lower()
            word = textCleaning(word)
            if word not in stop_words and word != '':
                #print(word)
                vocab.append(word)
                
    corpus = Counter(vocab)
    corpus_ = sorted(corpus,key=corpus.get,reverse=True)
    #print(corpus)
    
    onehot_dict = {w:i+1 for i,w in enumerate(corpus_)}
    
    x_final_train, x_final_test = [], []
    
    for sent in x_train:
        temp = ''
        temp = ''.join(sent)
        x_final_train.append([onehot_dict[textCleaning(word.lower())] for word in temp.split() 
                             if textCleaning(word) in onehot_dict.keys()])
    for sent in x_test:
        temp = ''
        temp = ''.join(sent)
        x_final_test.append([onehot_dict[textCleaning(word.lower())] for word in temp.split() 
                             if textCleaning(word) in onehot_dict.keys()])
        
    y_final_train = [1 if label == 'positive' else 0 for label in y_train]
    y_final_test = [1 if label == 'positive' else 0 for label in y_test]
    
    return x_final_train, x_final_test, y_final_train, y_final_test, onehot_dict
    #return x_final_train
    
    #print(onehot_dict)
                

In [12]:
x_train, x_test, y_train, y_test, onehot_dict = preProcessing(x_train, y_train, x_test, y_test)

In [13]:
#print(x_train[:2])

In [14]:
#print(len(x_train))
x_train = np.array(x_train, dtype = object)
x_test = np.array(x_test, dtype = object)
y_train = np.array(y_train, dtype = np.int32)
y_test = np.array(y_test, dtype = np.int32)

In [15]:
#print(x_train[1])

In [16]:
def padding_(data, seqLen):
    features = np.zeros((len(data), seqLen),dtype=int)
    for i, rev in enumerate(data):
        if len(rev) != 0:
            features[i, -len(rev):] = np.array(rev)[:seqLen]
    return features

In [17]:
x_train_pad = padding_(x_train,500)
x_test_pad = padding_(x_test,500)

In [18]:
#print(x_train_pad[1])
#print(y_train.shape)

In [19]:
#Hyper parameter
inputDimension = 300 # number of feature in input as we use glove 300 that converts every word in a 300 dimension feature vector that's why here inputdimension is 300
hiddenSize = 64
numLayer = 2 # how many RNN layers will be stacked togather to form the network
bidirectional = False # True -> bidirectional RNN, LSTM, GRU
batchFirst = True # decide the input, output shape
outputSize = 2 # num of class
direction = 1 # 1 when bidirectional is False otherwise it will be 2
batchSize = 50
seqLen = 500 # lenght of the input data

In [20]:
x_train_tensor = torch.from_numpy(x_train_pad)
y_train_tensor = torch.from_numpy(y_train)

x_test_tensor = torch.from_numpy(x_test_pad)
y_test_tensor = torch.from_numpy(y_test)

In [21]:
train_data = TensorDataset(x_train_tensor, y_train_tensor)
test_data = TensorDataset(x_test_tensor, y_test_tensor)

In [22]:
print(y_train_tensor.shape)

torch.Size([37500])


In [23]:
train_loader = DataLoader(train_data, shuffle=True, batch_size=batchSize)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batchSize)

In [24]:
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)


print('Sample input size: ', sample_x.size()) # batch_size, seq_length
#print('Sample input: \n', sample_x)
#print('Sample output: \n', sample_y)

Sample input size:  torch.Size([50, 500])


In [25]:
print(sample_x.shape)
print(sample_y.shape)

torch.Size([50, 500])
torch.Size([50])


In [26]:
import torchtext
glove = torchtext.vocab.GloVe(name='6B', dim=300)

In [44]:
class RNN(torch.nn.Module):
    def __init__(self, inputDimension, hiddenSize, numLayer, batchFirst, outputSize):
        super(RNN, self).__init__()
        
        self.input_size = inputDimension
        self.hidden_size = hiddenSize
        self.num_layer = numLayer
        self.batch_first = batchFirst
        self.outputSize = outputSize
        #Embedding layer: weights of the embedding layer has been initialized by pre-train glove model rather then random values
        self.embedding = nn.Embedding.from_pretrained(glove.vectors)
        #RNN structure
        self.rnn = torch.nn.RNN(input_size=inputDimension,
                                hidden_size=hiddenSize,
                                num_layers=numLayer,
                                bidirectional=False,
                                batch_first=batchFirst)
        #dropout
        self.dropout = nn.Dropout(0.3)
        #fully connected layer
        self.linear = torch.nn.Linear(hiddenSize, outputSize)
        self.sig = nn.Sigmoid()
    
    def forward(self, input):
        # input shape -> batch size, sequence length
        # hidden shape -> direction*num of layer, batch size, hidden size
        h = torch.zeros(1 * numLayer, 50, hiddenSize)
        embd = self.embedding(input)
        # embd (input data) shape -> batch size, sequence length, input dimension (number of features in input)
        # embedding layer takes input in shape (batch size, sequence length) converts the input shape as 
        # (batch size, sequence length, input dimension(number of features in input))
        # print(embd.shape)
       
        output, hidden_ = self.rnn(embd, h)
        
        output = self.dropout(output)
        output = self.linear(output[:, -1, :])  # Get the output from the last time step
        output = self.sig(output)
        
        return output, hidden_
    

In [45]:
model = RNN(inputDimension, hiddenSize, numLayer, batchFirst, outputSize)
print(model)

RNN(
  (embedding): Embedding(400000, 300)
  (rnn): RNN(300, 64, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (linear): Linear(in_features=64, out_features=2, bias=True)
  (sig): Sigmoid()
)


In [46]:
learningRate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learningRate)

def acc(pred,label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()


In [47]:
print(len(train_loader))
print(model.parameters())

750
<generator object Module.parameters at 0x0000021E734D9620>


In [48]:
clip = 5
epochs = 3
valid_loss_min = np.Inf

epoch_tr_loss, epoch_vl_loss = [], []
epoch_tr_acc, epoch_vl_acc = [], []

for epoch in range(epochs):
    print(f'Epoch {epoch+1}')
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    model.train()
    
    #h = torch.zeros(1 * numLayer, batchSize, hiddenSize)
    
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        
        outputs, _ = model(inputs)
        # model output shape is [batch_size, num_classes]
        # Convert the labels to LongTensor if they are not already
        labels = labels.long()
        
        loss = criterion(outputs, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        train_loss += loss.item()
        
        _, predicted = torch.max(outputs.data, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    epoch_tr_loss.append(train_loss / len(train_loader))
    epoch_tr_acc.append(train_correct / train_total)
    
    print(f'Training Loss: {epoch_tr_loss[-1]:.4f}, Training Accuracy: {epoch_tr_acc[-1]*100:.2f}%')


Epoch 1
Training Loss: 0.6740, Training Accuracy: 57.72%
Epoch 2
Training Loss: 0.6667, Training Accuracy: 59.59%
Epoch 3
Training Loss: 0.6531, Training Accuracy: 62.45%
