In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import stopwords 
from collections import Counter
import string
import re
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

import torch.optim as optim
from IPython.core.debugger import set_trace
from collections import Counter

from sklearn.preprocessing import LabelEncoder
from torchtext.vocab import build_vocab_from_iterator, GloVe
from torchtext.data.utils import get_tokenizer

In [2]:
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")
      
print(device)

GPU is available
cuda


In [3]:
base_csv = '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
df = pd.read_csv(base_csv)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
stop_words = list(set(stopwords.words('english')))

def preprocess_string(s):
    s = s.lower()
    s = ' '.join([word for word in s.split() if word not in stop_words])
    # replace <br /><br /> with nothing
    s = re.sub('<[^>]+>', '', s)
    # Remove all non-word characters (everything except numbers and letters)
    s = re.sub(r"[^\w\s]", ' ', s)
    # Replace all runs of whitespaces with no space
    s = re.sub(r"\s+", ' ', s)
    # replace digits with no space
    s = re.sub(r"\d", '', s)
    
    # remove single characters
    s = ' '.join(list(filter(lambda x: len(x)!=1, s.split())))
    

    return s

In [5]:
df.loc[:, 'review_2'] = df.loc[:, 'review'].apply(preprocess_string)
df.head()

Unnamed: 0,review,sentiment,review_2
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching oz episode ho...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production the filming techni...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically there family little boy jake thinks ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...


In [6]:
X,y = df['review_2'].values,df['sentiment'].values
x_train,x_test,y_train,y_test = train_test_split(X,y,stratify=y)
print(f'shape of train data is {x_train.shape}')
print(f'shape of test data is {x_test.shape}')
y_train = [1 if label =='positive' else 0 for label in y_train]  
y_test = [1 if label =='positive' else 0 for label in y_test] 

shape of train data is (37500,)
shape of test data is (12500,)


In [7]:
embed_dim = 50
globe = GloVe(name='6B', dim=embed_dim)
glove_weights = torch.load(f".vector_cache/glove.6B.{embed_dim}d.txt.pt")

.vector_cache/glove.6B.zip: 862MB [02:38, 5.43MB/s]                              
100%|█████████▉| 399999/400000 [00:16<00:00, 24159.56it/s]


In [8]:
glove_vocab = glove_weights[0]
glove_word_to_id = glove_weights[1]
glove_vectors = glove_weights[2]

In [9]:
complete = ' '.join(df['review_2'].tolist()).split()
print(len(complete))

# word_list = []

# stop_words = set(stopwords.words('english')) 
# for sent in x_train:
#     for word in sent.lower().split():
#         word = preprocess_string(word)
#         if word not in stop_words and word != '':
#             word_list.append(word)
            
counter = Counter(complete)

print(counter.most_common(10))

corpus = sorted(counter, key=counter.get, reverse=True)[:2000]
word_to_id = {word:i+1 for i, word in enumerate(corpus)}
id_to_word = {i+1:word for i, word in enumerate(corpus)}

train_sequences, test_sequences = list(), list()

for train_sent in x_train:
    #set_trace()
    train_sequence = [word_to_id[word] for word in train_sent.split() if word in word_to_id.keys()]
    train_sequences.append(train_sequence)
    
for test_sent in x_test:
    test_sequence = [word_to_id[word] for word in test_sent.split() if word in word_to_id.keys()]
    test_sequences.append(test_sequence)
    
list(map(len, [train_sequences, test_sequences]))

6232887
[('movie', 87935), ('film', 79675), ('one', 53585), ('like', 40160), ('it', 29982), ('good', 29737), ('the', 28864), ('time', 25099), ('even', 24856), ('would', 24599)]


[37500, 12500]

In [10]:
size_vocab = 2000
weight_matrix = np.zeros((size_vocab+1, embed_dim))

for word, id_ in word_to_id.items():
    glove_id = glove_word_to_id[word]
    weight_matrix[id_] = glove_vectors[glove_id]
                                       
weight_matrix = weight_matrix.astype(np.float32)
print(weight_matrix.dtype)

float32


In [11]:
def padding_(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            #features[ii, -len(review):] = np.array(review)[:seq_len]
            features[ii, :len(review)] = np.array(review)[:seq_len]
    return features

In [12]:
#we have very less number of reviews with length > 500.
#So we will consideronly those below it.
x_train_pad = torch.tensor(padding_(train_sequences,200))
x_test_pad = torch.tensor(padding_(test_sequences,200))
x_train_pad.shape, x_test_pad.shape

(torch.Size([37500, 200]), torch.Size([12500, 200]))

In [74]:
class LSTMClassifier(nn.Module):
    def __init__(self, n_vocab, embed_dim, input_dim, hidden_size, fc1_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(n_vocab, embed_dim)
        self.lstm = nn.LSTM(input_dim, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, fc1_dim)
        self.output = nn.Linear(fc1_dim, output_dim)
        self.dropout = nn.Dropout(0.7)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout(x)
        output, (h0, c0) = self.lstm(x)
        x = F.relu(self.fc1(h0))
        x = F.sigmoid(self.output(x))
        
        return x

In [78]:
n_vocab = len(word_to_id) + 1
embed_dim = 64
input_dim = 64
hidden_size = 32
fc1_dim = 16
output_dim = 1
batch_size = 256
batch_per_epoch = x_train_pad.shape[0] // batch_size

train_padded, train_y = x_train_pad.to(device).long(), torch.tensor(y_train, dtype=torch.float).to(device)
val_padded, val_y = x_test_pad.to(device).long(), torch.tensor(y_test, dtype=torch.float).to(device)

lstm_classifier = LSTMClassifier(n_vocab, embed_dim, input_dim, hidden_size, fc1_dim, output_dim).to(device)
loss_function = nn.BCELoss()
learning_rate = 0.001 
optimizer = optim.Adam(lstm_classifier.parameters(), lr=learning_rate)

In [67]:
def train_epoch(x_batch, y_batch):
    optimizer.zero_grad()
    output = lstm_classifier(x_batch)
    #set_trace()
    
    loss = loss_function(output.view(-1), y_batch.view(-1))
    
    correct = 0
    correct += ((output.view(-1) > 0.5).float() == y_batch).float().sum()
    accuracy = correct / y_batch.shape[0]

    loss.backward()
    optimizer.step()
    
    return loss, accuracy

In [68]:
def calculate_accuracy(outputs, labels):
    correct = 0
    #correct += (torch.argmax(F.softmax(outputs, dim=-1), dim=-1) == labels).float().sum()
    correct += ((outputs.view(-1) > 0.5).float() == labels).float().sum()
    return correct / labels.shape[0]

In [79]:
for epoch in range(0, 20):
    train_loss, val_loss = 0, 0
    training_accuracy = 0
    for i in range(batch_per_epoch):
        start = i * batch_size
        x_batch, y_batch = train_padded[start:start+batch_size], train_y[start:start+batch_size]
#         x_batch, y_batch, mask = x_batch.to(device).long(), y_batch.to(device).long(), mask.to(device).long()

        lstm_classifier.train(True)
    
        loss, accuracy = train_epoch(x_batch, y_batch)
        train_loss += loss
        training_accuracy += accuracy
        
    print(f'Epoch {epoch} Loss: {train_loss / (i+1)}')
    print(f'Accuracy at Epoch {epoch} is {training_accuracy / (batch_per_epoch)}')
    
    lstm_classifier.eval()
    with torch.no_grad():
        output_val = lstm_classifier(val_padded)
        
        loss_val = loss_function(output_val.view(-1), val_y.view(-1))
        
        accuracy = calculate_accuracy(output_val, val_y)
        
        print(f'Epoch {epoch} Val loss: {loss_val}')
        print(f'Accuracy at Epoch {epoch} is {accuracy}')
        
    print()

Epoch 0 Loss: 0.6935608983039856
Accuracy at Epoch 0 is 0.5048694014549255
Epoch 0 Val loss: 0.6932814121246338
Accuracy at Epoch 0 is 0.5023199915885925

Epoch 1 Loss: 0.693334698677063
Accuracy at Epoch 1 is 0.5020334124565125
Epoch 1 Val loss: 0.693285346031189
Accuracy at Epoch 1 is 0.5019199848175049

Epoch 2 Loss: 0.6932961344718933
Accuracy at Epoch 2 is 0.501819372177124
Epoch 2 Val loss: 0.6932485699653625
Accuracy at Epoch 2 is 0.5018399953842163

Epoch 3 Loss: 0.693311870098114
Accuracy at Epoch 3 is 0.5010969638824463
Epoch 3 Val loss: 0.6931638717651367
Accuracy at Epoch 3 is 0.5019199848175049

Epoch 4 Loss: 0.6933209896087646
Accuracy at Epoch 4 is 0.4978595972061157
Epoch 4 Val loss: 0.6931223273277283
Accuracy at Epoch 4 is 0.502079963684082

Epoch 5 Loss: 0.6932105422019958
Accuracy at Epoch 5 is 0.5007491111755371
Epoch 5 Val loss: 0.6931169629096985
Accuracy at Epoch 5 is 0.5023999810218811

Epoch 6 Loss: 0.6931852698326111
Accuracy at Epoch 6 is 0.5006688833236694


In [37]:
b[0].shape

torch.Size([1, 2, 128])

In [38]:
b[1].shape

torch.Size([1, 2, 128])