In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F  
import torchvision
import matplotlib.pyplot as plt
from torchvision import transforms
transforms = torchvision.transforms
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import gensim
from gensim import corpora

nltk.download(['punkt','wordnet','stopwords'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Device configuration

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# downloading dataset
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/IMDB Dataset.csv')

In [5]:
print(df.columns)
df.describe()

Index(['review', 'sentiment'], dtype='object')


Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [6]:
# binary sentiments
df['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

## **TEXT PROCESSING**

#### Removing Stopwords and Punctuations

In [7]:
stop = stopwords.words('english')
df['reduced_text'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# stopwords eliminated

In [8]:
def remove_punctuation(text):
    final = "".join(p for p in text if p not in ("?", ".", ";", ":", "!", '"', ','))
    return final

df['reduced_text'] = df['reduced_text'].apply(remove_punctuation)

# punctuations removed

#### Removing html tags and lowering case

In [9]:
def tag_removal(text):
    sum = ''
    include = True
    for char in text:
        if char == '<':
            include = False
        if (include): sum = sum+char
        if char == '>':
            include = True
            sum = sum + ' '
    return sum

df['reduced_text'] = df['reduced_text'].apply(tag_removal)

# html tags removed

In [10]:
df['reduced_text'][0]

"One reviewers mentioned watching 1 Oz episode hooked They right exactly happened me  The first thing struck Oz brutality unflinching scenes violence set right word GO Trust me show faint hearted timid This show pulls punches regards drugs sex violence Its hardcore classic use word  It called OZ nickname given Oswald Maximum Security State Penitentary It focuses mainly Emerald City experimental section prison cells glass fronts face inwards privacy high agenda Em City home manyAryans Muslims gangstas Latinos Christians Italians Irish moreso scuffles death stares dodgy dealings shady agreements never far away  I would say main appeal show due fact goes shows dare Forget pretty pictures painted mainstream audiences forget charm forget romanceOZ mess around The first episode I ever saw struck nasty surreal I say I ready it I watched more I developed taste Oz got accustomed high levels graphic violence Not violence injustice (crooked guards who'll sold nickel inmates who'll kill order get 

In [11]:
text_cleaning_repo = "@\S+|https?:\S+|http?:\S+|[^A-Za-z0-9]+"

In [12]:
def clean_text(text):
  text = re.sub(text_cleaning_repo,' ', str(text).lower()).strip()
  tokens = []
  for token in text.split():
    tokens.append(token)
  return " ".join(tokens)

df["reduced_text"] = df["reduced_text"].apply(clean_text)

#### Lemmatization and Tokenization

In [13]:
lemma = WordNetLemmatizer()
def lemmatize_text(text):
  return " ".join([lemma.lemmatize(word) for word in text.split()])

df["reduced_text"]  = df["reduced_text"].apply(lambda text: lemmatize_text(str(text)))

# lemmatization complete 

In [14]:
df["tokenized_text"] = df["reduced_text"].apply(lambda text: word_tokenize(text))

# all documents tokenized

In [15]:
df['label'] = [2*(sent=='positive')-1 for sent in df['sentiment']]
df.head()

Unnamed: 0,review,sentiment,reduced_text,tokenized_text,label
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching 1 oz episode h...,"[one, reviewer, mentioned, watching, 1, oz, ep...",1
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production the filming tech...,"[a, wonderful, little, production, the, filmin...",1
2,I thought this was a wonderful way to spend ti...,positive,i thought wonderful way spend time hot summer ...,"[i, thought, wonderful, way, spend, time, hot,...",1
3,Basically there's a family where a little boy ...,negative,basically there s family little boy jake think...,"[basically, there, s, family, little, boy, jak...",-1
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love time money visually stunn...,"[petter, mattei, s, love, time, money, visuall...",1


#### Splitting the dataset into train and test set

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(df[['tokenized_text']], df['sentiment'],shuffle=True,test_size=0.1,random_state=10)

X_train = X_train.reset_index()
X_test = X_test.reset_index()
Y_train = Y_train.to_frame()
Y_train = Y_train.reset_index()
Y_test = Y_test.to_frame()
Y_test = Y_test.reset_index()

# **FEED FORWARDED MODEL**
Creating a NN model with 3 hidden layers

In [17]:
df.head()

Unnamed: 0,review,sentiment,reduced_text,tokenized_text,label
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching 1 oz episode h...,"[one, reviewer, mentioned, watching, 1, oz, ep...",1
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production the filming tech...,"[a, wonderful, little, production, the, filmin...",1
2,I thought this was a wonderful way to spend ti...,positive,i thought wonderful way spend time hot summer ...,"[i, thought, wonderful, way, spend, time, hot,...",1
3,Basically there's a family where a little boy ...,negative,basically there s family little boy jake think...,"[basically, there, s, family, little, boy, jak...",-1
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love time money visually stunn...,"[petter, mattei, s, love, time, money, visuall...",1


In [18]:
class FeedforwardNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeedforwardNN, self).__init__()
        
        # First Hidden Layer
        self.f1 = nn.Linear(input_dim, hidden_dim) 
        self.relu1 = nn.ReLU()                     #this adds non-linearity

        # Second Hidden Layer
        self.f2 = nn.Linear(hidden_dim, hidden_dim) 
        self.relu2 = nn.ReLU()

        # Third Hidden Layer
        self.f3 = nn.Linear(hidden_dim, hidden_dim) 
        self.relu3 = nn.ReLU()

        # Output Layer
        self.f4 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self,x):
        out = self.f1(x)
        out = self.relu1(out)

        out = self.relu2(self.f2(out))

        out = self.relu3(self.f3(out))

        out = self.f4(out)

        return F.softmax(out, dim=1)

#### Generating input and label tensor

In [19]:
def make_dict(df):
  review_dict = corpora.Dictionary(df["tokenized_text"])
  return review_dict

review_dict = make_dict(df)

# dictionary created

In [20]:
# hyperparameters

VOCAB_SIZE = len(review_dict)
input_dim = VOCAB_SIZE
hidden_dim = 500
output_dim = 2
num_epochs = 2
learning_rate=0.1

In [21]:
# Function to make bow vector to be used as input to network
def make_bow_vector(review_dict, sentence):
    vec = torch.zeros(VOCAB_SIZE, dtype=torch.float64, device=device)
    for word in sentence:
        vec[review_dict.token2id[word]] += 1
    return vec.view(1, -1).float()

In [22]:
# Function to get the output tensor
def make_target(label):
    if label == 1:
        return torch.tensor([1], dtype=torch.long, device=device)
    else: return torch.tensor([0], dtype=torch.long, device=device)


#### Training FNN model

Hyperparameters: <hr>

VOCAB_SIZE = len(review_dict) i.e. 111692 <br>
input_dim = VOCAB_SIZE = 111692 <br>
hidden_dim = 500 <br>
output_dim = 2 <br>
num_epochs = 10 <br>
learning_rate=0.01 <br>
<br>

In [23]:
# model
ff_nn_bow_model = FeedforwardNN(input_dim, hidden_dim, output_dim).to(device)

# loss and optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(ff_nn_bow_model.parameters(), lr=learning_rate)

In [24]:
#writing the loss value in a file
ffnn_loss_file_name = 'ffnn_loss.csv'
f = open(ffnn_loss_file_name,'w')
f.write('iter,loss')
f.write('\n'    )
losses=[]
iter=0

In [None]:
#starting training
a=0
for epoch in range(num_epochs):
  train_loss=0
  for index, row in X_train.iterrows():
    #Clearing the accumulated gradients
    optimizer.zero_grad()

    # Make the bag of words vector for stemmed tokens 
    bow_vec = make_bow_vector(review_dict, row["tokenized_text"])
      
    #Forward pass to get output
    probs = ff_nn_bow_model(bow_vec)

    #Getting the target label
    target = make_target(Y_train["sentiment"][index])

    #Calculate loss
    loss = loss_function(probs,target)
    
    #Accumulating the loss over time
    train_loss += loss.item()
    
    #Getting gradients wrt paramters:-
    loss.backward()

    #Updating Paramters
    optimizer.step()

    if a%1000 == 0: print(epoch)

    if (a+1)%10000 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Step [{a+1}/{input_dim}], Loss: {loss.item():.4f}')

    a+=1

  f.write(str(epoch+1) + "," + str(train_loss/len(X_train))  )
  f.write("\n")
  train_loss=0

f.close()

In [None]:
bow_ff_nn_predictions = []
original_lables_ff_bow = []
with torch.no_grad():
    for index, row in X_test.iterrows():
        bow_vec = make_bow_vector(review_dict, row)
        probs = ff_nn_bow_model(bow_vec)
        bow_ff_nn_predictions.append(torch.argmax(probs, dim=1).cpu().numpy()[0])
        original_lables_ff_bow.append(make_target(Y_test["sentiment"][index]).cpu().numpy()[0])
print(classification_report(original_lables_ff_bow,bow_ff_nn_predictions))
ffnn_loss_df = pd.read_csv(ffnn_loss_file_name)
print(len(ffnn_loss_df))
print(ffnn_loss_df.columns)
ffnn_plt_500_padding_100_epochs = ffnn_loss_df[' loss'].plot()
fig = ffnn_plt_500_padding_100_epochs.get_figure()