In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F  
import torchvision
import matplotlib.pyplot as plt
from torchvision import transforms
transforms = torchvision.transforms
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
import gensim
from gensim import corpora

nltk.download(['punkt','wordnet','stopwords'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# Device configuration

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
# downloading dataset
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/IMDB Dataset.csv')

In [17]:
print(df.columns)
df.describe()

Index(['review', 'sentiment'], dtype='object')


Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


In [18]:
zero_1 = pd.get_dummies(df['sentiment'])
df = df.drop('sentiment', axis=1) 
df = df.join(zero_1)
df.head()

Unnamed: 0,review,negative,positive
0,One of the other reviewers has mentioned that ...,0,1
1,A wonderful little production. <br /><br />The...,0,1
2,I thought this was a wonderful way to spend ti...,0,1
3,Basically there's a family where a little boy ...,1,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",0,1


In [19]:
def text_processing(text):
  words=word_tokenize(text)

  words_without_punc=[]
  for w in words:
    if w.isalpha():
      words_without_punc.append(w.lower())

  clean_words=[]
  stop_words = set(stopwords.words("english"))
  for w in words_without_punc:
    if w not in stop_words:
      clean_words.append(w)
    
  lemmatized_words=[]
  for w in clean_words:
    lemmatized_words.append(WordNetLemmatizer().lemmatize(w))
  return ' '.join(lemmatized_words)

In [20]:
df['review'] = df['review'].apply(text_processing)
df.to_csv('/content/drive/MyDrive/NLPlay_with_Transformers/pre_processed_data.csv', index=False)
df.head()

Unnamed: 0,review,negative,positive
0,one reviewer mentioned watching oz episode hoo...,0,1
1,wonderful little production br br filming tech...,0,1
2,thought wonderful way spend time hot summer we...,0,1
3,basically family little boy jake think zombie ...,1,0
4,petter mattei love time money visually stunnin...,0,1


In [21]:
X = df.iloc[:, 0].values
Y = df.iloc[:, 2].values

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, shuffle=True, random_state=60, test_size=0.20)
print(len(X_train))
print(len(X_test))

40000
10000


In [None]:
count_vectorizer = CountVectorizer()
bag_of_words = count_vectorizer.fit_transform(X_train)
feature_names = count_vectorizer.get_feature_names()

In [24]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train= tfidf_vectorizer.fit_transform(X_train)
feature_names = tfidf_vectorizer.get_feature_names()
X_train=pd.DataFrame(X_train.toarray())
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4960,4961,4962,4963,4964,4965,4966,4967,4968,4969,4970,4971,4972,4973,4974,4975,4976,4977,4978,4979,4980,4981,4982,4983,4984,4985,4986,4987,4988,4989,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.056589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
class ffnn(nn.Module):
    def __init__(self,input_dim):
        
        super(ffnn,self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2=nn.Linear(256,64)
        self.fc3= nn.Linear(64, 1)

    def forward(self, x):
        out = self.fc1(x)
        out=F.relu(out)
        out = self.fc2(out)
        out=F.relu(out)
        out = self.fc3(out)
        return out

In [26]:
input_dim = X_train.shape[1]
model=ffnn(input_dim)
n_epocs = 3
n_samples=30000
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [27]:
correct=0
train_loss=0
for n in range(n_epocs):
  print("Epoch completed: " + str(n)+" total_loss: "+str(train_loss)+" acc: "+str(correct/30000))
  correct=0
  train_loss=0
  for index, row in X_train.iterrows():
    optimizer.zero_grad()
    input=torch.tensor(row.values, requires_grad=True)
    input=input.reshape((1,input_dim))
    output = model(input.float())
    loss = criterion(output[0][0],torch.tensor(y_train[index]).float())
    train_loss+=loss
    output = (output.data[0][0]>0.5).float()
    correct += (output ==torch.tensor(y_train[index]).float()).float()
    if (index+1) % 10000 == 0:
      with torch.no_grad():
        print("samples_trained: " + str(n*30000+index+1)+" total_loss: "+str(train_loss/(index+1))+" acc: "+str(correct/(index+1)))
    loss.backward()
    optimizer.step()

Epoch completed: 0 total_loss: 0 acc: 0.0
samples_trained: 10000 total_loss: tensor(0.2356) acc: tensor(0.5786)
samples_trained: 20000 total_loss: tensor(0.1794) acc: tensor(0.7060)
samples_trained: 30000 total_loss: tensor(0.1558) acc: tensor(0.7552)
samples_trained: 40000 total_loss: tensor(0.1421) acc: tensor(0.7825)
Epoch completed: 1 total_loss: tensor(5685.1719, grad_fn=<AddBackward0>) acc: tensor(1.0433)
samples_trained: 40000 total_loss: tensor(0.0949) acc: tensor(0.8725)
samples_trained: 50000 total_loss: tensor(0.0890) acc: tensor(0.8827)
samples_trained: 60000 total_loss: tensor(0.0879) acc: tensor(0.8846)
samples_trained: 70000 total_loss: tensor(0.0867) acc: tensor(0.8862)
Epoch completed: 2 total_loss: tensor(3469.7385, grad_fn=<AddBackward0>) acc: tensor(1.1816)
samples_trained: 70000 total_loss: tensor(0.0821) acc: tensor(0.8911)
samples_trained: 80000 total_loss: tensor(0.0789) acc: tensor(0.8962)
samples_trained: 90000 total_loss: tensor(0.0784) acc: tensor(0.8971)
sa