In [2]:
from google.colab import drive,files
drive.mount('/content/drive')

Mounted at /content/drive


#Gathering Data

In [4]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

pos_path='./drive/My Drive/reviews-Lab-3/reviews/pos/'
neg_path='./drive/My Drive//reviews-Lab-3/reviews/neg/'
positive_files=os.listdir(pos_path)
negative_files=os.listdir(neg_path)
max_features=200

corpus=[]
labels=[]
no_reviews=500
#Reading 500 documents from positive reviews
for i in range(no_reviews):
  doc=positive_files[i]
  with open(pos_path+doc) as fi:
    corpus.append(fi.read().replace('\n',' '))
    labels.append([1,0])
#Reading 500 documents from negative reviews
for i in range(no_reviews):
  doc=negative_files[i]
  with open(neg_path+doc) as fi:
    corpus.append(fi.read().replace('\n',' '))
    labels.append([0,1])


vectorizer=TfidfVectorizer(max_features=max_features,stop_words='english')
X=vectorizer.fit_transform(corpus)
y=np.array(labels)
#print(corpus[:5])
print(X.shape,y.shape)
print(X[9,14])



(1000, 200) (1000, 2)
0.11642945566934004


#Data Preprocessing

In [5]:
import torch
from sklearn.model_selection import train_test_split

words_length=-1

word_tokenizer=vectorizer.build_tokenizer()
#print(word_tokenizer)
vocab=vectorizer.vocabulary_
#One-hot encoding using TF-IDF value instead of 1 and left-padding with zeroes if length is less than maximum number of words present in vocabulary in a document
words_list=[word_tokenizer(doc_str) for doc_str in corpus]

docs=[]
for i in range(len(words_list)):
  terms=[]
  for j in range(len(words_list[i])):
    word=words_list[i][j]
    if word in vocab:
      terms.append(word)
  if len(terms) > words_length:
    words_length=len(terms)
  docs.append(terms)

datasets=np.zeros((X.shape[0],words_length,max_features))
print(datasets.shape)
print(len(docs))
print(len(terms))
print(words_length)

for i in range(len(docs)):
  #print(len(docs[i]))
  no_padding=words_length-len(docs[i])
  #print(no_padding)

  for j in range(len(docs[i])):
    w=docs[i][j]
    idx=vocab[w]
    tfidf_val=X[i,idx]
    datasets[i,j+no_padding,idx]=tfidf_val

datasets=datasets.astype(np.float32)
y=y.astype(np.float32)
#Creating training and testing data
X_train,X_val,y_train,y_val=train_test_split(datasets,y,test_size=0.2,random_state=2020)
print(X_train.shape,y_train.shape,X_val.shape,y_val.shape)




(1000, 310, 200)
1000
78
310
(800, 310, 200) (800, 2) (200, 310, 200) (200, 2)


#Data Loading

In [6]:
from torch.utils.data import DataLoader,TensorDataset

batch_size=16

training_data=TensorDataset(torch.from_numpy(X_train),torch.from_numpy(y_train))
validation_data=TensorDataset(torch.from_numpy(X_val),torch.from_numpy(y_val))

train_loader=DataLoader(training_data,shuffle=True,batch_size=batch_size)
val_loader=DataLoader(validation_data,shuffle=True,batch_size=batch_size)

#Running the model

In [7]:
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim
class Model(nn.Module):
  def __init__(self, input_size, output_size, hidden_size, n_layers):
    super().__init__()
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    self.rnn = nn.RNN(input_size,hidden_size,n_layers,batch_first=True) # rnn layer
    self.fc1 = nn.Linear(hidden_size,output_size) # rnn output (y_t) --> output (y'_t)
    self.fc2 = nn.Linear(output_size,2) #the output from the last time period ->sentiment prediction
  def forward(self,x, hidden):
    batch_size = x.size()[0]
    hidden = self.init_hidden(batch_size)
    
    rnn_out,hidden = self.rnn(x,hidden)
    rnn_out = self.fc1(rnn_out)
    last_out = rnn_out[:,-1,:].view(batch_size,-1)
    out = F.softmax(self.fc2(last_out))
    return out,hidden 
  def init_hidden(self,batch_size):
    hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size).cuda()
    return hidden
model = Model(200,32,256,3)
print(model)

Model(
  (rnn): RNN(200, 256, num_layers=3, batch_first=True)
  (fc1): Linear(in_features=256, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=2, bias=True)
)


#Training and Validating the model

In [10]:
device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
  model.to(device)

n_epochs=10
lr=1e-5
b=0

criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=lr)
#Model Training
model.train()
for epoch in range(n_epochs):
  hidden=model.init_hidden(batch_size)

  for data,labels in train_loader:
    b+=1
    data,labels=data.to(device),labels.to(device)

    model.zero_grad()
    output,hidden=model(data,hidden)
    pred_output=torch.max(labels,1)[1]
    loss=criterion(output,pred_output)
    loss.backward()
   
    optimizer.step()

    val_hidden=model.init_hidden(batch_size).cuda()
    val_losses=[]
#Model Validation
    model.eval()

    for data,labels in val_loader:
        data,labels=data.to(device),labels.to(device)
        val_outputs,val_hidden=model(data,val_hidden)
        pred_output=torch.max(labels,1)[1]
        val_loss=criterion(val_outputs,pred_output)
        val_losses.append(val_loss.item())

    model.train()

    print('Epoch:{}/{}'.format(epoch,n_epochs), # epoch is the index of epoch
    'Batch:{}'.format(b),  # b is the index of batch
    'Train Loss:{:.5f}'.format(loss.item()),
    'Val Loss:{:.5f}'.format(np.mean(val_losses)))



Epoch:0/10 Batch:1 Train Loss:0.69249 Val Loss:0.69346
Epoch:0/10 Batch:2 Train Loss:0.69641 Val Loss:0.69343
Epoch:0/10 Batch:3 Train Loss:0.69126 Val Loss:0.69335
Epoch:0/10 Batch:4 Train Loss:0.69340 Val Loss:0.69357
Epoch:0/10 Batch:5 Train Loss:0.69600 Val Loss:0.69334
Epoch:0/10 Batch:6 Train Loss:0.69220 Val Loss:0.69334
Epoch:0/10 Batch:7 Train Loss:0.69418 Val Loss:0.69333
Epoch:0/10 Batch:8 Train Loss:0.69338 Val Loss:0.69326
Epoch:0/10 Batch:9 Train Loss:0.69337 Val Loss:0.69342
Epoch:0/10 Batch:10 Train Loss:0.69250 Val Loss:0.69326
Epoch:0/10 Batch:11 Train Loss:0.69390 Val Loss:0.69340
Epoch:0/10 Batch:12 Train Loss:0.69251 Val Loss:0.69329
Epoch:0/10 Batch:13 Train Loss:0.69182 Val Loss:0.69330
Epoch:0/10 Batch:14 Train Loss:0.69384 Val Loss:0.69329
Epoch:0/10 Batch:15 Train Loss:0.69232 Val Loss:0.69329
Epoch:0/10 Batch:16 Train Loss:0.69233 Val Loss:0.69337
Epoch:0/10 Batch:17 Train Loss:0.69319 Val Loss:0.69337
Epoch:0/10 Batch:18 Train Loss:0.69323 Val Loss:0.69349
E