In [70]:
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt

from pathlib import Path

import torch
from torch import nn
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()
%matplotlib inline

import process_text
import split_data
import IMDB_dataset
import model
import training
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [71]:
# load reviews
pos_path = Path('/data/analytics/naveen.bansal/pytorch/data/aclImdb/train/pos')
neg_path = Path('/data/analytics/naveen.bansal/pytorch/data/aclImdb/train/neg')

pos = [open(x).read() for x in pos_path.iterdir() if x.is_file()]
neg = [open(x).read() for x in neg_path.iterdir() if x.is_file()]
labels = [1]*len(pos) + [0]*len(neg)
reviews = pos+neg
df = pd.DataFrame(columns=('review_raw','label'))
df['review_raw'] = reviews
df['label'] = labels

In [72]:
df['review_processed'] = df['review_raw'].apply(lambda x: process_text.preprocess_text(x))
vocab_to_int = process_text.get_vocab_to_int(df['review_processed'].tolist())
df['review_encoded'] = df['review_processed'].apply(lambda x: process_text.encode_sent(x,vocab_to_int))
df['review_encoded_padded'] = df['review_encoded'].apply(lambda x: process_text.pad_features(x,500))

In [73]:
df.head()

Unnamed: 0,review_raw,label,review_processed,review_encoded,review_encoded_padded
0,Zentropa has much in common with The Third Man...,1,zentropa much common third man another noirlik...,"[12826, 13, 992, 718, 49, 64, 42920, 3, 170, 6...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Zentropa is the most original movie I've seen ...,1,zentropa original movie ive seen years like un...,"[12826, 101, 2, 97, 33, 59, 5, 813, 3020, 3976...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Lars Von Trier is never backward in trying out...,1,lars von trier never backward trying new techn...,"[8274, 2422, 7163, 35, 11089, 156, 65, 3257, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,*Contains spoilers due to me having to describ...,1,contains spoilers due describe film techniques...,"[1161, 1065, 543, 1439, 3, 3257, 214, 35975, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,That was the first thing that sprang to mind a...,1,first thing sprang mind watched closing credit...,"[20, 61, 31480, 241, 181, 2498, 783, 5494, 25,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [74]:
# split train, validation, test sets
df_train, df_val, df_test = \
    split_data.split_stratified_into_train_val_test(df, stratify_colname='label', frac_train=0.70, frac_val=0.29, frac_test=0.01)

In [75]:
# creating dataset and dataloader
batch_size=64
train_loader = IMDB_dataset.get_dataloader(df_train,'review_encoded_padded','label',batch_size=batch_size)
val_loader   = IMDB_dataset.get_dataloader(df_val,'review_encoded_padded','label',batch_size=batch_size)
test_loader  = IMDB_dataset.get_dataloader(df_test,'review_encoded_padded','label',batch_size=batch_size)

In [76]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [77]:
# Instantiate the LSTM model w/ hyperparams
n_vocab = len(vocab_to_int)+1 # +1 for the 0 padding
output_size = 2
embedding_size = 50
hidden_state_size = 256
num_layers = 2
net = model.UserActivityModel(n_vocab, embedding_size, num_layers, hidden_state_size,output_size, drop_prob=0.3,use_gpu=True)
net.to(device)
print(net)

UserActivityModel(
  (embedding): Embedding(117846, 50)
  (lstm): LSTM(50, 256, num_layers=2, batch_first=True, dropout=0.3)
  (dense): Linear(in_features=256, out_features=2, bias=True)
  (softmax): Softmax(dim=1)
)


In [78]:
# loss and optimization functions
lr=0.001
epochs=10
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
tb = SummaryWriter()

In [79]:
# train and validation loop
training.train(tb,epochs,net,train_loader,val_loader,batch_size,optimizer,criterion,device)

06:20:07.656452:Epoch:0 0.6898624739804111 0.5250686813186813 0.6894782159180768 0.5460453539823009
06:20:32.825113:Epoch:1 0.6599879366355937 0.6073145604395604 0.5947488232523994 0.6966261061946902
06:20:55.951204:Epoch:2 0.5459687084287077 0.7539491758241759 0.52079620614516 0.781941371681416
06:21:19.551697:Epoch:3 0.47462257502716537 0.8325892857142857 0.49033811029079744 0.8151272123893806
06:21:43.029635:Epoch:4 0.4393374134987702 0.8696771978021978 0.5180963451883435 0.7876106194690266
06:22:06.357882:Epoch:5 0.41631144274285425 0.8942307692307693 0.468198812113399 0.8389103982300885
06:22:32.132458:Epoch:6 0.40540621830866885 0.9057921245421245 0.46618670726244427 0.8402931415929203
06:22:57.208441:Epoch:7 0.4015399124814477 0.909283424908425 0.4657689609886271 0.8419524336283186
06:23:20.734009:Epoch:8 0.3784359201188489 0.9339514652014652 0.4632963886303184 0.8462389380530974
06:23:45.940905:Epoch:9 0.3763459949266343 0.9362408424908425 0.46645842795878384 0.8419524336283186

In [114]:
# test set accuracy
net.eval()
test_accuracy=[]
test_losses=[]
for batch_indx,data in enumerate(test_loader):

    hidden = net.zero_state(batch_size)

    inputs, labels = data['x'].to(device), data['y'].to(device)

    out = net.forward(inputs, hidden)

    loss = criterion(out,labels.flatten())
    test_losses.append(loss.item())

    y_pred = torch.argmax(out,dim=1)
    accuracy = (y_pred==labels.long().squeeze()).sum().item()/y_pred.shape[0]
    test_accuracy.append(accuracy)
print (f"Test loss: {np.mean(test_losses)}, Test Accuracy: {np.mean(test_accuracy)}")

Test loss: 0.4947056869665782, Test Accuracy: 0.8125


In [91]:
def predict_text(text):
    #word_seq = np.array([vocab[preprocess_string(word)] for word in text.split() 
    #                 if preprocess_string(word) in vocab.keys()])
    #word_seq = np.expand_dims(word_seq,axis=0)
    #pad =  torch.from_numpy(padding_(word_seq,500))
    review_processed = process_text.preprocess_text(text)
    review_encoded = process_text.encode_sent(review_processed,vocab_to_int)
    review_encoded_padded = process_text.pad_features(review_encoded,500)
    
    review_encoded_padded = torch.tensor([review_encoded_padded],dtype=torch.long).reshape(1,-1)
    #print (review_encoded_padded)
    inputs = review_encoded_padded.to(device)
    batch_size = 1
    net.eval()
    hidden = net.zero_state(batch_size)
    #h = tuple([each.data for each in h])
    out = net.forward(inputs, hidden)
    #print (out)
    return([out[0][0].item(),out[0][1].item()])

In [101]:
df_test = df_test.reset_index()

In [119]:
index = 195
print(df_test['review_raw'][index])
print('='*70)
print(f'Actual sentiment is  : {df_test["label"][index]}')
print('='*70)
pro = predict_text(df_test['review_raw'][index])
print (pro)
status = "negative" if pro[0] > 0.5 else "positive"
pro = pro[0] if status == "negative" else pro[1]
print(f'Predicted sentiment is {status} with a probability of {pro}')


Mimicking its long title the movie finds ways to come close to the 90' mark. The beautiful sets are here with all that made the Hamer production values a trademark, yet Paris drowned in the fog is a sign of indolent neglect. The story is obvious and can be summed up in a dozen words so there comes nothing unexpected and nothing worth more than 5% of your attention to be expected.<br /><br />The directing is heavy as a direct transfer from the stage play, actors are mostly stiff as wax figures (ok this is a Hamer feature, only it's sometimes better featured in the whole package). My conclusion: this movie is trash, not worth the time I spend that evening. Eternal life is a boring matter and I should have hoped the guys in charge of programming at the Cinemathèque would have known better.
Actual sentiment is  : 0
[0.9893736243247986, 0.010626359842717648]
Predicted sentiment is negative with a probability of 0.9893736243247986
