In [1]:
import numpy as np
import pandas as pd

PATH = './../movie_reviews/'

import os
train_pos_names = os.listdir(PATH + 'aclImdb/train/pos')
train_neg_names = os.listdir(PATH + 'aclImdb/train/neg')
test_pos_names = os.listdir(PATH + 'aclImdb/test/pos')
test_neg_names = os.listdir(PATH + 'aclImdb/test/neg')

In [2]:
y_train = [int(s.split('_')[1].split('.')[0]) for s in train_pos_names] + \
          [int(s.split('_')[1].split('.')[0]) for s in train_neg_names]
    
y_test = [int(s.split('_')[1].split('.')[0]) for s in test_pos_names] + \
          [int(s.split('_')[1].split('.')[0]) for s in test_neg_names]

In [3]:
train = []
test = []

for name in train_pos_names:
    with open(PATH + 'aclImdb/train/pos/' + name) as f:
        train.append(f.read())
        
for name in train_neg_names:
    with open(PATH + 'aclImdb/train/neg/' + name) as f:
        train.append(f.read())
        
for name in test_pos_names:
    with open(PATH + 'aclImdb/test/pos/' + name) as f:
        test.append(f.read())
        
for name in test_neg_names:
    with open(PATH + 'aclImdb/test/neg/' + name) as f:
        test.append(f.read())

In [4]:
from sklearn.model_selection import train_test_split

train, val, y_train, y_val = train_test_split(train, y_train, shuffle=True, test_size=0.2)

In [5]:
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences

class Tokenizer:
    def __init__(self, data, maxlen=1024):
        self.maxlen = maxlen
        text = ' '.join(data).lower()
        text = re.sub(r'[^\w\s]','',text)
        words = set(text.split())
        self.num_tokens = len(words) + 4
        self.w2i = {'<unk>': 0, '<BOS>': 1, '<EOS>': 2, '<pad>': 3}
        self.i2w = ['<unk', '<BOS>', '<EOS>', '<pad>']
        for w in words:
            self.w2i[w] = len(self.i2w)
            self.i2w.append(w)
    
    def tokenize(self, data):
        output = []
        for sent in data:
            text = re.sub(r'[^\w\s]','',sent.lower())
            output.append([1])
            for w in text.split():
                if w in self.w2i:
                    output[-1].append(self.w2i[w])
                else:
                    output[-1].append(0)
            output[-1].append(2)
        return pad_sequences(output, maxlen=self.maxlen, value=3)

In [6]:
tok = Tokenizer(train, maxlen=32)

In [7]:
train_tok = tok.tokenize(train)
val_tok = tok.tokenize(val)
test_tok = tok.tokenize(test)

In [8]:
from torch import nn

In [24]:
import torch

class SimpleModel(nn.Module):
    def __init__(self, num_tokens, emb_size=16, hid_size=64):
        super(self.__class__, self).__init__()
        self.emb = nn.Embedding(num_tokens, emb_size)
        self.lstm = nn.LSTM(emb_size, hid_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2 * hid_size, 1)
        
    def forward(self, x):
        x = x.type(torch.LongTensor)
        emb = self.emb(x)
        h, _ = self.lstm(emb)
        estimate = self.fc(h[:, -1, :])
        return torch.flatten(estimate)

In [10]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, data, y):
        self.data = data
        self.y = y
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx], self.y[idx]

In [12]:
train_dset = TextDataset(train_tok, y_train)
val_dset = TextDataset(val_tok, y_val)
test_dset = TextDataset(test_tok, y_test)

In [13]:
train_loader = DataLoader(train_dset, batch_size=16)
val_loader = DataLoader(val_dset, batch_size=16)
test_loader = DataLoader(test_dset, batch_size=16)

In [25]:
def compute_quality(model, loader):
    model.eval()
    
    loss_acum = 0.
    correct = total = 0
    
    for i_step, (x, y) in enumerate(loader):
        pred = model(x)
        loss_val = loss(pred, y)
        loss_acum += loss_val
        
        pred_rounded = torch.round(pred)
        correct += torch.sum(pred_rounded == y)
        total += y.shape[0]
        
    return loss_acum / (i_step + 1), float(correct) / total

def train_model(model, train_loader, val_loader, epochs, optimizer, loss, scheduler=None):
    loss_history = []
    acc_history = []
    
    for epoch in range(epochs):
        model.train()
        
        loss_acum = 0.
        correct = total = 0
        
        for i_step, (x, y) in enumerate(train_loader):
            pred = model(x)
            loss_val = loss(pred, y.type(torch.FloatTensor))
            loss_acum += loss_val
            
            optimizer.zero_grad()
            loss_val.backward()
            optimizer.step()
            
            pred_rounded = torch.round(pred)
            correct += torch.sum(pred_rounded == y)
            total += y.shape[0]
        
        if scheduler != None:
            scheduler.step()
            
        train_acc = float(correct) / total
        train_loss = loss_acum / (i_step + 1)
        
        val_loss, val_acc = compute_quality(model, val_loader)
        loss_history.append(val_loss)
        acc_history.append(val_acc)
        
        print(f'Epoch {epoch}\nTrain loss: {train_loss}\nTrain accuracy: {train_acc}')
        print(f'Val loss: {val_loss}\nVal accuracy: {val_acc}')
    return loss_history, acc_history

In [26]:
model = SimpleModel(tok.num_tokens, 2, 4)
loss = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

train_hist, val_hist = train_model(model, train_loader, val_loader, 5, optimizer, loss)

Epoch 0
Train loss: 16.60061264038086
Train accuracy: 0.0418
Val loss: 12.0274019241333
Val accuracy: 0.0
Epoch 1
Train loss: 11.989115715026855
Train accuracy: 0.00015
Val loss: 11.874347686767578
Val accuracy: 0.0056
Epoch 2
Train loss: 11.154111862182617
Train accuracy: 0.022
Val loss: 10.992056846618652
Val accuracy: 0.0386
Epoch 3
Train loss: 10.042634010314941
Train accuracy: 0.0832
Val loss: 10.533951759338379
Val accuracy: 0.0846
Epoch 4
Train loss: 9.219391822814941
Train accuracy: 0.10255
Val loss: 10.191737174987793
Val accuracy: 0.1022
