Import libraries

In [26]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import PreTrainedTokenizerFast
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
from nltk.corpus import stopwords
import math
import sys
import os
# Workaround for python's terrible package management system
sys.path.append(os.path.abspath(os.path.join(os.path.dirname("model"), '..')))
from util.calculate_accuracy import calculate_accuracy
from util.calculate_metrics import calculate_metrics
from util.plot_metrics import plot_metrics

Create pytorch dataset 

In [20]:
tokenizer = PreTrainedTokenizerFast(tokenizer_file="../tokenizer/tokenizer.json")
tokenizer.add_special_tokens({'pad_token': '<pad>'})
class MovieDataset(Dataset):
    def __init__(self):
        self.corpus = pd.read_csv('../util/IMDB Dataset.csv')
        self.corpus.drop_duplicates(inplace=True) 
        self.corpus = self.corpus.to_numpy()
        self.stop_words = set(stopwords.words("english"))
        self.X_train = self.corpus[:,0]
        self.Y_train = self.corpus[:,1]

    def __len__(self):
        return len(self.X_train)

    def __getitem__(self, index):
        words = [] 
        for word in self.X_train[index]:
            if(word not in self.stop_words):
                words.append(word)
        cleaned_str = ''.join(words)
        text = tokenizer(cleaned_str, max_length=250, padding='max_length',
                        truncation=True, return_tensors="pt")
        input_ids = text.input_ids.squeeze(0)
        label = 0 if self.Y_train[index] == "negative" else 1
        return input_ids, label

In [23]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            vocab_size + 1, embedding_dim, padding_idx=tokenizer.pad_token_id)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.drop = nn.Dropout(p=0.3)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.rnn(embedded)
        hidden = hidden[-1, :, :]
        mask = self.drop(hidden)
        output = self.fc(mask)
        return output

device = "cuda" if torch.cuda.is_available() else "cpu"

VOCAB_SIZE = tokenizer.vocab_size
EMBEDDING_DIM = 128
NUM_EPOCHS = 10
SEED = 6463

torch.manual_seed(SEED)

model = RNN(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, hidden_dim=256, output_dim=2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
criterion = nn.CrossEntropyLoss()

ds = MovieDataset()
length = ds.__len__()
train_size = math.floor(0.8 * length)
valid_size = math.ceil(0.1*length)
test_size = math.floor(0.1 * length)

train, valid, test = random_split(ds, [train_size,valid_size,test_size])
train_loader = DataLoader(train, shuffle=True, batch_size=1000)
valid_loader = DataLoader(valid, shuffle=False)
test_loader = DataLoader(test, shuffle=False)

39665 4958


In [27]:
train_losses = []
valid_losses = []
f1_scores = []
recalls = []
precisions = []

def train():
    for epoch in range(NUM_EPOCHS):
        model.train()
        total_loss = 0
        total_correct = 0
        total_samples = 0

        for batch, label in train_loader:
            batch, label = batch.to(device), label.to(device)
            optimizer.zero_grad()
            outputs = model(batch)
            loss = criterion(outputs, label)
            loss.backward()
            optimizer.step()

            _, predicted_labels = torch.max(outputs, 1)
            correct_predictions = (predicted_labels == label).sum().item()
            total_correct += correct_predictions
            total_samples += label.size(0)
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        training_accuracy = total_correct / total_samples

        valid_loss, valid_accuracy = calculate_accuracy(model, valid_loader, criterion, device)
        train_losses.append(avg_loss)
        valid_losses.append(valid_loss)

        f1, recall, precision, _ = calculate_metrics(model, valid_loader, device)
        f1_scores.append(f1)
        recalls.append(recall)
        precisions.append(precision)

        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Train acc: {training_accuracy:.4f}, Valid Acc: { valid_accuracy:.4f}, F1: {f1:.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}")
train()

test_loss, test_accuracy = calculate_accuracy(model, test_loader, criterion, device)

f1, recall, precision, confusion_mat = calculate_metrics(model, test_loader, device)

print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}, F1: {f1}, Recall: {recall}, Precision: {precision}")

plot_metrics(train_losses, valid_losses, f1_scores,recalls, precisions, confusion_mat)

# torch.save(model, "model.pt")

Epoch 1/10, Train acc: 0.5060, Valid Acc: 0.5053, F1: 0.4791, Recall: 0.4627, Precision: 0.4967


KeyboardInterrupt: 