Please place everything in `drive/MyDrive/nlpkiso`.

In [None]:
from google.colab import drive

drive.mount("/content/drive")


In [None]:
import os

os.chdir('/content/drive/MyDrive/nlpkiso')


In [None]:
import nltk

nltk.download('popular')


In [None]:
# Download news+aggregator data & preprocess it

! bash data/main.sh


`GoogleNews-vectors-negative300.bin` should be downloaded separately.

In [3]:
from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)


In [29]:
# Prepare vocabulary

from collections import Counter

counter = Counter()
with open("data/train.txt") as f:
    for line in f:
        _, title = line.rstrip("\n").split("\t")
        tokens = nltk.tokenize.word_tokenize(title)
        counter.update(tokens)

vocab_in_train_list = sorted(counter.keys(), key=lambda x: counter[x], reverse=True)
vocab_list = ["[UNK]"] + vocab_in_train_list
vocab_dict = {vocab: id for id, vocab in enumerate(vocab_list)}


In [6]:
# Define functions used to prepare datasets

import torch

categories = ["b", "t", "e", "m"]


def sent_to_ids(sent):
    return torch.tensor(
        [vocab_dict[x if x in vocab_dict else "[UNK]"] for x in sent], dtype=torch.long
    )


def dataset_to_ids(dataset):
    return [sent_to_ids(x) for x in dataset]


def read_feature_dataset(filename):
    with open(filename) as f:
        dataset = f.read().splitlines()
    dataset = [line.split("\t") for line in dataset]
    dataset_t = [categories.index(line[0]) for line in dataset]
    dataset_x = [nltk.tokenize.word_tokenize(line[1]) for line in dataset]
    return dataset_x, torch.tensor(dataset_t, dtype=torch.long)


def init_embed(embed):
    for i, token in enumerate(vocab_list):
        if token in vectors:
            embed.weight.data[i] = torch.from_numpy(vectors[token])
    return embed


class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, data_ids, data_y, phase="train"):
        self.X = data_ids
        self.y = data_y
        self.phase = phase

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        inputs = torch.tensor(self.X[idx])
        return inputs, self.y[idx]


def collate_fn(batch):
    sequences = [x[0] for x in batch]
    labels = torch.LongTensor([x[1] for x in batch])
    x = torch.nn.utils.rnn.pad_sequence(
        sequences, batch_first=True, padding_value=PADDING_IDX
    )
    return x, labels


In [30]:
# Create dataset

train_X, train_y = read_feature_dataset("data/train.txt")
valid_X, valid_y = read_feature_dataset("data/valid.txt")
test_X, test_y = read_feature_dataset("data/test.txt")

train_ids = dataset_to_ids(train_X)
valid_ids = dataset_to_ids(valid_X)
test_ids = dataset_to_ids(test_X)

train_dataset = NewsDataset(train_ids, train_y, phase="train")
valid_dataset = NewsDataset(valid_ids, valid_y, phase="valid")
test_dataset = NewsDataset(test_ids, test_y, phase="test")


In [19]:
# Define parameters of RNN and LSTM

import numpy as np
import src.models as models
from src.set_seed import seed_everything, seed_worker

seed = 42
seed_everything(seed)

g = torch.Generator()
g.manual_seed(seed)

VOCAB_SIZE = len(set(vocab_dict.values())) + 2
EMB_SIZE = 300
PADDING_IDX = len(set(vocab_dict.values())) + 1
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50
NUM_LAYERS = 1

weights = np.zeros((VOCAB_SIZE, EMB_SIZE))
words_in_pretrained = 0
for i, word in enumerate(vocab_dict.keys()):
    try:
        weights[i] = vectors[word]
        words_in_pretrained += 1
    except KeyError:
        weights[i] = np.random.normal(scale=0.1, size=(EMB_SIZE,))
weights = torch.from_numpy(weights.astype((np.float32)))


In [21]:
# Create models

rnn_model = models.RNN(
    VOCAB_SIZE, EMB_SIZE, PADDING_IDX, HIDDEN_SIZE, OUTPUT_SIZE, NUM_LAYERS, weights
)
lstm_model = models.LSTM(
    VOCAB_SIZE, EMB_SIZE, PADDING_IDX, HIDDEN_SIZE, OUTPUT_SIZE, NUM_LAYERS, weights
)
print(rnn_model)
print(lstm_model)


RNN(
  (emb): Embedding(18833, 300, padding_idx=18832)
  (rnn): RNN(300, 50, batch_first=True)
  (fc): Linear(in_features=50, out_features=4, bias=True)
)
LSTM(
  (emb): Embedding(18833, 300, padding_idx=18832)
  (lstm): LSTM(300, 50, batch_first=True)
  (fc): Linear(in_features=50, out_features=4, bias=True)
)


In [22]:
# Create dataloader

import torch.utils.data as data

batch_size = 8

train_dataloader = data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    worker_init_fn=seed_worker,
    generator=g,
)
valid_dataloader = data.DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
    worker_init_fn=seed_worker,
    generator=g,
)
test_dataloader = data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
    worker_init_fn=seed_worker,
    generator=g,
)

dataloaders_dict = {
    "train": train_dataloader,
    "val": valid_dataloader,
    "test": test_dataloader,
}


In [25]:
# Train RNN model

import torch.nn as nn
from src.trainer import train_model

rnn_model.train()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(rnn_model.parameters(), lr=0.01, momentum=0.9)

num_epochs = 10

rnn_reports = dict()
(
    rnn_reports["train_loss"],
    rnn_reports["train_acc"],
    rnn_reports["valid_loss"],
    rnn_reports["valid_acc"],
) = train_model(
    rnn_model, dataloaders_dict, criterion, optimizer, num_epochs=num_epochs
)


Tesla T4
device: cuda:0


  inputs = torch.tensor(self.X[idx])


Epoch 1 / 10 (train) Loss: 1.2120, Acc: 0.4418, (val) Loss: 1.1499, Acc: 0.4708
Epoch 2 / 10 (train) Loss: 1.2349, Acc: 0.4493, (val) Loss: 1.2537, Acc: 0.3990
Epoch 3 / 10 (train) Loss: 1.2468, Acc: 0.4520, (val) Loss: 1.2496, Acc: 0.4214
Epoch 4 / 10 (train) Loss: 1.2451, Acc: 0.4227, (val) Loss: 1.3605, Acc: 0.3960
Epoch 5 / 10 (train) Loss: 1.2812, Acc: 0.4149, (val) Loss: 1.2505, Acc: 0.4214
Epoch 6 / 10 (train) Loss: 1.2578, Acc: 0.4179, (val) Loss: 1.3009, Acc: 0.3960
Epoch 7 / 10 (train) Loss: 1.2553, Acc: 0.4277, (val) Loss: 1.7413, Acc: 0.3960
Epoch 8 / 10 (train) Loss: 1.2692, Acc: 0.4265, (val) Loss: 1.1621, Acc: 0.3915
Epoch 9 / 10 (train) Loss: 1.2521, Acc: 0.4354, (val) Loss: 1.2210, Acc: 0.3960
Epoch 10 / 10 (train) Loss: 1.2552, Acc: 0.4190, (val) Loss: 1.2011, Acc: 0.4528


In [26]:
# Train LSTM model

lstm_model.train()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(lstm_model.parameters(), lr=0.01, momentum=0.9)

num_epochs = 10
lstm_reports = dict()
(
    lstm_reports["train_loss"],
    lstm_reports["train_acc"],
    lstm_reports["valid_loss"],
    lstm_reports["valid_acc"],
) = train_model(
    lstm_model, dataloaders_dict, criterion, optimizer, num_epochs=num_epochs
)


Tesla T4
device: cuda:0


  inputs = torch.tensor(self.X[idx])


Epoch 1 / 10 (train) Loss: 0.7401, Acc: 0.7054, (val) Loss: 0.5043, Acc: 0.8069
Epoch 2 / 10 (train) Loss: 0.4468, Acc: 0.8390, (val) Loss: 0.7603, Acc: 0.6916
Epoch 3 / 10 (train) Loss: 0.3494, Acc: 0.8800, (val) Loss: 0.3390, Acc: 0.8780
Epoch 4 / 10 (train) Loss: 0.2969, Acc: 0.8997, (val) Loss: 0.3268, Acc: 0.8937
Epoch 5 / 10 (train) Loss: 0.2667, Acc: 0.9105, (val) Loss: 0.3592, Acc: 0.8728
Epoch 6 / 10 (train) Loss: 0.2395, Acc: 0.9179, (val) Loss: 0.3877, Acc: 0.8488
Epoch 7 / 10 (train) Loss: 0.2239, Acc: 0.9255, (val) Loss: 0.3363, Acc: 0.8787
Epoch 8 / 10 (train) Loss: 0.2077, Acc: 0.9325, (val) Loss: 0.2616, Acc: 0.9132
Epoch 9 / 10 (train) Loss: 0.1910, Acc: 0.9351, (val) Loss: 0.2744, Acc: 0.9019
Epoch 10 / 10 (train) Loss: 0.1818, Acc: 0.9393, (val) Loss: 0.2932, Acc: 0.8975


In [None]:
# Visualize results

from src.visualizer import visualize

visualize(rnn_reports, lstm_reports)
