In [81]:
import numpy as np

import pandas as pd

from sklearn import svm
from sklearn.preprocessing import StandardScaler

import os

import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# 1. Data set

In [104]:
from pyexpat.errors import XML_ERROR_FEATURE_REQUIRES_XML_DTD
from torch.utils.data import random_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

# Width of the multi-hot label vector built by CustomDataset.authors_to_one_hot;
# only author ids below this value are kept as prediction targets.
NUM_PROLIFIC_AUTHORS = 100
# Passed as `rnn_embed_vocab` when RNN_MLP is built.
# Presumably word ids in titles/abstracts lie in [0, NUM_WORDS) - TODO confirm against the data.
NUM_WORDS = 5000
# Used as the venue embedding vocabulary size; CustomDataset maps the empty-string venue to id 465.
NUM_VENUES = 466 # 465 valid venues + 1 reserved for null venues

class CustomDataset(Dataset):
    """Paper dataset loaded from a JSON file.

    Every item is the tuple ``(x_year, x_venue, x_abstract, x_title, y)``:

    * ``x_year``     - array of shape (1,) holding the publication year value
    * ``x_venue``    - array of shape (1,) holding the venue id
                       (missing venues use the reserved id ``NUM_VENUES - 1``)
    * ``x_abstract`` - array of shape (n_words, 1) of word ids
    * ``x_title``    - array of shape (n_words, 1) of word ids
    * ``y``          - multi-hot integer vector of length NUM_PROLIFIC_AUTHORS

    Args:
        file_name: name of the JSON file inside ``data_dir``.
        data_dir: directory that contains ``file_name``.
        transform, target_transform: accepted for signature compatibility but
            currently ignored.
    """

    def __init__(self, file_name, data_dir, transform=None, target_transform=None):
        # Get data
        self.data_dir = data_dir
        self.data = pd.read_json(os.path.join(data_dir, file_name))

        # Clean data: a paper without a venue ('' in the raw file) gets the
        # reserved last venue id, a new category
        null_venue = NUM_VENUES - 1
        self.data.venue = self.data.venue.map(lambda x: null_venue if x == '' else x)

        # Convert year and venue to column vectors of shape (n_papers, 1)
        self.x_year = self.data.year.to_numpy()[:, np.newaxis]
        self.x_venue = self.data.venue.to_numpy()[:, np.newaxis]

        # Convert abstracts and titles to lists of (n_words, 1) arrays of word ids.
        # The dtype is fixed so that an empty text does not silently become float64.
        self.x_abstract = [
            np.asarray(words, dtype=np.int64)[:, np.newaxis]
            for words in self.data.abstract.to_list()
        ]
        self.x_title = [
            np.asarray(words, dtype=np.int64)[:, np.newaxis]
            for words in self.data.title.to_list()
        ]

        # Raw author-id lists; converted to multi-hot vectors lazily in __getitem__
        self.y = self.data.authors.values

    def __len__(self):
        """Number of papers in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(x_year, x_venue, x_abstract, x_title, y)`` for paper ``idx``."""
        x_year = self.x_year[idx]
        x_venue = self.x_venue[idx]
        x_abstract = self.x_abstract[idx]
        x_title = self.x_title[idx]
        y = self.authors_to_one_hot(self.y[idx])
        return x_year, x_venue, x_abstract, x_title, y

    def authors_to_one_hot(self, authors):
        """Encode a list of author ids as a multi-hot vector.

        Only ids in ``[0, NUM_PROLIFIC_AUTHORS)`` are set; all other ids are
        ignored. An empty list yields an all-zero vector.

        Returns:
            Integer numpy array of shape (NUM_PROLIFIC_AUTHORS,).
        """
        one_hot = np.zeros(NUM_PROLIFIC_AUTHORS, dtype=int)
        # Force an integer dtype: np.array([]) defaults to float64, and a float
        # array is rejected as an index even when it is empty.
        a_array = np.asarray(authors, dtype=np.int64).reshape(-1)
        prolific_a_array = a_array[(a_array >= 0) & (a_array < NUM_PROLIFIC_AUTHORS)]
        one_hot[prolific_a_array] = 1
        return one_hot

# def text_to_one_hot(text):
#     '''
#     Take a piece of text (represented as list of words represented in numbers) and output 
#     a numpy matrix representing the texts, where each row represent a word in one-hot encoded format
#     '''
#     t_array = np.zeros((len(text), NUM_WORDS))
#     t_array[np.arange(len(text)), text] = 1
#     return t_array


dataset = CustomDataset('train.json', './data/')

# 70/30 train/validation split. The validation size is derived from the training
# size so the two always add up to len(dataset): rounding both fractions
# independently can be off by one (e.g. 5 samples -> 4 + 2), and random_split
# rejects lengths that do not sum to the dataset length.
n_train = round(len(dataset) * 0.7)
n_val = len(dataset) - n_train
train_set, val_set = random_split(dataset, [n_train, n_val])

def _pack_variable_length(sequences):
    """Pad a batch of (length, 1) arrays with zeros and pack them with their true lengths."""
    lengths = [len(seq) for seq in sequences]
    padded = pad_sequence([torch.as_tensor(seq) for seq in sequences], padding_value=0)
    # enforce_sorted=False: the batch is not ordered by length, let PyTorch sort internally
    return pack_padded_sequence(padded, lengths, enforce_sorted=False)


def pad_collate(batch):
    """Collate dataset items into one batch for the DataLoader.

    Args:
        batch: sequence of ``(x_year, x_venue, x_abstract, x_title, y)`` items,
            where year/venue/y are fixed-size arrays and abstract/title are
            variable-length arrays of shape (length, 1).

    Returns:
        ``(x_year_tensor, x_venue_tensor, x_abstract_packed, x_title_packed, y_tensor)``
        where the first two and the last are stacked tensors with the batch on
        dimension 0, and abstract/title are ``PackedSequence`` objects
        (time-major padding, padded with 0).
    """
    x_year, x_venue, x_abstract, x_title, y = zip(*batch)

    # Fixed-size fields: stack into a single ndarray first and convert once.
    # Handing torch.tensor a tuple of ndarrays converts it element by element.
    x_year_tensor = torch.as_tensor(np.stack(x_year))
    x_venue_tensor = torch.as_tensor(np.stack(x_venue))
    y_tensor = torch.as_tensor(np.stack(y))

    # Variable-length fields: pad, then pack with the original lengths so the
    # padding can be skipped / recovered later
    x_abstract_packed = _pack_variable_length(x_abstract)
    x_title_packed = _pack_variable_length(x_title)

    return x_year_tensor, x_venue_tensor, x_abstract_packed, x_title_packed, y_tensor
    
BATCH_SIZE = 128

# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate)
# Evaluation gains nothing from shuffling; a fixed order keeps validation runs comparable.
val_dataloader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_collate)

# for _ in range(5):
#     _, _, _, y = next(iter(train_dataloader))
#     print(y)

# 2. Model

## 2.1 NN

In [83]:
import torch.nn.functional as F

'''
Define the NN model tailored for the task
'''


class MLP(nn.Module):
    """Feed-forward multi-label classifier over embedded categorical ids plus extra features.

    Args:
        embed_input_size: number of categorical ids per sample (columns of ``x_embed``).
        other_input_size: number of additional features per sample (columns of ``x``).
        embed_vocab: number of distinct categorical ids.
        embed_size: embedding dimension per id.
        hidden_size1: width of the first hidden layer.
        hidden_size2: width of the second hidden layer.
        out_size: number of output units (one sigmoid probability each).
    """

    def __init__(self, embed_input_size, other_input_size, embed_vocab, embed_size,
        hidden_size1, hidden_size2, out_size):
        super().__init__()

        self.embedding_layer = nn.Embedding(embed_vocab, embed_size)
        self.input_layer = nn.Linear(embed_size*embed_input_size + other_input_size, hidden_size1)
        self.hidden_layer = nn.Linear(hidden_size1, hidden_size2)
        self.output_layer = nn.Linear(hidden_size2, out_size)
        self.dropout = nn.Dropout(0.25)

    def forward(self, x_embed, x):
        """Return per-label probabilities of shape (batch, out_size).

        Args:
            x_embed: integer tensor of shape (batch, embed_input_size) with ids to embed.
            x: tensor of shape (batch, other_input_size); integer inputs are
                cast to the embedding dtype before concatenation.
        """
        embed_out = self.embedding_layer(x_embed)
        # Flatten the new dimension added by embedding: (batch, n_ids, embed) -> (batch, n_ids*embed)
        embed_out = torch.flatten(embed_out, start_dim=1)

        # Cast explicitly so integer features (e.g. a raw year) can be concatenated
        # with the float embeddings regardless of type-promotion rules.
        x = torch.cat((embed_out, x.to(embed_out.dtype)), dim=1)

        x = self.dropout(F.relu(self.input_layer(x)))
        x = self.dropout(F.relu(self.hidden_layer(x)))

        # torch.sigmoid replaces the deprecated F.sigmoid alias
        return torch.sigmoid(self.output_layer(x))

## 2.2 RNN

In [84]:
import torch.nn as nn
import torch.optim as optim

In [106]:
class RNN_MLP(nn.Module):
    """Two sequence encoders (RNNs) whose final hidden states feed an MLP classifier.

    Each RNN consumes one variable-length sequence with a single feature per
    time step (``input_size=1``). The last hidden state of each RNN is
    concatenated with the extra MLP features and passed, together with the
    ids to embed, to ``MLP``.

    Args:
        rnn_embed_vocab, rnn_embed_size: accepted but currently unused; the
            sequences are fed to the RNNs as raw values, not through an embedding.
        rnn_hidden_size: hidden size of each RNN.
        mlp_embed_input_size, mlp_other_input_size, mlp_embed_vocab,
        mlp_embed_size, mlp_hidden_dim1, mlp_hidden_dim2, mlp_out_size:
            forwarded to ``MLP``; the MLP's "other" input is widened by
            ``2 * rnn_hidden_size`` to hold both RNN summaries.
    """

    def __init__(self, rnn_embed_vocab, rnn_embed_size, rnn_hidden_size,
        mlp_embed_input_size, mlp_other_input_size, mlp_embed_vocab,
        mlp_embed_size, mlp_hidden_dim1, mlp_hidden_dim2, mlp_out_size):
        super().__init__()

        self.rnn1 = nn.RNN(
            input_size=1,
            hidden_size=rnn_hidden_size,
            nonlinearity='tanh',
            )
        self.rnn2 = nn.RNN(
            input_size=1,
            hidden_size=rnn_hidden_size,
            nonlinearity='tanh',
            )
        self.mlp = MLP(
            mlp_embed_input_size,
            mlp_other_input_size + rnn_hidden_size*2,
            mlp_embed_vocab,
            mlp_embed_size,
            mlp_hidden_dim1,
            mlp_hidden_dim2,
            mlp_out_size)

    def forward(self, x_rnn1, x_rnn2, x_mlp_embed, x_mlp):
        """Return the MLP output for a batch.

        Args:
            x_rnn1, x_rnn2: batched sequences for the two RNNs, either a
                ``PackedSequence`` or a (seq_len, batch, 1) tensor. Integer
                inputs are cast to float.
            x_mlp_embed: integer ids of shape (batch, mlp_embed_input_size).
            x_mlp: extra features of shape (batch, mlp_other_input_size).
        """
        # nn.RNN returns (output, h_n); only the final hidden state is used as a
        # fixed-size summary. RNN weights are float, so integer ids must be cast.
        _, h_n1 = self.rnn1(x_rnn1.float())
        _, h_n2 = self.rnn2(x_rnn2.float())

        # h_n is (num_layers, batch, hidden); take the last layer -> (batch, hidden)
        rnn_out1 = h_n1[-1]
        rnn_out2 = h_n2[-1]

        mlp_features = torch.cat((rnn_out1, rnn_out2, x_mlp.float()), dim=1)
        mlp_out = self.mlp(x_mlp_embed, mlp_features)

        return mlp_out

In [86]:
class RNNClassifier(nn.Module):
    """One step of a simple recurrent cell with a log-softmax read-out.

    The caller unrolls the cell over a sequence by feeding the returned hidden
    state back in on the next step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        # [input ; previous hidden] -> new hidden
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        # new hidden -> class scores
        self.h2o = nn.Linear(hidden_size, output_size)
        self.activation = nn.Tanh()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Return ``(log_probabilities, new_hidden)`` for a single time step."""
        joined = torch.cat([input, hidden], dim=1)
        new_hidden = self.activation(self.i2h(joined))
        log_probs = self.softmax(self.h2o(new_hidden))
        return log_probs, new_hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, hidden_size)."""
        return torch.zeros((1, self.hidden_size))

In [87]:
class AttentionalGRUClassifier(nn.Module):
    """GRU encoder with linear attention over its hidden states and a log-softmax classifier."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.gru = nn.GRU(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.att = nn.Linear(hidden_size, 1)

    def forward(self, input_sequence):
        """Return ``(log_probabilities, attention_weights)`` for one input sequence."""
        # Hidden state at every time step of the sequence
        hidden_states = self.gru(input_sequence)[0]
        # One linear score per state, normalised over the time dimension so the weights sum to one
        weights = F.softmax(self.att(hidden_states), dim=0)
        # Context vector: attention-weighted sum of the hidden states
        context = (hidden_states * weights).sum(dim=0)
        # Flatten the context to a single row, project to class scores, take log-softmax
        log_probs = F.log_softmax(self.h2o(context.view(1, -1)), dim=1)
        return log_probs, weights

RNN Training code

In [88]:
# def categoryFromOutput(output):
#     top_n, top_i = output.topk(1)
#     category_i = top_i[0].item()
#     return all_categories[category_i], category_i

# def randomChoice(l):
#     return l[random.randint(0, len(l) - 1)]

# def randomTrainingExample(noise=0, noise_chars=".,;'"):
#     # noise: integer denoting the maximum number of distractor characters to add
#     # noise_chars: inventory of distractor characters
#     category = randomChoice(all_categories)
#     line = randomChoice(category_lines[category])
#     # added code to insert distracting nonsense into the string
#     if noise > 0:
#         line_prime = line
#         for i in range(random.randint(0, noise+1)):
#             line_prime += random.choice(noise_chars)
#         line = line_prime
#     # end change
#     category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
#     line_tensor = lineToTensor(line)
#     return category, line, category_tensor, line_tensor

In [89]:
# n_iters = 80000
# print_every = 5000
# plot_every = 1000
# noise_level = 0 # change this line (as discussed later)
# n_hidden = 32
# learning_rate = 0.005

# current_loss = 0
# all_losses = []

# rnn = RNNClassifier(n_letters, n_hidden, n_categories)
# criterion = nn.NLLLoss()

# def timeSince(since):
#     now = time.time()
#     s = now - since
#     m = math.floor(s / 60)
#     s -= m * 60
#     return '%dm %ds' % (m, s)
# start = time.time()

# # training algorithm, which takes one instance and performs single SGD update
# def train(category_tensor, line_tensor):
#     hidden = rnn.initHidden()
#     rnn.zero_grad()
#     # key step: unroll the RNN over each symbol in the input sequence
#     for i in range(line_tensor.size()[0]):
#         output, hidden = rnn(line_tensor[i], hidden)
#     # treat the last output as the prediction of the category label
#     loss = criterion(output, category_tensor)
#     loss.backward()

#     # Add parameters' gradients to their values, multiplied by learning rate
#     for p in rnn.parameters():
#         p.data.add_(p.grad.data, alpha=-learning_rate)
#     return output, loss.item()

# for iter in range(1, n_iters + 1):
#     category, line, category_tensor, line_tensor = randomTrainingExample(noise=noise_level)
#     output, loss = train(category_tensor, line_tensor)
#     current_loss += loss

#     # Print iter number, loss, name and guess
#     if iter % print_every == 0:
#         guess, guess_i = categoryFromOutput(output)
#         correct = '✓' if guess == category else '✗ (%s)' % category
#         print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, line, guess, correct))

#     # Add current loss avg to list of losses
#     if iter % plot_every == 0:
#         all_losses.append(current_loss / plot_every)
#         current_loss = 0

GRU Training code

In [90]:


# model = AttentionalGRUClassifier(n_letters, n_hidden, n_categories)
# optimizer = optim.SGD(model.parameters(), lr=learning_rate)
# criterion = nn.NLLLoss()

# start = time.time()
# all_losses_att = []
# current_loss = 0

# for iter in range(1, n_iters + 1):
#     category, line, category_tensor, line_tensor = randomTrainingExample(noise=noise_level)

#     model.zero_grad()
#     output, _ = model.forward(line_tensor)
#     output = torch.squeeze(output, 1) # remove redundant dimension
#     loss = criterion(output, category_tensor)
#     current_loss += loss.item()
#     loss.backward()
#     optimizer.step()

#     # Print iter number, loss, name and guess
#     if iter % print_every == 0:
#         guess, guess_i = categoryFromOutput(output)
#         correct = '✓' if guess == category else '✗ (%s)' % category
#         print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, line, guess, correct))

#     # Add current loss avg to list of losses
#     if iter % plot_every == 0:
#         all_losses_att.append(current_loss / plot_every)
#         current_loss = 0

# 3. Training and Testing

In [91]:
import time
from tkinter import Y

def test(model, criterion, test_loader):
    test_loss = 0.
    test_preds, test_labels = list(), list()
    for i, data in enumerate(test_loader):
        x_year, x_venue, x_abstract, x_title, y = data

        with torch.no_grad():
            logits = model(x_abstract, x_title, x_venue, x_year)
            predictions = torch.round(logits)
            test_loss += criterion(input=logits, target=y.float()).item()
            test_preds.append(predictions)
            test_labels.append(y)

    test_preds = torch.cat(test_preds)
    test_labels = torch.cat(test_labels)

    test_accuracy = torch.mean((torch.sum(torch.eq(predictions, y).float(), 1) == 100).float()).item()

    print('[TEST] Mean loss {:.4f} | Accuracy {:.4f}'.format(test_loss/len(test_loader), test_accuracy))

def train(model, train_loader, val_loader, optimizer, n_epochs=10, log_interval=250):
    """
    Training loop for supervised multi-label learning with binary cross-entropy.

    Args:
        model: module called as ``model(x_abstract, x_title, x_venue, x_year)``
            that returns per-label probabilities in [0, 1].
        train_loader: iterable (with ``len``) of
            ``(x_year, x_venue, x_abstract, x_title, y)`` batches.
        val_loader: loader evaluated with ``test`` after every epoch.
        optimizer: optimizer over ``model``'s parameters.
        n_epochs: number of passes over ``train_loader``.
        log_interval: print training statistics every this many batches.

    Returns:
        ``(running_loss, running_accuracy)``: per-batch loss and per-batch
        exact-match accuracy, one entry per training step.
    """
    running_loss, running_accuracy = list(), list()
    start_time = time.time()
    criterion = torch.nn.BCELoss()
    n_batches = max(len(train_loader), 1)  # avoid dividing by zero on an empty loader

    for epoch in range(n_epochs):  # Loop over training dataset `n_epochs` times

        model.train()  # (re-)enable dropout; evaluation may have switched it off
        epoch_loss = 0.

        for i, data in enumerate(train_loader):  # Loop over elements in training set

            x_year, x_venue, x_abstract, x_title, y = data

            optimizer.zero_grad()         # Start every step from clean gradients
            logits = model(x_abstract, x_title, x_venue, x_year)
            loss = criterion(input=logits, target=y.float())

            loss.backward()               # Backward pass (compute parameter gradients)
            optimizer.step()              # Update weight parameters

            # Exact-match accuracy: a sample is correct only if all of its labels
            # are predicted correctly (independent of the number of labels).
            with torch.no_grad():
                predictions = torch.round(logits)
                train_acc = torch.eq(predictions, y).all(dim=1).float().mean().item()

            # ============================================================================
            # Bookkeeping below only reports metrics over the training and validation sets

            batch_loss = loss.item()
            running_loss.append(batch_loss)
            running_accuracy.append(train_acc)

            epoch_loss += batch_loss

            if i % log_interval == 0:  # Log training stats
                deltaT = time.time() - start_time
                mean_loss = epoch_loss / (i+1)
                print('[TRAIN] Epoch {} [{}/{}]| Mean loss {:.4f} | Current Mean train accuracy {:.5f} | Current data accuracy {:.5f} | Time {:.2f} s'.format(epoch,
                    i, len(train_loader), mean_loss, sum(running_accuracy)/len(running_accuracy), train_acc, deltaT))

        print('Epoch complete! Mean loss: {:.4f}'.format(epoch_loss/n_batches))

        test(model, criterion, val_loader)

    return running_loss, running_accuracy

In [66]:
class VenueYearMLP(MLP):
    """MLP baseline that only uses venue and year.

    train() and test() call the model with four inputs
    ``(x_abstract, x_title, x_venue, x_year)``, whereas ``MLP.forward`` takes two.
    This adapter accepts the four-input call and forwards the venue id (embedded)
    and the year (the single extra feature), ignoring the text inputs.
    """

    def forward(self, x_abstract, x_title, x_venue, x_year):
        return super().forward(x_venue, x_year.float())


mlp_model = VenueYearMLP(
    embed_input_size=1,               # one venue id per paper
    other_input_size=1,               # the year
    embed_vocab=NUM_VENUES,
    embed_size=NUM_VENUES//2,
    hidden_size1=512,
    hidden_size2=256,
    out_size=NUM_PROLIFIC_AUTHORS)    # one output per prolific author

optimizer = torch.optim.SGD(mlp_model.parameters(), lr=1e-2, momentum=0.9)
# optimizer = torch.optim.Adam(mlp_model.parameters(), lr=1e-2)
mlp_loss, mlp_acc = train(mlp_model, train_dataloader, val_dataloader, optimizer)

[TRAIN] Epoch 0 [0/142]| Mean loss 0.6996 | Current Mean train accuracy 0.00000 | Current data accuracy 0.00000 | Time 0.01 s
Epoch complete! Mean loss: 0.2637
[TEST] Mean loss 0.0391 | Accuracy 0.7241
[TRAIN] Epoch 1 [0/142]| Mean loss 0.0448 | Current Mean train accuracy 0.45681 | Current data accuracy 0.61719 | Time 1.98 s
Epoch complete! Mean loss: 0.0378
[TEST] Mean loss 0.0354 | Accuracy 0.7586
[TRAIN] Epoch 2 [0/142]| Mean loss 0.0418 | Current Mean train accuracy 0.58367 | Current data accuracy 0.64844 | Time 3.91 s
Epoch complete! Mean loss: 0.0355
[TEST] Mean loss 0.0340 | Accuracy 0.7069
[TRAIN] Epoch 3 [0/142]| Mean loss 0.0342 | Current Mean train accuracy 0.62626 | Current data accuracy 0.72656 | Time 5.79 s
Epoch complete! Mean loss: 0.0346
[TEST] Mean loss 0.0329 | Accuracy 0.6552
[TRAIN] Epoch 4 [0/142]| Mean loss 0.0332 | Current Mean train accuracy 0.64623 | Current data accuracy 0.70312 | Time 7.73 s
Epoch complete! Mean loss: 0.0331
[TEST] Mean loss 0.0322 | Accura

In [None]:
from scipy.signal import savgol_filter  # Smooth spiky curves
import matplotlib.pyplot as plt
# Smooth the per-step training curves (Savitzky-Golay filter, window 21, cubic polynomial)
running_loss_smoothed, running_acc_smoothed = (
    savgol_filter(curve, window_length=21, polyorder=3) for curve in (mlp_loss, mlp_acc)
)

# Training loss per optimisation step
loss_ax = plt.gca()
loss_ax.plot(running_loss_smoothed)
loss_ax.set_xlabel('Iterations')
loss_ax.set_ylabel('Cross-entropy Loss (Train)')

In [None]:
# Training exact-match accuracy per optimisation step (smoothed curve from the previous cell)
acc_ax = plt.gca()
acc_ax.plot(running_acc_smoothed)
acc_ax.set_xlabel('Iterations')
acc_ax.set_ylabel('Accuracy (Train)')
acc_ax.set_ylim(0.2, 1.)

In [107]:
# NOTE: this cell reuses the names mlp_model / mlp_loss / mlp_acc, so it overwrites
# the results of the plain-MLP run above.
mlp_model = RNN_MLP(
    rnn_embed_vocab=NUM_WORDS,
    rnn_embed_size=NUM_WORDS//2,
    rnn_hidden_size=256,

    mlp_embed_input_size=1,                # one venue id per paper
    mlp_other_input_size=1,                # the year
    mlp_embed_vocab=NUM_VENUES,
    mlp_embed_size=NUM_VENUES//2,
    mlp_hidden_dim1=512,
    mlp_hidden_dim2=256,
    mlp_out_size=NUM_PROLIFIC_AUTHORS)     # must match the label vector width

optimizer = torch.optim.SGD(mlp_model.parameters(), lr=1e-2, momentum=0.9)
# optimizer = torch.optim.Adam(mlp_model.parameters(), lr=1e-2)
mlp_loss, mlp_acc = train(mlp_model, train_dataloader, val_dataloader, optimizer)

RuntimeError: input must have 2 dimensions, got 1