In [16]:
import numpy as np

import pandas as pd

from sklearn import svm
from sklearn.preprocessing import StandardScaler

import os

import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

### 1. Data set

In [60]:
def authors_to_one_hot(a):
    one_hot = np.zeros(100).astype(int)
    a = np.array(a)
    a = a[a < 100]
    one_hot[a] = 1
    return one_hot

# read data from json file
train_df = pd.read_json('./data/train.json')
train_df.venue = train_df.venue.map(lambda x: -1 if x == '' else x)

# Set up training data
X_train = train_df.drop(columns=['authors'])
y_train = train_df.authors
y_train = y_train.map(authors_to_one_hot)

# Set up test data
X_test = pd.read_json('./data/test.json')



In [61]:
# from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV

# full_df = pd.read_csv('./data/cats.csv')
# full_df.SEX = full_df.SEX.map({'M': -1, 'F': 1})
# train_df, test_df = train_test_split(full_df, test_size=0.2, random_state=1)
# scaler = StandardScaler()
# X_train = scaler.fit_transform(train_df.drop('SEX', axis=1)) # fill in
# y_train = train_df.SEX # fill in

# X_test = scaler.transform(test_df.drop('SEX', axis=1)) # fill in
# y_test = test_df.SEX # fill in

### 2. Model

NN

In [19]:
NUM_PROLIFIC_AUTHORS = 100

class CustomDataset(Dataset):
    def __init__(self, file_name, data_dir, transform=None, target_transform=None):
        self.data_dir = data_dir
        
        self.data = pd.read_json(os.path.join(data_dir, file_name))
        self.data = self.data.applymap(lambda x: -1 if x == '' else x) # Change null data to -1
        
        self.x = self.data.drop(columns=['authors']).values
        self.y = self.data.authors.values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x_fixed = self.x[idx,[0, 2]].astype(np.int32)
        x_abstract = np.array(self.x[idx,1])
        x_title = np.array(self.x[idx,3])
        y = np.array(self.y[idx])
        y = self.authors_to_one_hot(y)

        return x_fixed, x_abstract, x_title, y

    def authors_to_one_hot(self, a):
        one_hot = np.zeros(NUM_PROLIFIC_AUTHORS).astype(int)
        a = np.array(a)
        a = a[a < 100]
        one_hot[a] = 1
        return one_hot

train_dataset = CustomDataset('train.json', './data/')
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

# for _ in range(5):
#     _, _, _, y = next(iter(train_dataloader))
#     print(y)

In [20]:
import torch.nn.functional as F

HIDDEN_DIM1 = 256
HIDDEN_DIM2 = 100

class MultilayerPerceptronModel(nn.Module):
    def __init__(self, n_features, n_classes, hidden_dim1 = HIDDEN_DIM1, hidden_dim2 = HIDDEN_DIM2):
        super().__init__()
        
        self.input_layer = nn.Linear(n_features, hidden_dim1)
        self.hidden_layer = nn.Linear(hidden_dim1, hidden_dim2)
        self.output_layer = nn.Linear(hidden_dim2, n_classes)
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        batch_size = x.shape[0]
        
        x = x.view(batch_size, -1)  # Flatten image into vector, retaining batch dimension
        
        x = F.relu(self.input_layer(x))
        x = self.dropout(x)
        
        x = F.relu(self.hidden_layer(x))
        x = self.dropout(x)
        
        out = self.output_layer(x)
        
        return out

In [23]:
import time
from tkinter import Y

def train(model, train_loader, test_loader, optimizer, n_epochs=10):
    """
    Generic training loop for supervised multiclass learning
    """
    LOG_INTERVAL = 250
    running_loss, running_accuracy = list(), list()
    start_time = time.time()
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(n_epochs):  # Loop over training dataset `n_epochs` times

        epoch_loss = 0.

        for i, data in enumerate(train_loader):  # Loop over elements in training set

            x_fixed, x_abstract, x_title, y = data

            logits = model(x_fixed.float())

            predictions = torch.round(logits)
            train_acc = torch.mean(torch.sum(torch.eq(predictions, y)).float()).item()

            loss = criterion(input=logits, target=y)

            loss.backward()               # Backward pass (compute parameter gradients)
            optimizer.step()              # Update weight parameter using SGD
            optimizer.zero_grad()         # Reset gradients to zero for next iteration


            # ============================================================================
            # You can safely ignore the boilerplate code below - just reports metrics over
            # training and test sets

            running_loss.append(loss.item())
            running_accuracy.append(train_acc)

            epoch_loss += loss.item()

            if i % LOG_INTERVAL == 0:  # Log training stats
                deltaT = time.time() - start_time
                mean_loss = epoch_loss / (i+1)
                print('[TRAIN] Epoch {} [{}/{}]| Mean loss {:.4f} | Train accuracy {:.5f} | Time {:.2f} s'.format(epoch, 
                    i, len(train_loader), mean_loss, train_acc, deltaT))

        print('Epoch complete! Mean loss: {:.4f}'.format(epoch_loss/len(train_loader)))

        # test(model, criterion, test_loader)
        
    return running_loss, running_accuracy

In [24]:
NUM_FEATURES_FIXED = 2

mlp_model = MultilayerPerceptronModel(NUM_FEATURES_FIXED, NUM_PROLIFIC_AUTHORS)
optimizer = torch.optim.SGD(mlp_model.parameters(), lr=1e-2, momentum=0.9)
mlp_loss, mlp_acc = train(mlp_model, train_dataloader, None, optimizer)

TypeError: round() got an unexpected keyword argument 'dim'

In [18]:
def authors_to_one_hot(a):
    one_hot = np.zeros(100).astype(int)
    a = np.array(a)
    a = a[a < 100]
    one_hot[a] = 1
    return one_hot

class CustomDataset(Dataset):
    def __init__(self, file_name, data_dir, transform=None, target_transform=None):
        self.data = pd.read_json(os.path.join(data_dir, file_name))
        self.data_dir = data_dir
        # self.transform = transform
        # self.target_transform = target_transform
        print(len)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
        image = read_image(img_path)
        label = self.img_labels.iloc[idx, 1]
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label

a = CustomDataset('train.json', './data/')
    

# src: https://androidkt.com/load-pandas-dataframe-using-dataset-and-dataloader-in-pytorch/
class MyDataset():
    def __init__(self, file_name, data_dir):
        train_df=pd.read_json(file_name)
        train_df = train_df.map(lambda x: -1 if x == '' else x)
        
        X_train = train_df.drop(columns=['authors'])
        y_train = train_df.authors
        y_train = y_train.map(authors_to_one_hot)
        
        x=X_train.iloc[:,0].values
        print(y_train)
        y=y_train.values
        y = [np.array(arr).astype(int) for arr in y]
        
        # x=train_df.iloc[:,3].map(lambda x: -1 if x == '' else x).values # WRONG!!!
        # y=train_df.iloc[:,0].map(authors_to_one_hot).values
    
        print(type(train_df.iloc[:,3]).map(lambda x: -1 if x == '' else x))
        print(y)

        y = [np.array(arr).astype(int) for arr in y]
        
        self.x_train=torch.tensor(x, dtype=torch.float32)
        self.y_train=torch.tensor(y, dtype=torch.float32)
 
    def __len__(self):
        return len(self.y_train)
   
    def __getitem__(self,idx):
        return self.x_train[idx],self.y_train[idx]

# # Define Conv. network (Src: worksheet 8)
# class BasicConvNet(nn.Module):
#     def __init__(self, out_c1, out_c2, dense_units, n_classes=10):
#         super(BasicConvNet, self).__init__()
#         self.conv1 = nn.Conv2d(in_channels=3, out_channels=out_c1, kernel_size=5)
#         self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
#         self.conv2 = nn.Conv2d(in_channels=out_c1, out_channels=out_c2, kernel_size=5)
#         self.fc1 = nn.Linear(16 * 5 * 5, dense_units)
#         self.logits = nn.Linear(dense_units, n_classes)

#     def forward(self, x):
#         x = self.pool(F.relu(self.conv1(x)))
#         x = self.pool(F.relu(self.conv2(x)))
#         x = x.view(-1, 16 * 5 * 5)
#         x = F.relu(self.fc1(x))
#         out = self.logits(x)
#         return out


# # Define parameters
# BATCH_SIZE = 128
# OUT_C1 = 8
# OUT_C2 = 16
# DENSE_UNITS = 256
# LR = 1e-2
    
# Load data
# train_ds = MyDataset('./data/train.json')
# test_ds = MyDataset('./data/test.json')  #TODO: 

# train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE,shuffle=True)
# test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE,shuffle=False)
    
# # Construct model
# conv2D_model = BasicConvNet(OUT_C1, OUT_C2, DENSE_UNITS)

# # Start training 
# optimizer = torch.optim.SGD(conv2D_model.parameters(), lr=LR, momentum=0.9)  #TODO: try adam
# conv_loss, conv_acc = train(conv2D_model, train_loader, test_loader, optimizer)


25793


RNN

In [2]:
import torch.nn as nn
import torch.optim as optim

In [None]:
class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size) 
        self.activation = nn.Tanh() 
        self.softmax = nn.LogSoftmax(dim=1)
    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.activation(self.i2h(combined)) 
        output = self.h2o(hidden) 
        output = self.softmax(output)
        return output, hidden
    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample(noise=0, noise_chars=".,;'"):
    # noise: integer denoting the maximum number of distractor characters to add
    # noise_chars: inventory of distractor characters
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    # added code to insert distracting nonsense into the string
    if noise > 0:
        line_prime = line
        for i in range(random.randint(0, noise+1)):
            line_prime += random.choice(noise_chars)
        line = line_prime
    # end change
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = lineToTensor(line)
    return category, line, category_tensor, line_tensor

n_iters = 80000
print_every = 5000
plot_every = 1000
noise_level = 0 # change this line (as discussed later)
n_hidden = 32
learning_rate = 0.005

current_loss = 0
all_losses = []

rnn = RNNClassifier(n_letters, n_hidden, n_categories)
criterion = nn.NLLLoss()

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)
start = time.time()

# training algorithm, which takes one instance and performs single SGD update
def train(category_tensor, line_tensor):
    hidden = rnn.initHidden()
    rnn.zero_grad()
    # key step: unroll the RNN over each symbol in the input sequence
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    # treat the last output as the prediction of the category label
    loss = criterion(output, category_tensor)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)
    return output, loss.item()

for iter in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample(noise=noise_level)
    output, loss = train(category_tensor, line_tensor)
    current_loss += loss

    # Print iter number, loss, name and guess
    if iter % print_every == 0:
        guess, guess_i = categoryFromOutput(output)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, line, guess, correct))

    # Add current loss avg to list of losses
    if iter % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

In [None]:
class AttentionalGRUClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(AttentionalGRUClassifier, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size) 
        self.att = nn.Linear(hidden_size, 1) 
        
    def forward(self, input_sequence):
        # process the input sequence into a sequence of RNN hidden states
        states, _ = self.gru(input_sequence)
        # compute attention scores to each RNN hidden state (we use a linear function)
        att_scores = self.att(states)
        # rescale the attention scores using a softmax, so they sum to one
        alpha = F.softmax(att_scores, dim=0)
        # compute the "c" vector as a weighted combination of the RNN hidden states
        c = torch.sum(torch.mul(states, alpha), dim=0)
        # now couple up the c state to the output, and compute log-softmax
        output = self.h2o(c.view(1, -1)) 
        output = F.log_softmax(output, dim=1)
        return output, alpha

model = AttentionalGRUClassifier(n_letters, n_hidden, n_categories)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

start = time.time()
all_losses_att = []
current_loss = 0

for iter in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample(noise=noise_level)

    model.zero_grad()
    output, _ = model.forward(line_tensor)
    output = torch.squeeze(output, 1) # remove redundant dimension
    loss = criterion(output, category_tensor)
    current_loss += loss.item()
    loss.backward()
    optimizer.step()

    # Print iter number, loss, name and guess
    if iter % print_every == 0:
        guess, guess_i = categoryFromOutput(output)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, line, guess, correct))

    # Add current loss avg to list of losses
    if iter % plot_every == 0:
        all_losses_att.append(current_loss / plot_every)
        current_loss = 0