In [87]:
import pandas as pd
import torch
import numpy as np
from argparse import Namespace
import os
import json

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from torch.utils.data import Dataset
from sklearn.preprocessing import OneHotEncoder

# Multiclass Classification

## Data Loading

In [88]:
def load_data(name):
    train = pd.read_csv(f'./data/{name}-train.csv')
    test = pd.read_csv(f'./data/{name}-test.csv')
    val = pd.read_csv(f'./data/{name}-val.csv')
    return train, test, val

class CustomDataset(Dataset):
    def __init__(self, x_tensor, y_tensor):
        self.x = x_tensor
        self.y = y_tensor

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.x)

In [89]:
train, test, val = load_data('task_2')
#x_train_tensor = torch.as_tensor(x_data).float()
#y_train_tensor = torch.as_tensor(y_truth).float()

In [90]:
train_x = train.job_description
train_y = train.category.astype('category')

In [91]:
train_y.unique()

['Banking & Financial Services', 'Manufacturing, Transport & Logistics', 'Community Services & Development', 'Healthcare & Medical', 'Insurance & Superannuation', ..., 'Call Centre & Customer Service', 'Marketing & Communications', 'Advertising, Arts & Media', 'CEO & General Management', 'Real Estate & Property']
Length: 30
Categories (30, object): ['Accounting', 'Administration & Office Support', 'Advertising, Arts & Media', 'Banking & Financial Services', ..., 'Science & Technology', 'Self Employment', 'Sport & Recreation', 'Trades & Services']

# Models

## Vanilla RNN

In [100]:
args = Namespace(
    # Data and path information
    surname_csv="../data/surnames/surnames_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="model_storage/ch6/surname_classification",
    # Model hyper parameter
    char_embedding_size=100,
    rnn_hidden_size=64,
    # Training hyper parameter
    num_epochs=100,
    learning_rate=1e-3,
    batch_size=64,
    seed=1337,
    early_stopping_criteria=5,
    # Runtime hyper parameter
    cuda=True,
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
)

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
    
print("Using CUDA: {}".format(args.cuda))


if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)


Using CUDA: False


NameError: name 'set_seed_everywhere' is not defined

In [98]:
CATEGORIES = ['Banking & Financial Services', 'Manufacturing, Transport & Logistics', 'Community Services & Development', 'Healthcare & Medical', 'Insurance & Superannuation']

class ElmanRNN(nn.Module):
    """ an Elman RNN built using the RNNCell """
    def __init__(self, input_size, hidden_size, batch_first=False):
        """
        Args:
            input_size (int): size of the input vectors
            hidden_size (int): size of the hidden state vectors
            bathc_first (bool): whether the 0th dimension is batch
        """
        super(ElmanRNN, self).__init__()
        
        self.rnn_cell = nn.RNNCell(input_size, hidden_size)
        
        self.batch_first = batch_first
        self.hidden_size = hidden_size

    def _initial_hidden(self, batch_size):
        return torch.zeros((batch_size, self.hidden_size))

    def forward(self, x_in, initial_hidden=None):
        """The forward pass of the ElmanRNN
        
        Args:
            x_in (torch.Tensor): an input data tensor. 
                If self.batch_first: x_in.shape = (batch, seq_size, feat_size)
                Else: x_in.shape = (seq_size, batch, feat_size)
            initial_hidden (torch.Tensor): the initial hidden state for the RNN
        Returns:
            hiddens (torch.Tensor): The outputs of the RNN at each time step. 
                If self.batch_first: hiddens.shape = (batch, seq_size, hidden_size)
                Else: hiddens.shape = (seq_size, batch, hidden_size)
        """
        if self.batch_first:
            batch_size, seq_size, feat_size = x_in.size()
            x_in = x_in.permute(1, 0, 2)
        else:
            seq_size, batch_size, feat_size = x_in.size()
    
        hiddens = []

        if initial_hidden is None:
            initial_hidden = self._initial_hidden(batch_size)
            initial_hidden = initial_hidden.to(x_in.device)

        hidden_t = initial_hidden
                    
        for t in range(seq_size):
            hidden_t = self.rnn_cell(x_in[t], hidden_t)
            hiddens.append(hidden_t)
            
        hiddens = torch.stack(hiddens)

        if self.batch_first:
            hiddens = hiddens.permute(1, 0, 2)

        return hiddens



class JobCategoryClassifier(nn.Module):
    """ A Classifier with an RNN to extract features and an MLP to classify """
    def __init__(self, embedding_size, num_embeddings, num_classes,
                 rnn_hidden_size, batch_first=True, padding_idx=0):
        """
        Args:
            embedding_size (int): The size of the character embeddings
            num_embeddings (int): The number of characters to embed
            num_classes (int): The size of the prediction vector 
                Note: the number of nationalities
            rnn_hidden_size (int): The size of the RNN's hidden state
            batch_first (bool): Informs whether the input tensors will 
                have batch or the sequence on the 0th dimension
            padding_idx (int): The index for the tensor padding; 
                see torch.nn.Embedding
        """
        super(JobCategoryClassifier, self).__init__()

        self.emb = nn.Embedding(num_embeddings=num_embeddings,
                                embedding_dim=embedding_size,
                                padding_idx=padding_idx)
        self.rnn = ElmanRNN(input_size=embedding_size,
                             hidden_size=rnn_hidden_size,
                             batch_first=batch_first)
        self.fc1 = nn.Linear(in_features=rnn_hidden_size,
                         out_features=rnn_hidden_size)
        self.fc2 = nn.Linear(in_features=rnn_hidden_size,
                          out_features=num_classes)


    def forward(self, x_in, x_lengths=None, apply_softmax=False):
        """The forward pass of the classifier
        
        Args:
            x_in (torch.Tensor): an input data tensor. 
                x_in.shape should be (batch, input_dim)
            x_lengths (torch.Tensor): the lengths of each sequence in the batch.
                They are used to find the final vector of each sequence
            apply_softmax (bool): a flag for the softmax activation
                should be false if used with the Cross Entropy losses
        Returns:
            the resulting tensor. tensor.shape should be (batch, output_dim)
        """
        x_embedded = self.emb(x_in)
        y_out = self.rnn(x_embedded)

        if x_lengths is not None:
            y_out = column_gather(y_out, x_lengths)
        else:
            y_out = y_out[:, -1, :]

        y_out = F.relu(self.fc1(F.dropout(y_out, 0.5)))
        y_out = self.fc2(F.dropout(y_out, 0.5))

        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)

        return y_out
        
class RNNWrapper():
    def __init__(self, vectorizer, dataset):
        self.vectorizer = vectorizer
        self.model = JobCategoryClassifier(
                                embedding_size=args.char_embedding_size, 
                                num_embeddings=len(vectorizer.char_vocab),
                                num_classes=len(CATEGORIES),
                                rnn_hidden_size=args.rnn_hidden_size,
                                padding_idx=vectorizer.char_vocab.mask_index)
        
    def train(self):
        # USED FOR IMBALANCED DATA:::::
        # loss_func = nn.CrossEntropyLoss(dataset.class_weights)
        # :::::::
        loss_func = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.model.parameters(), lr=args.learning_rate)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                   mode='min', factor=0.5,
                                                   patience=1)
        
        for epoch_index in range(args.num_epochs):
            # setup: batch generator, set loss and acc to 0, set train mode on
            batch_generator = generate_batches(x, 
                                               batch_size=args.batch_size, 
                                               device=args.device)
            
            self.model.train()
            for batch_index, batch_dict in enumerate(batch_generator):
                # the training routine is these 5 steps:

                # --------------------------------------    
                # step 1. zero the gradients
                optimizer.zero_grad()

                # step 2. compute the output
                y_pred = self.model(x_in=batch_dict['x_data'], 
                                    x_lengths=batch_dict['x_length'])

                # step 3. compute the loss
                loss = loss_func(y_pred, batch_dict['y_target'])

                #running_loss += (loss.item() - running_loss) / (batch_index + 1)

                # step 4. use loss to produce gradients
                loss.backward()

                # step 5. use optimizer to take gradient step
                optimizer.step()
                # -----------------------------------------
                # compute the accuracy
#                 acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
#                 running_acc += (acc_t - running_acc) / (batch_index + 1)

#                 # update bar
#                 train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
#                 train_bar.update()
            
    
    def test(self):
        pass
    
    def classify(self, pmf):
        """
            pmf is a vector of probabilities over the different categories
            picks the most likely category
        """
        return CATEGORIES[np.argmax(pmf)]
    
    def interact(self):
        sentence = input("Input: ")
        y_hat = self.model(sentence)
        result = self.classify(y_hat)
        print(f"Model prediction: {result}")

In [99]:
#multiclass_dataset = OurDataset(path)
multiclass_dataset = None

#one_hot_vec = OneHotVectorizor(dataset=multiclass_dataset)
#our_embeddings_vec = OurW2VVectorizor(dataset=multiclass_dataset)
#pretrained_embeddings_vec = PretrainedW2VVectorizor(dataset=multiclass_dataset)
one_hot_vec = None
our_embeddings_vec = None
pretrained_embeddings_vec = None

rnn_model_one_hot = RNNWrapper(one_hot_vec, multiclass_dataset)
rnn_model_our_embeddings = RNNWrapper(our_embeddings_vec, multiclass_dataset)
rnn_model_pretrained = RNNWrapper(pretrained_embeddings_vec, multiclass_dataset)

rnn_model_one_hot.train()
rnn_model_our_embeddings.train()
rnn_model_pretrained.train()

rnn_model_one_hot.test()
rnn_model_our_embeddings.test()
rnn_model_pretrained.test()

AttributeError: 'NoneType' object has no attribute 'parameters'

### Interactive

In [77]:
interact(rnn_model_one_hot)

NameError: name 'interact' is not defined

In [78]:
interact(rnn_model_our_embeddings)

NameError: name 'interact' is not defined

In [79]:
interact(rnn_model_pretrained)

NameError: name 'interact' is not defined

## LSTM