In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os
import sys
from util import evaluate, load_data

In [2]:
class Dataset(Dataset):
    def __init__(self, train_data, dev_data, test_data, label_to_index):
        """
        Args:
            df (pandas.DataFrame): the dataset
        """
        self.train_data = train_data
        self.train_size = len(self.train_data)

        self.dev_data = dev_data
        self.validation_size = len(self.dev_data)

        self.test_data = test_data
        self.test_data = len(self.test_data)
        
        self.label_to_index = label_to_index
        self.index_to_label = {v: k for k, v in label_to_index.items()}

    def __len__(self):
        return self._target_size
    
    def set_split(self, split):
        if split == 'train':
            self._target = self.train_data
            self._target_size = len(self.train_data)
        elif split == 'dev':
            self._target = self.dev_data
            self._target_size = len(self.dev_data)
        elif split == 'test':
            self._target = self.test_data
            self._target_size = len(self.test_data)

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        
        Args:
            index (int): the index to the data point 
        Returns:
            a dictionary holding the data point's:
                features (x_data)
                label (y_target)
                feature length (x_length)
        """
        row = self._target[index]
        
        x_data = row[0]
        y_target = [0] * len(self.label_to_index.keys())
        y_target[self.label_to_index[row[1]]] = 1
        
        return {'x_data': torch.Tensor(x_data), 
                'y_target': torch.Tensor(y_target).to(device='cpu', dtype=torch.int64)}

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset
        
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size

    
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [3]:
class MultiLayerPerceptron(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, batch_size=1):
        super(MultiLayerPerceptron, self).__init__()
                
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.dropout = nn.Dropout(0.4)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 200)
        self.fc4 = nn.Linear(200, output_dim)
    
    def forward(self, x_in):
        out = F.relu(self.fc1(x_in))
        out = self.dropout(out)
        out = F.relu(self.fc2(out))
        out = self.dropout(out)
        out = F.relu(self.fc3(out))
        out = self.dropout(out)
        out = F.relu(self.fc4(out))
        scores = F.softmax(out, dim=1)
        
        return out

In [4]:
def create_dummy_bias(data):
    for sample in data:
        sample[0].append(1)
    return data 

train_data, dev_data, test_data, data_type, label_to_index = load_data(['propername'])

train_data = create_dummy_bias(train_data)
dev_data = create_dummy_bias(dev_data)
test_data = create_dummy_bias(test_data)

In [5]:
dataset = Dataset(train_data, dev_data, test_data, label_to_index)

input_dim = len(train_data[0][0])
hidden_dim = 1000
output_dim = len(label_to_index)
print(input_dim)

classifier = MultiLayerPerceptron(input_dim, hidden_dim, output_dim, label_to_index)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=5e-3, weight_decay=1e-5)

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return (n_correct / len(y_pred_indices)) * 100

20328


In [6]:
#TODO:
#HYPERPARAMETER SELECTION ON NETWORK WITH FEW LAYERS
#THEN RUN TESTS ON DEEPER NETWORK

num_epochs = 100

for epoch in range(num_epochs):
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, batch_size=1000)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()
        
    for batch_index, batch_dict in enumerate(batch_generator):
        # the training routine is these 5 steps:

        # --------------------------------------
        # step 1. zero the gradients
        optimizer.zero_grad()

        # step 2. compute the output
        y_pred = classifier(batch_dict['x_data'])

        # step 3. compute the loss
        loss = loss_func(y_pred, torch.max(batch_dict['y_target'],1)[1])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # step 4. use loss to produce gradients
        loss.backward()

        # step 5. use optimizer to take gradient step
        optimizer.step()
        # -----------------------------------------
        # compute the accuracy
        acc_t = compute_accuracy(y_pred, torch.max(batch_dict['y_target'],1)[1])
        running_acc += (acc_t - running_acc) / (batch_index + 1)
    print("Loss :", running_loss)
    print("Training acc: ", running_acc)
    
    # setup: batch generator, set loss and acc to 0; set eval mode on
    dataset.set_split('dev')
    batch_generator = generate_batches(dataset, batch_size=500)
    
    running_loss = 0.
    running_acc = 0.
    classifier.eval()
        
    for batch_index, batch_dict in enumerate(batch_generator):

        # compute the output
        y_pred =  classifier(batch_dict['x_data'])

        # step 3. compute the loss
        loss = loss_func(y_pred, torch.max(batch_dict['y_target'],1)[1])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # compute the accuracy
        acc_t = compute_accuracy(y_pred, torch.max(batch_dict['y_target'],1)[1])
        running_acc += (acc_t - running_acc) / (batch_index + 1)
        
    print("Development acc: ", running_acc)

Loss : 1.2687498823456145
Training acc:  48.300000000000004
Development acc:  70.67999999999999
Loss : 0.6108014790908148
Training acc:  75.72608695652173
Development acc:  73.72
Loss : 0.38320402865824493
Training acc:  85.4304347826087
Development acc:  85.52000000000001
Loss : 0.13074511160021243
Training acc:  96.58260869565217
Development acc:  85.08
Loss : 0.0780261299856331
Training acc:  97.86521739130436
Development acc:  85.12
Loss : 0.045905844267943634
Training acc:  98.73478260869565
Development acc:  84.47999999999999
Loss : 0.031560341503633106
Training acc:  99.0086956521739
Development acc:  84.64
Loss : 0.03058140319974526
Training acc:  99.13478260869566
Development acc:  85.03999999999999
Loss : 0.028867560803242348
Training acc:  99.1695652173913
Development acc:  84.8
Loss : 0.025584493399314255
Training acc:  99.23043478260868
Development acc:  85.0
Loss : 0.02591663611161968
Training acc:  99.34347826086956
Development acc:  85.4
Loss : 0.031576762947699295
Trai

KeyboardInterrupt: 