In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os
import sys
from util import evaluate, load_data
from sklearn.metrics import confusion_matrix

print(torch.cuda.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

True
cuda:0


In [2]:
class Dataset(Dataset):
    def __init__(self, train_data, dev_data, test_data, label_to_index):
        """
        Args:
            df (pandas.DataFrame): the dataset
        """
        self.train_data = train_data
        self.train_size = len(self.train_data)

        self.dev_data = dev_data
        self.validation_size = len(self.dev_data)

        self.test_data = test_data
        self.test_size = len(self.test_data)
        
        self.label_to_index = label_to_index
        self.index_to_label = {v: k for k, v in label_to_index.items()}

    def __len__(self):
        return self._target_size
    
    def set_split(self, split):
        if split == 'train':
            self._target = self.train_data
            self._target_size = len(self.train_data)
        elif split == 'dev':
            self._target = self.dev_data
            self._target_size = len(self.dev_data)
        elif split == 'test':
            self._target = self.test_data
            self._target_size = len(self.test_data)

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        
        Args:
            index (int): the index to the data point 
        Returns:
            a dictionary holding the data point's:
                features (x_data)
                label (y_target)
                feature length (x_length)
        """
        row = self._target[index]
        
        x_data = row[0]
        y_target = [0] * len(self.label_to_index.keys())
        if self._target != self.test_data:
            y_target[self.label_to_index[row[1]]] = 1
        
        return {'x_data': torch.Tensor(x_data), 
                'y_target': torch.Tensor(y_target)}

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset
        
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size

    
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device=device): 
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        
        
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [3]:
class MultiLayerPerceptron(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, batch_size=1):
        super(MultiLayerPerceptron, self).__init__()
                
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x_in):
        out = F.relu(self.fc1(x_in))
        
        out = self.dropout(out)
        out = F.relu(self.fc2(out))
        
        out = self.dropout(out)
        out = F.relu(self.fc3(out))
        
        out = F.log_softmax(out, dim=1)
        
        return out

In [4]:
def create_dummy_bias(data):
    for sample in data:
        sample[0].append(1)
    return data 

train_data, dev_data, test_data, data_type, label_to_index = load_data(['propername'])

train_data = create_dummy_bias(train_data)
dev_data = create_dummy_bias(dev_data)
test_data = create_dummy_bias(test_data)

data dict len:  23410


In [5]:
dataset = Dataset(train_data, dev_data, test_data, label_to_index)

input_dim = len(train_data[0][0])
hidden_dim = 200
output_dim = len(label_to_index)
print(input_dim)

classifier = MultiLayerPerceptron(input_dim, hidden_dim, output_dim, label_to_index)
classifier.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=8e-3, weight_decay=1e-3)

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return (n_correct / len(y_pred_indices)) * 100

23411


In [None]:
#TODO:
#HYPERPARAMETER SELECTION ON NETWORK WITH FEW LAYERS
#THEN RUN TESTS ON DEEPER NETWORK

num_epochs = 20

for epoch in range(num_epochs):
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, batch_size=500)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()
        
    for batch_index, batch_dict in enumerate(batch_generator):
        # the training routine is these 5 steps:

        # --------------------------------------
        # step 1. zero the gradients
        optimizer.zero_grad()
        
        # step 2. compute output
        inputs, labels = batch_dict['x_data'], batch_dict['y_target']
        y_pred = classifier(inputs)

        # step 3. compute the loss
        loss = loss_func(y_pred, torch.max(labels ,1)[1])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # step 4. use loss to produce gradients
        loss.backward()

        # step 5. use optimizer to take gradient step
        optimizer.step()
        # -----------------------------------------
        # compute the accuracy
        acc_t = compute_accuracy(y_pred, torch.max(labels, 1)[1])
        running_acc += (acc_t - running_acc) / (batch_index + 1)
    print("Training Loss :", running_loss)
    print("Training Acc: ", running_acc)
    
    # setup: batch generator, set loss and acc to 0; set eval mode on
    dataset.set_split('dev')
    batch_generator = generate_batches(dataset, batch_size=500)
    
    running_loss = 0.
    running_acc = 0.
    classifier.eval()
        
    for batch_index, batch_dict in enumerate(batch_generator):

        # step 2. compute output
        inputs, labels = batch_dict['x_data'], batch_dict['y_target']
        y_pred = classifier(inputs)

        # step 3. compute the loss
        loss = loss_func(y_pred, torch.max(labels,1)[1])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)
        
        conf_mat = confusion_matrix(torch.max(labels,1)[1], y_pred, 
            labels=np.sort(np.unique(torch.max(labels,1)[1])))
        print(conf_mat)
        print(np.sort(np.unique(torch.max(labels,1)[1])))

        # compute the accuracy
        acc_t = compute_accuracy(y_pred, torch.max(labels, 1)[1])
        running_acc += (acc_t - running_acc) / (batch_index + 1)
        
    print("Development Loss: ", running_loss)
    print("Development Acc: ", running_acc)
    if (running_acc > 87.0):
        break
        
#test predictions
classifier.eval()
dataset.set_split('test')

batch_generator = generate_batches(dataset, batch_size=dataset.test_size, shuffle=False)
for batch_index, batch_dict in enumerate(batch_generator):
    y_pred = classifier(batch_dict['x_data'])
    label_pred = [dataset.index_to_label[torch.max(y,0)[1].item()] for y in y_pred]

df = pd.DataFrame(list(zip(range(len(label_pred)), label_pred)), 
               columns =['id', 'type'])
df.to_csv("mlp_propername_test_predictions.csv", index=False)


Training Loss : 0.7573476718819661
Training Acc:  71.5695652173913
Development Loss:  0.4462601006031036
Development Acc:  83.88
Training Loss : 0.46788396589134046
Training Acc:  82.98695652173913
Development Loss:  0.4340768575668335
Development Acc:  84.4
Training Loss : 0.44488297726797016
Training Acc:  84.09130434782607
Development Loss:  0.45395978093147277
Development Acc:  84.08
Training Loss : 0.42980527165143384
Training Acc:  84.58695652173913
Development Loss:  0.4471429228782654
Development Acc:  84.04
Training Loss : 0.416516477647035
Training Acc:  84.90434782608696
Development Loss:  0.4067946016788483
Development Acc:  84.91999999999999
Training Loss : 0.4025510277437127
Training Acc:  85.6782608695652
Development Loss:  0.40787228345870974
Development Acc:  84.76
Training Loss : 0.39552190575910645
Training Acc:  85.8173913043478
Development Loss:  0.4079636216163635
Development Acc:  84.80000000000001
Training Loss : 0.39618508776892797
Training Acc:  86.02173913043