In [307]:
import pandas as pd
import numpy as np
import datetime
import importlib

import data_utils
import model_utils
import train_utils
import evaluation
importlib.reload(data_utils)
importlib.reload(model_utils)
importlib.reload(train_utils)
importlib.reload(evaluation)

import torch
import torch.nn as nn

### Read data and build batches

In [308]:
# --- Dataset location and split files -------------------------------------
data_path = "data"
train_file = "train_real.csv"
val_file = "val.csv"
# NOTE(review): test_file is not used by any cell visible in this notebook.
test_file = "holdout_test.csv"

# --- Batching / vocabulary -------------------------------------------------
BATCH_SIZE = 64
# Vocabulary size is the length of the array saved in vocab.npy.
VOCAB_SIZE = len(np.load("./data/vocab.npy"))

In [309]:
# Build the training and validation data from the files configured above,
# using BATCH_SIZE as the batch size.
# NOTE(review): the returned objects are iterated as (x, y) pairs further
# down (see error_analysis) -- confirm against data_utils.prep_train_val.
train_data, val_data = data_utils.prep_train_val(
    data_path, train_file, val_file, batch_size=BATCH_SIZE)

### Get models

In [310]:
# Instantiate the MLP with its default constructor arguments.
# NOTE(review): confirm the defaults match the "best model so far"
# configuration recorded in the parameter-search notes below.
m = model_utils.MultiLayerMLP()

### Training 

In [311]:
# Train `m` on train_data while reporting validation loss / F1, starting from
# a learning rate of 1e-3. The log below shows the LR dropping to 1e-4 and
# 1e-5 later on -- presumably a scheduler inside train_utils.train (TODO confirm).
result = train_utils.train(train_data, val_data, m, lr=1e-3)

Epoch: 0, LR: 0.001, Train Loss: 14.4529, Val Loss: 14.9266, Val f1 0.096
Epoch: 10, LR: 0.001, Train Loss: 3.7388, Val Loss: 3.7423, Val f1 0.503
Epoch: 20, LR: 0.001, Train Loss: 2.1122, Val Loss: 2.3308, Val f1 0.710
Epoch: 30, LR: 0.001, Train Loss: 1.4026, Val Loss: 2.0568, Val f1 0.776
Epoch: 40, LR: 0.001, Train Loss: 1.0980, Val Loss: 2.1321, Val f1 0.782
Epoch: 50, LR: 0.0001, Train Loss: 0.6411, Val Loss: 2.1379, Val f1 0.801
Epoch: 60, LR: 1e-05, Train Loss: 0.5960, Val Loss: 2.1204, Val f1 0.803


### Parameter search - MLP
Best model so far: 3 middle layers, 100 hidden units, dropout=0.2 at the second-to-last layer.

saved as ./data/model_checkpoints/MLP_Jan23.mdl

### Parameter Searching Bidirectional LSTM 

### evaluation

### Error analysis

In [None]:
# Index -> token lookup array; x2text indexes it with token ids.
# NOTE(review): `TEXT` is not defined in any cell visible in this notebook, so
# this cell fails on a fresh kernel. It is presumably the text field object
# created inside data_utils -- expose it from there or rebuild the lookup from
# ./data/vocab.npy (TODO confirm both hold the same ordering).
VOCAB_LOOKUP = np.array(TEXT.vocab.itos)

In [None]:
def x2text(seq, onehot=False, vocab=None):
    """Map a batch of token ids back to vocabulary tokens.

    Args:
        seq: torch tensor (or array-like) of token ids in which axis 1 indexes
            the samples of the batch. When ``onehot`` is True, ``seq`` carries
            an extra trailing axis (axis 2) that is arg-maxed to recover ids.
        onehot: whether ``seq`` has that trailing per-token axis.
        vocab: optional 1-D array mapping id -> token. Defaults to the
            notebook-level ``VOCAB_LOOKUP`` so existing calls keep working.

    Returns:
        A list with one NumPy array of tokens per sample (per column of the
        id matrix).
    """
    if vocab is None:
        vocab = VOCAB_LOOKUP
    vocab = np.asarray(vocab)

    # Convert to NumPy explicitly instead of relying on np.argmax / NumPy
    # indexing accepting torch tensors; both branches now yield an ndarray.
    if torch.is_tensor(seq):
        values = seq.detach().cpu().numpy()
    else:
        values = np.asarray(seq)
    idx = values.argmax(axis=2) if onehot else values

    return [vocab[idx[:, i]] for i in range(idx.shape[1])]
    

def y2text(manyhot_label, labels=None):
    """Translate many-hot label rows into arrays of label names.

    Args:
        manyhot_label: 2-D NumPy array or torch tensor; each row marks the
            active labels of one sample with non-zero entries.
        labels: optional 1-D array of label names aligned with the columns of
            ``manyhot_label``. Defaults to the contents of
            ``./data/labels.npy``.

    Returns:
        A list with one NumPy array of label names per row.

    Raises:
        AssertionError: if a row has no active label.
    """
    if labels is None:
        labels = np.load("./data/labels.npy")
    labels = np.asarray(labels)

    # Tensors (possibly on GPU) have no .astype; bring them to NumPy first.
    if torch.is_tensor(manyhot_label):
        manyhot_label = manyhot_label.detach().cpu().numpy()
    mask = np.asarray(manyhot_label).astype(bool)

    all_labels = []
    for i, row in enumerate(mask):
        assert row.any(), "sample {} has no active label".format(i)
        all_labels.append(labels[row])
    return all_labels


def error_analysis(data, m, onehot=False):
    """Print every misclassified sample of ``data`` next to its labels.

    For each (x, y) batch the model output is passed through a sigmoid and
    thresholded at 0.5. A sample with no label above the threshold is assigned
    its single highest-scoring label, so every sample has at least one
    prediction. A sample counts as correct only if the whole label vector
    matches.

    Args:
        data: iterable yielding (x, y) batches; y holds the many-hot targets.
        m: model called as ``m(x)``; its output is treated as per-label logits
            with one row per sample.
        onehot: forwarded to ``x2text`` to decode ``x``.
    """
    was_training = m.training
    m.eval()  # disable train-time behaviour such as dropout while scoring
    try:
        with torch.no_grad():
            for x, y in data:
                scores = torch.sigmoid(m(x)).cpu().numpy()
                # Compare on the host: y may be a tensor living on another device.
                if torch.is_tensor(y):
                    y_true = y.detach().cpu().numpy()
                else:
                    y_true = np.asarray(y)

                y_pred = (scores > 0.5).astype(np.int32)
                batch_size = y_pred.shape[0]

                # Fallback must look at the raw scores: arg-maxing the
                # thresholded (all-zero) row would always select label 0.
                empty_rows = np.flatnonzero(y_pred.sum(axis=1) == 0)
                y_pred[empty_rows, scores[empty_rows].argmax(axis=1)] = 1

                correct = np.all(y_pred == y_true, axis=1)
                print("total correct: {} out of {}".format(correct.sum(), batch_size))

                incorrect_idx = np.flatnonzero(~correct)

                x_text = x2text(x, onehot=onehot)
                y_label_true = y2text(y_true)
                y_label_pred = y2text(y_pred)

                for i in incorrect_idx:
                    print(x_text[i])
                    print("--------------")
                    print("true", y_label_true[i])
                    print("pred", y_label_pred[i])
                    print("correct?", correct[i])
                    print()
    finally:
        # Leave the model in whatever mode the caller had it in.
        m.train(was_training)

In [None]:
# Inspect validation errors of the second entry in `models`.
# NOTE(review): `models` is not defined in any visible cell (the training cell
# above stores its output in `result` and the network in `m`), so this fails
# on a fresh kernel -- define it explicitly or pass `m`.
error_analysis(val_data, models[1]["trained_model"], onehot=False)

### Generate submission file for Kaggle

In [None]:
def get_submission(m, test_path="data/original_data/hw1_test.csv"):
    """Predict core relations for the test CSV and return them as a DataFrame.

    The model output is passed through a sigmoid and thresholded at 0.5; a
    sample with no label above the threshold gets its single highest-scoring
    label. The chosen label names are joined with spaces.

    Args:
        m: trained model, called as ``m(batch.UTTERANCE)``; its output is
            treated as per-label logits with one row per sample.
        test_path: CSV with a header row whose columns are read as ID and
            UTTERANCE. Defaults to the original homework test file.

    Returns:
        The test CSV as a DataFrame with an added "CORE RELATIONS" column.
    """
    m.eval()
    labels = np.asarray(np.load("./data/labels.npy"))

    # NOTE(review): `TEXT` must be the field object used at training time (it
    # carries the vocabulary); it is not defined in any visible cell. The
    # freshly built, vocabulary-less Field the old code created was never
    # used and has been dropped.
    tst_datafields = [("ID", RawField()),
                      ("UTTERANCE", TEXT)]
    tst = TabularDataset(
        path=test_path,  # the file path
        format='csv',
        skip_header=True,  # the CSV has a header row; skip it so it isn't processed as data
        fields=tst_datafields)

    # Same device as before when CUDA exists; otherwise run on CPU instead of crashing.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # No shuffling/sorting so predictions stay aligned with the CSV row order.
    test_iter = Iterator(tst, batch_size=len(tst), device=device, shuffle=False,
                         sort=False, sort_within_batch=False, repeat=False)

    final_labels = []
    with torch.no_grad():
        for batch in test_iter:
            probs = torch.sigmoid(m(batch.UTTERANCE)).cpu().numpy()
            for sample_probs in probs:
                chosen = np.flatnonzero(sample_probs > 0.5)
                if chosen.size == 0:
                    # Fall back to the best-scoring label. Arg-maxing the
                    # thresholded (all-zero) vector always returned label 0.
                    chosen = np.array([sample_probs.argmax()])
                final_labels.append(" ".join(labels[chosen]))

    test_df = pd.read_csv(test_path)
    test_df["CORE RELATIONS"] = final_labels
    return test_df

In [None]:
# Build the submission frame from the first entry in `models`.
# NOTE(review): `models` is not defined in any visible cell, and the error
# analysis above uses models[1] while this uses models[0] -- confirm which
# model is meant to be submitted.
test_df = get_submission(models[0]["trained_model"])

In [None]:
# Name the submission after today's date in "%b%d" form (e.g. "Jan23").
today = datetime.datetime.today().strftime("%b%d")
submission_path = "./data/submissions/{}.csv".format(today)

# Keep only the prediction column, indexed by ID, and write it out.
submission = test_df.set_index("ID")[["CORE RELATIONS"]]
submission.to_csv(submission_path)