In [132]:
import pandas as pd
import numpy as np
import datetime
import importlib

import data_utils
import models
import train_utils
import evaluation
importlib.reload(data_utils)
importlib.reload(models)
importlib.reload(train_utils)
importlib.reload(evaluation)

import torch
import torch.nn as nn

### read data and put in batch

In [133]:
BATCH_SIZE = 64
VOCAB_SIZE = len(np.load("./data/vocab.npy"))
data_path = "data"
train_file = "train_real.csv"
val_file = "val.csv"
test_file = "holdout_test.csv"

In [134]:
train_data, val_data = data_utils.prep_train_val(
    data_path, train_file, val_file, batch_size=BATCH_SIZE)

### Get models

In [135]:
m = models.MultiLayerMLP(emb_dim=300)

In [136]:
m

MultiLayerMLP(
  (embedding): Embedding(2043, 300)
  (fcs): ModuleList(
    (0): Linear(in_features=300, out_features=300, bias=True)
  )
  (final_layer): Linear(in_features=300, out_features=46, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

### Training 

In [137]:
best_model, val_loss, val_f1 = train_utils.train(
    train_data, val_data, m)

TypeError: float() argument must be a string or a number, not 'NoneType'

### parameter searching - MLP

In [None]:
models = []
for num_layer in range(1, 6):
    print("number of layers", num_layer)
    model = MultiLayerMLP(num_layers=num_layer)
    best_model, best_val_loss, best_val_f1 = train(train_data, val_data, model)
    models.append({"num_layer": num_layer,
                   "trained_model": best_model, 
                   "best_val_f1": best_val_f1})

In [None]:
pd.DataFrame(models)

In [None]:
models = []
for emb_dim in [10, 20, 40, 80, 100, 150, 200, 250, 300, 400, 500]:
    model = MultiLayerMLP(num_layers=2, emb_dim=emb_dim)
    best_model, best_val_loss, best_val_f1 = train(train_data, val_data, model)
    models.append({"emb_dim": emb_dim,
                   "trained_model": best_model, 
                   "best_val_f1": best_val_f1})

In [None]:
pd.DataFrame(models)

In [None]:
models = []
for p in [0, 0.1, 0.2, 0.3, 0.4]:
    print("p", p)
    model = MultiLayerMLP(num_layers=2, emb_dim=500, p_dropout=p)
    best_model, best_val_loss, best_val_f1 = train(train_data, val_data, model)
    models.append({"dropout": p, 
                   "trained_model": best_model, 
                   "best_val_f1": best_val_f1})

In [None]:
pd.DataFrame(models)

### Parameter Searching Bidirectional LSTM 

In [111]:
emb_dims = [100, 200, 300, 500]

In [None]:
models = []
for emb_dim in emb_dims:
    for dropout in [0, 0.1, 0.2, 0.3, 0.4]:
        print(emb_dim, dropout, bi)
        model = BILSTM(emb_dim=emb_dim, num_layers=2, dropout=dropout, bi=True)
        best_model, best_val_loss, best_val_f1, = train(train_data, val_data, model, lr=1e-2)
        models.append({"emb_dim": emb_dim,
                       "dropout": dropout, 
                       "trained_model": best_model, 
                       "best_val_f1": best_val_f1})

In [None]:
pd.DataFrame(models)

In [95]:
models = []
for unit in [20, 40, 80, 100, 150, 200, 300]:
    print(unit)
    model = BILSTM(emb_dim=500, lstm_unit=unit)
    best_model, best_val_loss, best_val_f1 = train(train_data, val_data, model, lr=1e-2)
    models.append({"lstm_unit": unit,
                   "trained_model": best_model, 
                   "best_val_f1": best_val_f1})

20


  "num_layers={}".format(dropout, num_layers))


Epoch: 0, LR: 0.01, Train Loss: 0.0530, Val Loss: 0.0145, Val F1 score 0.0961
Epoch: 10, LR: 0.01, Train Loss: 0.0089, Val Loss: 0.0116, Val F1 score 0.4486
Epoch: 20, LR: 0.0001, Train Loss: 0.0069, Val Loss: 0.0114, Val F1 score 0.4714
40
Epoch: 0, LR: 0.01, Train Loss: 0.0402, Val Loss: 0.0134, Val F1 score 0.1131
Epoch: 10, LR: 0.001, Train Loss: 0.0058, Val Loss: 0.0102, Val F1 score 0.4989
Epoch: 20, LR: 0.0001, Train Loss: 0.0051, Val Loss: 0.0100, Val F1 score 0.5020
80
Epoch: 0, LR: 0.01, Train Loss: 0.0325, Val Loss: 0.0123, Val F1 score 0.2172
Epoch: 10, LR: 0.001, Train Loss: 0.0040, Val Loss: 0.0111, Val F1 score 0.5272
Epoch: 20, LR: 0.0001, Train Loss: 0.0031, Val Loss: 0.0110, Val F1 score 0.5378
100
Epoch: 0, LR: 0.01, Train Loss: 0.0322, Val Loss: 0.0115, Val F1 score 0.2392
Epoch: 10, LR: 0.01, Train Loss: 0.0023, Val Loss: 0.0089, Val F1 score 0.6197
Epoch: 20, LR: 0.001, Train Loss: 0.0010, Val Loss: 0.0089, Val F1 score 0.6418
150
Epoch: 0, LR: 0.01, Train Loss: 0

In [96]:
pd.DataFrame(models)

Unnamed: 0,best_val_f1,lstm_unit,trained_model
0,0.653468,20,"BILSTM(\n (embedding): Embedding(1687, 500)\n..."
1,0.653468,40,"BILSTM(\n (embedding): Embedding(1687, 500)\n..."
2,0.653468,80,"BILSTM(\n (embedding): Embedding(1687, 500)\n..."
3,0.653468,100,"BILSTM(\n (embedding): Embedding(1687, 500)\n..."
4,0.653468,150,"BILSTM(\n (embedding): Embedding(1687, 500)\n..."
5,0.653468,200,"BILSTM(\n (embedding): Embedding(1687, 500)\n..."
6,0.653468,300,"BILSTM(\n (embedding): Embedding(1687, 500)\n..."


In [97]:
models = []
for unit in [100, 200]:
    for num_layer in [1, 2, 3]:
        print(unit)
        print(num_layer)
        model = BILSTM(emb_dim=500, lstm_unit=unit, num_layers=num_layer)
        best_model, best_val_loss, best_val_f1 = train(train_data, val_data, model)
        models.append({"lstm_unit": unit,
                       "num_layers": num_layer,
                       "trained_model": best_model, 
                       "best_val_f1": best_val_f1})

100
1


  "num_layers={}".format(dropout, num_layers))


Epoch: 0, LR: 0.001, Train Loss: 0.0749, Val Loss: 0.0157, Val F1 score 0.0961
Epoch: 10, LR: 0.001, Train Loss: 0.0114, Val Loss: 0.0101, Val F1 score 0.3485
Epoch: 20, LR: 0.0001, Train Loss: 0.0080, Val Loss: 0.0103, Val F1 score 0.4146
Epoch: 30, LR: 1e-05, Train Loss: 0.0076, Val Loss: 0.0102, Val F1 score 0.4106
100
2
Epoch: 0, LR: 0.001, Train Loss: 0.0649, Val Loss: 0.0150, Val F1 score 0.0961
Epoch: 10, LR: 0.001, Train Loss: 0.0116, Val Loss: 0.0092, Val F1 score 0.4064
Epoch: 20, LR: 0.001, Train Loss: 0.0037, Val Loss: 0.0063, Val F1 score 0.6616
Epoch: 30, LR: 0.001, Train Loss: 0.0015, Val Loss: 0.0060, Val F1 score 0.7089
Epoch: 40, LR: 1e-05, Train Loss: 0.0011, Val Loss: 0.0057, Val F1 score 0.7406
Epoch: 50, LR: 1.0000000000000002e-07, Train Loss: 0.0010, Val Loss: 0.0057, Val F1 score 0.7306
100
3
Epoch: 0, LR: 0.001, Train Loss: 0.0627, Val Loss: 0.0148, Val F1 score 0.0961
Epoch: 10, LR: 0.001, Train Loss: 0.0212, Val Loss: 0.0140, Val F1 score 0.0961
200
1
Epoch: 

In [98]:
pd.DataFrame(models)

Unnamed: 0,best_val_f1,lstm_unit,num_layers,trained_model
0,0.414629,100,1,"BILSTM(\n (embedding): Embedding(1687, 500)\n..."
1,0.740605,100,2,"BILSTM(\n (embedding): Embedding(1687, 500)\n..."
2,0.096096,100,3,"BILSTM(\n (embedding): Embedding(1687, 500)\n..."
3,0.441942,200,1,"BILSTM(\n (embedding): Embedding(1687, 500)\n..."
4,0.734384,200,2,"BILSTM(\n (embedding): Embedding(1687, 500)\n..."
5,0.737609,200,3,"BILSTM(\n (embedding): Embedding(1687, 500)\n..."


In [221]:
### LSTM with added length as additional feature
model = BILSTM_WITH_LEN(emb_dim=300, lstm_unit=150, num_layers=2)
best_model, best_val_loss, best_val_f1 = train(train_data, val_data, model)

Epoch: 0, LR: 0.001, Train Loss: 0.2256, Val Loss: 0.1109, Val F1 score 0.0961
Epoch: 10, LR: 0.001, Train Loss: 0.0357, Val Loss: 0.0577, Val F1 score 0.5142
Epoch: 20, LR: 0.001, Train Loss: 0.0100, Val Loss: 0.0484, Val F1 score 0.6688
Epoch: 30, LR: 0.001, Train Loss: 0.0037, Val Loss: 0.0471, Val F1 score 0.7245
Epoch: 40, LR: 0.001, Train Loss: 0.0017, Val Loss: 0.0488, Val F1 score 0.7277
Epoch: 50, LR: 0.0001, Train Loss: 0.0009, Val Loss: 0.0489, Val F1 score 0.7406


### evaluation

In [220]:
def evaluate(data, m):
    "average f1 score of individual entry"
    m.eval()
    total_f1 = 0
    num_samples = 0
    for x, y in data:
        pred = m(x)
        total_f1 += f1(pred, y)
        num_samples += y.shape[0]
    return(total_f1/num_samples)

    
def f1(y_pred, y_true):
    total_f1 = 0
    y_pred = (torch.sigmoid(y_pred) > 0.5).int().cpu().numpy()  
    batch_size, num_class = y_true.shape    
    for sample_idx in range(batch_size):
        true_idx = np.arange(num_class)[(y_true[sample_idx] == 1).astype(bool)]
        pred_idx = np.arange(num_class)[(y_pred[sample_idx] == 1).astype(bool)]
        # make sure at least to predict one
        assert (y_true[sample_idx].sum() > 0)
        if len(pred_idx) == 0:
            pred_idx = [np.argmax(y_pred[sample_idx]).item()]

        tp = len(np.intersect1d(true_idx, pred_idx))        
        precision = tp/len(pred_idx)
        recall = tp/len(true_idx)
        if (precision + recall) == 0:
            f1_score = 0
        else:
            f1_score = 2 * precision * recall/(precision + recall)
        total_f1 += f1_score
    return total_f1

In [101]:
for m in models:
    print(m)
    print("train f1:, ", evaluate(train_data, m["trained_model"]))
    print("val f1:, ", evaluate(val_data, m["trained_model"]))

{'lstm_unit': 100, 'num_layers': 1, 'trained_model': BILSTM(
  (embedding): Embedding(1687, 500)
  (lstm): LSTM(500, 100, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=200, out_features=46, bias=True)
), 'best_val_f1': 0.41462891462891466}
train f1:,  0.6214038983469036
val f1:,  0.41062491062491063
{'lstm_unit': 100, 'num_layers': 2, 'trained_model': BILSTM(
  (embedding): Embedding(1687, 500)
  (lstm): LSTM(500, 100, num_layers=2, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=200, out_features=46, bias=True)
), 'best_val_f1': 0.7406048906048905}
train f1:,  0.9864298050826548
val f1:,  0.7305948805948805
{'lstm_unit': 100, 'num_layers': 3, 'trained_model': BILSTM(
  (embedding): Embedding(1687, 500)
  (lstm): LSTM(500, 100, num_layers=3, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=200, out_features=46, bias=True)
), 'best_val_f1': 0.0960960960960961}
train f1:,  0.09733530717986677
val f1:,  0.0960960960960961
{'lstm_unit': 200, 'num_layers

### Error analysis

In [None]:
VOCAB_LOOKUP = np.array(TEXT.vocab.itos)

In [None]:
def x2text(seq, onehot=False):
    if onehot:
        idx = np.argmax(seq.cpu(), axis=2)
    else:
        idx = seq.cpu().numpy()
    text = []
    for i in range(seq.shape[1]):
        text.append(VOCAB_LOOKUP[idx[:, i]])
    return text
    

def y2text(manyhot_label):
    labels = np.array(np.load("./data/labels.npy"))
    all_labels = []    
    for i in range(manyhot_label.shape[0]):
        assert manyhot_label[i].sum() > 0
        all_labels.append(labels[manyhot_label[i].astype(bool)])
    return(all_labels)


def error_analysis(data, m, onehot=False):
    for x, y in data:
        pred = m(x)
        batch_size = x.shape[1]
        y_pred = (torch.sigmoid(pred) > 0.5).int().cpu().numpy()
        for i in range(batch_size):
            if y_pred[i, :].sum() == 0:
                y_pred[i, np.argmax(y_pred[i, :])] = 1
        
        correct = np.all(y_pred==y, axis=1)
        print("total correct: {} out of {}".format(correct.sum(), batch_size))
        
        incorrect_idx = np.arange(batch_size)[(1-correct).astype(bool)]

        x_text = x2text(x, onehot=onehot)
        y_label_true = y2text(y)
        y_label_pred = y2text(y_pred)
        
        for i in incorrect_idx:
#             break
            print(x_text[i])
            print("--------------")
            print("true", y_label_true[i])
            print("pred", y_label_pred[i])
            print("correct?", correct[i])
            print()

In [None]:
error_analysis(val_data, models[1]["trained_model"], onehot=False)

### Generate submission file for kaggle 

In [None]:
def get_submission(m):
    m.eval()
    labels = np.load("./data/labels.npy")
    final_labels = []
    tokenize = lambda x: x.split()
    text_field = Field(sequential=True, 
                       tokenize=tokenize, 
                       lower=True, 
                       include_lengths=False)
    tst_datafields = [("ID", RawField()),
                      ("UTTERANCE", TEXT)]
    tst = TabularDataset(
        path="data/original_data/hw1_test.csv", # the file path
        format='csv',
        skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
        fields=tst_datafields)
    test_iter = Iterator(tst, batch_size=len(tst), device='cuda', shuffle=False,
                         sort=False, sort_within_batch=False, repeat=False)
    for batch in test_iter:
        pred = (torch.sigmoid(m(batch.UTTERANCE)) > 0.5).int().cpu()
        for sample in pred:
            if sample.sum() == 0:
                pred_label = [labels[np.argmax(sample)]]
            else:
                pred_label = np.array(labels)[sample==1]
            final_labels.append(" ".join(pred_label))
    test_df = pd.read_csv("./data/original_data/hw1_test.csv")
    test_df["CORE RELATIONS"] = final_labels
    return test_df

In [None]:
test_df = get_submission(models[0]["trained_model"])

In [None]:
today = datetime.datetime.today().strftime("%b%d")
test_df.set_index("ID")[["CORE RELATIONS"]].to_csv(
    "./data/submissions/{}.csv".format(today))