In [309]:
import pandas as pd
import numpy as np
import datetime
import torch
from torchtext.data import TabularDataset, Field, RawField, BucketIterator, Iterator

### read data and put in batch

In [646]:
BATCH_SIZE = 64

In [647]:
tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=False)
# optional for field: pad_first=True to pad in the beginning

LABEL = RawField(preprocessing=lambda x: np.array([int(i) for i in list(x)]))

In [648]:
datafields = [("ID", None), # we won't be needing the id, so we pass in None as the field
              ("text", TEXT),
              ("label", LABEL)]

In [649]:
trn, vld = TabularDataset.splits(
        path="data", 
        train='train_real.csv', validation="val.csv",
        format='csv',
        skip_header=True,
        fields=datafields)

In [650]:
TEXT.build_vocab(trn, vld)

In [651]:
train_iter, val_iter = BucketIterator.splits(
        (trn, vld), # we pass in the datasets we want the iterator to draw data from
        batch_sizes=(BATCH_SIZE, BATCH_SIZE),
        device=torch.device("cuda"), # specify the GPU number here
        sort_key=lambda x: len(x.text), # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False,
        repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)

In [652]:
# batch wrapper
class BatchWrapper:
    def __init__(self, data, x_name, y_name, onehot=False):
        self.data, self.x_name, self.y_name = data, x_name, y_name
        self.onehot = onehot
    
    def one_hot(self, seq_batch,vocab):
        # seq_batch.size() should be [seq,batch] or [batch,]
        # return size() would be [seq,batch,depth] or [batch,depth]
        out = torch.zeros(seq_batch.size()+torch.Size([vocab]))
        dim = len(seq_batch.size())
        index = seq_batch.view(seq_batch.size()+torch.Size([1]))
        return out.scatter_(dim,index,1)
    
    def __iter__(self):
        for batch in self.data:
            x = getattr(batch, self.x_name) # we assume only one input in this wrapper
            if self.onehot:
                x = one_hot(x.cpu(), len(TEXT.vocab)).cuda()
            y = np.stack(getattr(batch, self.y_name))
            y = np.stack(y, axis=0)
            yield (x, y)
    
    def __len__(self):
        return len(self.data)

In [653]:
train_data = BatchWrapper(train_iter, "text", "label")
val_data = BatchWrapper(val_iter, "text", "label")
train_onehot = BatchWrapper(train_iter, "text", "label", onehot=True)
val_onehot = BatchWrapper(val_iter, "text", "label", onehot=True)

### simple linear baseline 

In [388]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [390]:
class MultiLayerMLP(nn.Module):
    def __init__(self, emb_dim=100, hidden_dim=200, output_dim=46):
        super().__init__() # don't forget to call this!
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.fc1 = nn.Linear(emb_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.bn = nn.BatchNorm1d(hidden_dim)
        
    def forward(self, seq):
        emb = self.embedding(seq).sum(dim=0) # sum of word embedding
        # hidden = F.relu(self.bn(self.fc1(emb))) batch norm is no good
        hidden = F.relu(self.fc1(emb))
        preds = self.fc2(hidden)
        return preds

In [389]:
class Baseline(nn.Module):
    def __init__(self, emb_dim=100, hidden_dim=200, output_dim=46):
        super().__init__() # don't forget to call this!
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.linear = nn.Linear(emb_dim, output_dim)
    
    def forward(self, seq):
        emb = self.embedding(seq).sum(dim=0) # sum of word embedding
        preds = self.linear(emb)
        return preds

In [403]:
class OneHotMLP2LSTM(nn.Module):
    """ v1 is to flatten the one hot vector first """
    def __init__(self, vocab=len(TEXT.vocab), emb_dim=100, lstm_unit=100, output_dim=46):
        super().__init__()
        self.fc1 = nn.Linear(vocab, emb_dim)
        self.lstm = nn.LSTM(emb_dim, lstm_unit)
        self.fc2 = nn.Linear(lstm_unit, output_dim)
        
    def forward(self, seq):
        emb = F.relu(self.fc1(seq.view(seq.shape[1], seq.shape[0], -1)))
        hidden, _ = self.lstm(emb.view(emb.shape[1], emb.shape[0], -1))
        preds = self.fc2(hidden[-1, :, :])
        return preds

In [404]:
class LSTM(nn.Module):
    def __init__(self, emb_dim=100, lstm_unit=100, output_dim=46):
        super().__init__() 
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.lstm = nn.LSTM(emb_dim, lstm_unit)
        self.linear = nn.Linear(lstm_unit, output_dim)
    
    def forward(self, seq):
        emb = self.embedding(seq)
        hidden, _ = self.lstm(emb)        
        preds = self.linear(hidden[-1, :, :])
        return preds

In [405]:
class LSTM_ONEHOT(nn.Module):
    def __init__(self, onehot_dim=len(TEXT.vocab), lstm_unit=100, output_dim=46):
        super().__init__() 
        self.lstm = nn.LSTM(onehot_dim, lstm_unit)
        self.linear = nn.Linear(lstm_unit, output_dim)
    
    def forward(self, seq):
        hidden, _ = self.lstm(seq)        
        preds = self.linear(hidden[-1, :, :])
        return preds

In [406]:
class BI_LSTM(nn.Module):
    def __init__(self, emb_dim=100, lstm_unit=100, output_dim=46):
        super().__init__() 
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.lstm = nn.LSTM(emb_dim, lstm_unit, bidirectional=True)
        self.fc = nn.Linear(lstm_unit*2, output_dim)
    
    def forward(self, seq):
        emb = self.embedding(seq)
        lstm_out, _ = self.lstm(emb)
        preds = self.fc(lstm_out[-1, :, :])
        return preds

### Training 

In [407]:
import tqdm

In [447]:
patience = 10
max_epoch = 100
learning_rate = 1e-2
VOCAB = len(TEXT.vocab)
NUM_CLASS = 46
embedding_dim = 10

model = OneHotMLP2LSTM()
model.to(torch.device("cuda"))
loss_func = nn.BCEWithLogitsLoss()

opt = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=opt, factor=0.1, patience=5)

In [448]:
model

OneHotMLP2LSTM(
  (fc1): Linear(in_features=1687, out_features=100, bias=True)
  (lstm): LSTM(100, 100)
  (fc2): Linear(in_features=100, out_features=46, bias=True)
)

In [450]:
no_improvement = 0
best_val_loss = 1e10
best_model = model
for epoch in range(max_epoch):
    if no_improvement > patience:
        break
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x, y in train_onehot: 
        opt.zero_grad()
        preds = model(x)        
        loss = loss_func(preds, torch.tensor(y).type_as(preds))
        loss.backward()
        opt.step()
        running_loss += loss.item() * x.size(0)
        
    epoch_loss = running_loss / len(trn)
    # calculate the validation loss for this epoch
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x, y in val_onehot:
        preds = model(x)
        loss = loss_func(preds, torch.tensor(y).type_as(preds))
        val_loss += loss.item() * x.size(0)
    val_loss /= len(vld)
    if val_loss < best_val_loss:
        no_improvement = 0
        best_val_loss = val_loss
        best_model = model
    else:
        no_improvement += 1
    
    scheduler.step(val_loss)
    
    print('Epoch: {}, LR: {}, Train Loss: {:.4f}, Val Loss: {:.4f}'.format(
        epoch, opt.param_groups[0]['lr'], epoch_loss, val_loss))

Epoch: 0, LR: 0.01, Train Loss: 0.0235, Val Loss: 0.0121
Epoch: 1, LR: 0.01, Train Loss: 0.0192, Val Loss: 0.0109
Epoch: 2, LR: 0.01, Train Loss: 0.0172, Val Loss: 0.0090
Epoch: 3, LR: 0.01, Train Loss: 0.0150, Val Loss: 0.0081
Epoch: 4, LR: 0.01, Train Loss: 0.0130, Val Loss: 0.0085
Epoch: 5, LR: 0.001, Train Loss: 0.0115, Val Loss: 0.0072
Epoch: 6, LR: 0.001, Train Loss: 0.0108, Val Loss: 0.0071
Epoch: 7, LR: 0.001, Train Loss: 0.0101, Val Loss: 0.0072
Epoch: 8, LR: 0.001, Train Loss: 0.0099, Val Loss: 0.0071
Epoch: 9, LR: 0.001, Train Loss: 0.0098, Val Loss: 0.0070
Epoch: 10, LR: 0.001, Train Loss: 0.0097, Val Loss: 0.0070
Epoch: 11, LR: 0.0001, Train Loss: 0.0096, Val Loss: 0.0069
Epoch: 12, LR: 0.0001, Train Loss: 0.0093, Val Loss: 0.0070
Epoch: 13, LR: 0.0001, Train Loss: 0.0093, Val Loss: 0.0070
Epoch: 14, LR: 0.0001, Train Loss: 0.0093, Val Loss: 0.0070
Epoch: 15, LR: 0.0001, Train Loss: 0.0090, Val Loss: 0.0070
Epoch: 16, LR: 0.0001, Train Loss: 0.0094, Val Loss: 0.0070
Epoch:

In [494]:
def evaluate(data, m):
    "average f1 score of individual entry"
    total_f1 = 0
    num_samples = 0
    for x, y in data:
        pred = m(x)
        total_f1 += f1(pred, y)
        num_samples += x.size()[1]
    print(total_f1/num_samples)

    
def f1(y_pred, y_true):
    total_f1 = 0
    y_pred = (torch.sigmoid(y_pred) > 0.5).int().cpu()    
    for sample_idx in range(y_pred.size()[0]):
        true_idx = np.arange(len(y_true[sample_idx]))[y_true[sample_idx] == 1]
        pred_idx = np.arange(len(y_pred[sample_idx]))[y_pred[sample_idx] == 1]
        # make sure at least to predict one
        assert (y_true[sample_idx].sum() > 0)
        if len(pred_idx) == 0:
            pred_idx = [np.argmax(y_pred[sample_idx]).item()]

        tp = len(np.intersect1d(true_idx, pred_idx))        
        precision = tp/len(pred_idx)
        recall = tp/len(true_idx)
        if (precision + recall) == 0:
            f1_score = 0
        else:
            f1_score = 2 * precision * recall/(precision + recall)
        
#         print(true_idx)
#         print(pred_idx)
#         print(f1_score)
        
        total_f1 += f1_score
        break
    return total_f1

In [495]:
evaluate(train_onehot, best_model)

0.011226252158894647


In [679]:
evaluate(val_onehot, best_model)

0.012012012012012012


### Error analysis

In [548]:
VOCAB = np.array(TEXT.vocab.itos)

In [615]:
np.array([0,1]).astype(bool)

array([False,  True])

In [694]:
def x2text(seq):
    idx = np.argmax(seq.cpu(), axis=2)
    text = []
    for i in range(seq.shape[1]):
        text.append(VOCAB[idx[:, i]])
    return text
    

def y2text(manyhot_label):
    labels = np.array(np.load("./data/labels.npy"))
    all_labels = []    
    for i in range(manyhot_label.shape[0]):
        if manyhot_label[i].sum() == 0:
            all_labels.append([labels[np.argmax(manyhot_label[i])]])
        else:
            all_labels.append(labels[manyhot_label[i].astype(bool)])
    return(all_labels)


def error_analysis(data, m):
    for x, y in data:
        pred = m(x)
        batch_size = x.shape[1]
        y_pred = (torch.sigmoid(pred) > 0.5).int().cpu().numpy()
        
        correct = np.all(y_pred==y, axis=1)
        print("total correct: {} out of {}".format(correct.sum(), batch_size))
        
        incorrect_idx = np.arange(batch_size)[1-correct]

        x_text = x2text(x)
        y_label_true = y2text(y)
        y_label_pred = y2text(y_pred)
        
        for i in incorrect_idx:
#             break
            print(x_text[i])
            print("--------------")
            print("true", y_label_true[i])
            print("pred", y_label_pred[i])
            print("correct?", correct[i])
            print()
        break

In [695]:
error_analysis(train_onehot, best_model)

total correct: 34 out of 64
['who' 'played' 'opposite' 'sidney' 'poitier' 'in' 'shoot' 'to' 'kill'
 '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>']
--------------
true ['movie.starring.actor' 'movie.starring.character']
pred ['movie.starring.actor']
correct? False

['who' 'played' 'opposite' 'sidney' 'poitier' 'in' 'shoot' 'to' 'kill'
 '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>']
--------------
true ['movie.starring.actor' 'movie.starring.character']
pred ['movie.starring.actor']
correct? False

['show' 'me' 'the' 'comedy' 'pitch' 'perfect' '<pad>' '<pad>' '<pad>'
 '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>']
--------------
true ['movie.genre']
pred ['NO_REL']
correct? False

['show' 'me' 'the' 'comedy' 'pitch' 'perfect' '<pad>' '<pad>' '<pad>'
 '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>' '<pad>']
--------------
true ['movie.genre']
pred ['NO_REL']
correct? False

['who' 'played' 'oppos

### Generate submission file for kaggle 

In [155]:
def get_submission(m):
    labels = np.load("./data/labels.npy")
    final_labels = []
    tokenize = lambda x: x.split()
    text_field = Field(sequential=True, 
                       tokenize=tokenize, 
                       lower=True, 
                       include_lengths=False)
    tst_datafields = [("ID", RawField()),
                      ("UTTERANCE", TEXT)]
    tst = TabularDataset(
        path="data/original_data/hw1_test.csv", # the file path
        format='csv',
        skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
        fields=tst_datafields)
    test_iter = Iterator(tst, batch_size=len(tst), device='cuda', shuffle=False,
                         sort=False, sort_within_batch=False, repeat=False)
    for batch in test_iter:
        pred = (torch.sigmoid(m(batch.UTTERANCE)) > 0.5).int().cpu()
        for sample in pred:
            if sample.sum() == 0:
                pred_label = [labels[np.argmax(sample)]]
            else:
                pred_label = np.array(labels)[sample==1]
            final_labels.append(" ".join(pred_label))
    test_df = pd.read_csv("./data/original_data/hw1_test.csv")
    test_df["CORE RELATIONS"] = final_labels
    return test_df

In [156]:
test_df = get_submission(best_model)

In [167]:
today = datetime.datetime.today().strftime("%b%d")
test_df.set_index("ID")[["CORE RELATIONS"]].to_csv(
    "./data/submissions/bi-lstm_{}.csv".format(today))

'Jan21'