In [67]:
import pandas as pd
import numpy as np
import datetime
import importlib
import pickle

import data_utils
import model_utils
import train_utils
import evaluation
# Re-import the project modules so that edits made on disk after the kernel
# started are picked up without restarting it.
for _module in (data_utils, model_utils, train_utils, evaluation):
    importlib.reload(_module)

import torch
import torch.nn as nn
from torchtext.data import TabularDataset, Field, RawField, BucketIterator, Iterator

In [103]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU so the
# notebook still runs end to end on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Forwarded to data_utils.prep_all_data as `use_holdout_test`.
HOLDOUT = False

### Read data and build batches

In [104]:
# Build the train / validation / test splits on DEVICE using n-gram features
# (x_type="ngram", ngram=2).
train_data, val_data, test_data = data_utils.prep_all_data(
    device=DEVICE,
    use_holdout_test=HOLDOUT,
    x_type="ngram",
    ngram=2,
)

### Get models

In [23]:
# Instantiate the GRU model from model_utils with its default arguments.
# NOTE(review): `m` is reassigned by every training loop further down, so this
# instance is not the one trained or evaluated in the visible cells.
m = model_utils.GRU()

### Training 

In [82]:
# One-hidden-layer BaseModelNGram over the combined 1-gram + 2-gram vocabulary.
# The tuple in the loop lists the hidden sizes to try (currently only 600).
importlib.reload(model_utils)
vocab = [*np.load("./data/1grams.npy"), *np.load("./data/2grams.npy")]

results = []
for hidden_dim in (600,):
    print(hidden_dim)
    # The input width of the model is the vocabulary size.
    m = model_utils.BaseModelNGram(len(vocab), hidden_dim=hidden_dim)
    result = train_utils.train(
        train_data,
        val_data,
        m,
        device=DEVICE,
        lr=1e-2,
        print_freq=5,
        max_epoch=100,
    )
    results.append(result)

600
Epoch: 0, LR: 0.01, Train Loss: 248.7706, Val Loss: 99.9745, Val f1 0.734
Epoch: 5, LR: 0.01, Train Loss: 1.6081, Val Loss: 64.8234, Val f1 0.822
Epoch: 10, LR: 0.01, Train Loss: 0.4995, Val Loss: 68.9143, Val f1 0.826
Epoch: 15, LR: 0.001, Train Loss: 0.3478, Val Loss: 71.8305, Val f1 0.829
Epoch: 20, LR: 0.001, Train Loss: 0.1624, Val Loss: 72.7420, Val f1 0.828
Epoch: 25, LR: 0.001, Train Loss: 0.2038, Val Loss: 71.1568, Val f1 0.825


In [83]:
# Show what the last train_utils.train call of the loop above returned
# (`result` keeps the value from the final loop iteration).
result

{'train f1 score': 0.9995065383666419,
 'train loss': 0.20380361552186846,
 'trained_model': BaseModelNGram(
   (fc1): Linear(in_features=8927, out_features=600, bias=True)
   (fc3): Linear(in_features=600, out_features=46, bias=True)
   (batchnorm1): BatchNorm1d(600, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
 ),
 'val f1 score': 0.8441441441441441,
 'val loss': 71.15677584398973}

In [78]:
# Keep the last run's result as the model used for evaluation and submission.
# NOTE(review): assuming the result is a plain dict (as its repr suggests),
# .copy() is shallow, so best_model["trained_model"] is the very same model
# object that is stored in `results`.
best_model = results[-1].copy()

In [80]:
# Score of the selected model on the training split.
evaluation.calculate_f1(train_data, best_model["trained_model"])

0.9996299037749815

In [81]:
# Score of the selected model on the validation split.
evaluation.calculate_f1(val_data, best_model["trained_model"])

0.8431431431431431

In [87]:
# Score of the selected model on the test split.
# NOTE(review): this cell was last run (In [87]) before the data cell was
# re-executed (In [104]) with HOLDOUT = False, and the test rows shown later in
# this notebook carry the placeholder label "dummy" — confirm the split has
# real labels before trusting this number.
evaluation.calculate_f1(test_data, best_model["trained_model"])

0.8258258258258259

In [48]:
# Hidden-size sweep (100-400) of BaseModelNGram with p=0.2 on the 1-gram
# vocabulary.
# NOTE(review): the previous header read "same as above, but with batch norm",
# yet the only differences from the first training cell visible here are p=0.2
# (presumably a dropout probability — confirm in model_utils) and the
# 1-gram-only vocabulary.
# NOTE(review): the data cell builds features with ngram=2 while this vocab
# holds only 1-grams; confirm the input width matches before re-running the
# notebook top to bottom.
importlib.reload(model_utils)
vocab = np.load("./data/1grams.npy")

results = []
for hidden_dim in (100, 200, 300, 400):
    print(hidden_dim)
    m = model_utils.BaseModelNGram(len(vocab), hidden_dim=hidden_dim, p=0.2)
    result = train_utils.train(
        train_data,
        val_data,
        m,
        device=DEVICE,
        lr=1e-2,
        print_freq=5,
        max_epoch=100,
    )
    results.append(result)

100
Epoch: 0, LR: 0.01, Train Loss: 344.8926, Val Loss: 124.3824, Val f1 0.638
Epoch: 5, LR: 0.01, Train Loss: 9.1615, Val Loss: 65.6312, Val f1 0.816
Epoch: 10, LR: 0.01, Train Loss: 3.0905, Val Loss: 83.9289, Val f1 0.819
Epoch: 15, LR: 0.001, Train Loss: 2.2632, Val Loss: 92.3204, Val f1 0.812
Epoch: 20, LR: 0.001, Train Loss: 0.7796, Val Loss: 92.0266, Val f1 0.810
200
Epoch: 0, LR: 0.01, Train Loss: 272.1828, Val Loss: 111.0880, Val f1 0.689
Epoch: 5, LR: 0.01, Train Loss: 6.6243, Val Loss: 74.4376, Val f1 0.808
Epoch: 10, LR: 0.01, Train Loss: 2.3046, Val Loss: 92.5186, Val f1 0.815
Epoch: 15, LR: 0.001, Train Loss: 1.1693, Val Loss: 89.8095, Val f1 0.817
Epoch: 20, LR: 0.001, Train Loss: 0.7890, Val Loss: 91.4343, Val f1 0.826
Epoch: 25, LR: 0.0001, Train Loss: 0.5500, Val Loss: 92.6521, Val f1 0.820
Epoch: 30, LR: 0.0001, Train Loss: 0.5602, Val Loss: 93.5865, Val f1 0.826
300
Epoch: 0, LR: 0.01, Train Loss: 247.2898, Val Loss: 99.3908, Val f1 0.742
Epoch: 5, LR: 0.01, Train Lo

In [50]:
# Tabulate the collected run results, one row per entry in `results`.
# NOTE(review): the recorded output has two rows while the sweep above tries
# four hidden sizes — presumably stale output from a different run.
pd.DataFrame(results)

Unnamed: 0,train f1 score,train loss,trained_model,val f1 score,val loss
0,0.601394,3.370253,BaseModelNGram(\n (fc1): Linear(in_features=1...,0.492693,253.025268
1,0.665347,1.808499,BaseModelNGram(\n (fc1): Linear(in_features=1...,0.555656,238.746633


In [60]:
# Single run of BaseModelNGram with hidden_dim=400 on the 1-gram vocabulary.
# NOTE(review): the previous header read "same as above, but with two hidden
# layers", but the layer count is decided inside model_utils.BaseModelNGram,
# not in this cell; the repr recorded below shows fc1 -> fc3 with a single
# BatchNorm1d, i.e. one hidden layer.
# NOTE(review): the data cell builds features with ngram=2 while this vocab
# holds only 1-grams; confirm the input width matches before re-running the
# notebook top to bottom.
importlib.reload(model_utils)
vocab = np.load("./data/1grams.npy")

results = []
for hidden_dim in (400,):
    print(hidden_dim)
    m = model_utils.BaseModelNGram(len(vocab), hidden_dim=hidden_dim)
    result = train_utils.train(
        train_data,
        val_data,
        m,
        device=DEVICE,
        lr=1e-2,
        print_freq=5,
        max_epoch=100,
    )
    results.append(result)

400
Epoch: 0, LR: 0.01, Train Loss: 235.9029, Val Loss: 99.8498, Val f1 0.705
Epoch: 5, LR: 0.01, Train Loss: 3.9677, Val Loss: 69.1205, Val f1 0.830
Epoch: 10, LR: 0.01, Train Loss: 3.8108, Val Loss: 91.8424, Val f1 0.811


In [61]:
# Show what the train_utils.train call for hidden_dim=400 returned.
result

{'train f1 score': 0.9934122871946703,
 'train loss': 2.6497786963740424,
 'trained_model': BaseModelNGram(
   (fc1): Linear(in_features=1977, out_features=400, bias=True)
   (fc3): Linear(in_features=400, out_features=46, bias=True)
   (batchnorm1): BatchNorm1d(400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
 ),
 'val f1 score': 0.8324324324324324,
 'val loss': 95.39459948496776}

### Parameter searching - MLP
Best model so far: 3 middle (hidden) layers, 100 hidden units, dropout = 0.2 at the second-to-last layer.

Saved as `./data/model_checkpoints/MLP_Jan23.mdl`.

### Parameter Searching Bidirectional LSTM 

### Evaluation

In [1112]:
# Scratch check of nn.Embedding: a lookup table with 10 entries of dimension 3.
emb = nn.Embedding(10, 3)

In [1114]:
# Inspect the freshly initialised (10, 3) weight matrix of the embedding.
emb.weight

Parameter containing:
tensor([[-0.0974,  0.2542,  0.8995],
        [ 0.3284, -1.3262,  0.2224],
        [-0.1773, -0.5326,  0.5931],
        [-0.4633,  0.8940,  0.3648],
        [ 2.7178, -0.6496, -1.2509],
        [ 2.0111,  0.7471, -2.4884],
        [ 0.8162,  0.1524,  0.8900],
        [-0.5783, -1.2482,  1.3301],
        [-0.9398, -0.9525, -1.8860],
        [-0.1170,  0.2772,  1.4776]], requires_grad=True)

### Generate submission file for Kaggle

In [88]:
def get_submission(m, test_data, labels=None, threshold=0.5):
    """Build the submission table from a trained multi-label model.

    Args:
        m: trained model. It is called as ``m((x, extra["raw_text"]))`` and
            must return a 2-D tensor of per-class logits (batch, classes).
        test_data: iterable of ``(x, y, extra)`` batches, where ``extra`` is a
            mapping that provides ``"raw_text"`` and ``"ID"`` for the batch.
        labels: optional sequence of label names indexed by class id. When
            omitted, it is loaded from ``./data/labels.npy``.
        threshold: sigmoid probability a class must exceed to be predicted.

    Returns:
        pandas.DataFrame with the columns ``ID`` and ``CORE RELATIONS`` (the
        predicted label names joined by single spaces), one row per example.
        The columns are present even when ``test_data`` is empty.
    """
    m.eval()
    if labels is None:
        labels = np.load("./data/labels.npy")

    rows = []
    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
        for x, _, extra in test_data:
            probs = torch.sigmoid(m((x, extra["raw_text"]))).cpu().numpy()
            for i, example_probs in enumerate(probs):
                pred_idx = [j for j, p in enumerate(example_probs) if p > threshold]
                if not pred_idx:
                    # Nothing cleared the threshold: fall back to the single
                    # most probable class. The argmax has to be taken over the
                    # probabilities; the thresholded 0/1 vector is all zeros
                    # here and would always select class 0.
                    pred_idx = [int(np.argmax(example_probs))]
                pred_label = [labels[j] for j in pred_idx]
                if len(pred_label) > 1:
                    # "NO_REL" is only meaningful on its own.
                    pred_label = [name for name in pred_label if name != "NO_REL"]
                rows.append(
                    {"ID": int(extra["ID"][i]),
                     "CORE RELATIONS": " ".join(pred_label)})
    return pd.DataFrame(rows, columns=["ID", "CORE RELATIONS"])

In [105]:
# Predict labels for the test split and index the table by ID, in ID order.
label_df = (
    get_submission(best_model["trained_model"], test_data)
    .sort_values(by="ID")
    .set_index("ID")
)

In [107]:
# Load the text and raw label columns of each split, indexed by ID.
df_test, df_train, df_val = [
    pd.read_csv(csv_path, index_col="ID")[["text", "raw_label"]]
    for csv_path in ("./data/test.csv", "./data/train_real.csv", "./data/val.csv")
]

In [84]:
# Tag the submission file with abbreviated month + day, e.g. "Jan23".
today = datetime.datetime.now().strftime("%b%d")
# NOTE(review): the file name says "GRU", but label_df is built from
# best_model, whose repr recorded in this notebook is a BaseModelNGram —
# confirm the name matches the model actually being submitted.
label_df.to_csv("./data/submissions/{}GRU.csv".format(today))

In [108]:
# Put each predicted label string next to its test question; both frames are
# indexed by ID, so the join lines them up on that index.
label_df.join(df_test)

Unnamed: 0_level_0,CORE RELATIONS,text,raw_label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,movie.starring.actor,star of thor,dummy
1,movie.starring.actor,who is in the movie the campaign,dummy
2,movie.starring.actor,list the cast of the movie the campaign,dummy
3,movie.starring.actor,who was in twilight,dummy
4,movie.starring.actor,who is in vulguria,dummy
5,movie.starring.actor,actor from lost,dummy
6,movie.starring.actor,who played in the movie rocky,dummy
7,movie.starring.actor,who played in the movie captain america,dummy
8,movie.starring.actor,cast and crew for in july,dummy
9,movie.starring.actor,who is in movie in july,dummy


In [94]:
# NOTE(review): `x` is not assigned in any visible cell. Judging by the
# recorded output it is a predictions table joined with text / raw_label —
# define it explicitly so this cell survives Restart & Run All.
x

Unnamed: 0_level_0,CORE RELATIONS,text,raw_label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
24,movie.starring.actor,show me movie credits,dummy
25,NO_REL,who does the voices for the movie,dummy
67,movie.starring.actor,what country was black swan from,dummy
84,NO_REL,when was spaceballs released,dummy
86,actor.gender movie.starring.actor,when was pretty woman released,dummy
91,movie.starring.actor,release date for finding nemo,dummy
94,movie.starring.actor,when was freedom writers released,dummy
99,NO_REL,when was date night released,dummy
130,movie.starring.actor,who is the second director on finding nemo,dummy
136,movie.starring.actor,i want to know who is the director of the movi...,dummy


In [106]:
def other(x):
    """Return True when ``"other"`` occurs in ``x`` (Python ``in`` test)."""
    return "other" in x


def norel(x):
    """Return True when ``"NO_REL"`` occurs in ``x`` (Python ``in`` test)."""
    return "NO_REL" in x

In [124]:
# Rows where either the prediction or the reference label mentions NO_REL.
x[
    x["CORE RELATIONS"].apply(norel)
    | x["raw_label"].apply(norel)
]

Unnamed: 0_level_0,CORE RELATIONS,text,raw_label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
147,NO_REL,who is in barfi,movie.starring.actor
219,NO_REL,how much did it take to make the avengers,movie.estimated_budget
500,NO_REL,who is the director of ghost busters,movie.directed_by
634,NO_REL,what is the movie category,movie.genre
700,NO_REL,what language was aguirre wrath of god in,movie.language
736,NO_REL,life is beautiful movie page,NO_REL
751,NO_REL,i would like to see will ferrell movies,movie.starring.actor
951,NO_REL,show me information on movie,NO_REL
985,NO_REL,search for movies by language,movie.language
985,NO_REL,search for movies by language,movie.language


In [125]:
# Display the contents of the restricted label vocabulary file.
np.load("./data/label_vocab_restrict.npy")

array(['actor', 'amount', 'award', 'budget', 'by', 'category',
       'character', 'companies', 'country', 'date', 'description',
       'directed', 'director', 'genre', 'gross', 'language', 'locations',
       'media', 'movie', 'music', 'of', 'other', 'person', 'picture',
       'produced', 'production', 'rating', 'release', 'revenue', 'review',
       'showing', 'star', 'starring', 'synopsis', 'trailer', 'winning',
       'work', 'written'], dtype='<U11')