In [None]:
!pip install fasttext transformers sacremoses memory-profiler
!pip install -U kaleido
from memory_profiler import memory_usage
from matplotlib import pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import StratifiedKFold
from google.colab import drive
import pandas as pd
import plotly.express as px
import os
import json
import spacy
import re
import pickle
import sklearn
from pathlib import Path
import time
import fasttext
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
import numpy as np
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score  
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingGridSearchCV
from transformers import AutoTokenizer, AutoModel
from copy import deepcopy
import warnings
import json
from torch.utils.tensorboard import SummaryWriter
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Register tqdm's pandas integration (progress-bar variants of apply/map).
tqdm.pandas()

## Description
This notebook presents the results of binary (positive/negative) classification of reviews from the Booking.com dataset. The classification was performed with MLP, convolutional and LSTM classifiers on top of embedding models trained with different techniques on multiple language corpora.

In [None]:
# --- Locations on the mounted Google Drive ---------------------------------
drive_path = '/content/drive/'
BASE_DIR = f"{drive_path}MyDrive/ColabNotebooks/studia/L3"
drive.mount(drive_path, force_remount=True)
log_dir = f"{BASE_DIR}/logs/"
FIGURES_DIR = f"{BASE_DIR}/figures"

# TensorBoard writer pointing at the (possibly freshly created) log directory.
os.makedirs(log_dir, exist_ok=True)
writer = SummaryWriter(log_dir)

booking_path = os.path.join(BASE_DIR, "booking.txt")
dataframes_path = os.path.join(BASE_DIR, "dataframes")

# (pickle path, display name) for every embedding variant that is evaluated.
EMBEDDING_FILES = [
    (os.path.join(dataframes_path, file_name), label)
    for file_name, label in (
        ("clean_inputs_clean_booking_cbow_sampled.tar.gz", "Booking CBOW"),
        ("clean_inputs_clean_booking_skipgram_sampled.tar.gz", "Booking Skip-Gram"),
        ("clean_inputs_clean_oscar_cbow_sampled.tar.gz", "Oscar CBOW"),
        ("clean_inputs_clean_oscar_skipgram_sampled.tar.gz", "Oscar Skip-Gram"),
        # ("clean_herbert.pkl", "HerBERT"),  TODO: sample
        ("clean_kgr10_sampled.tar.gz", "KGR10 CBOW"),
    )
]

Mounted at /content/drive/


In [None]:
def get_training_data(df, column, max_samples):
    """Draw a fixed-seed sample of the non-null entries of one column.

    Args:
        df: DataFrame holding the column.
        column: Name of the column to read.
        max_samples: Number of rows to draw (passed to ``DataFrame.sample``
            as ``n``).

    Returns:
        ``np.ndarray`` built from the list of sampled values.
    """
    has_value = df[column].notnull()
    sampled = df.loc[has_value].sample(n=max_samples, random_state=1)
    return np.array(sampled[column].values.tolist())

def load_embeddings(path, max_samples_per_class=100):
    """Load a pickled DataFrame and build a labelled feature/label pair.

    Rows sampled from the ``vec_positive`` column get label 1 and rows sampled
    from ``vec_negative`` get label 0; positives come first in the output.

    Args:
        path: Location of the pickled DataFrame. It is read with
            ``pd.read_pickle``, so it should only point at trusted files.
        max_samples_per_class: Number of rows sampled from each column.

    Returns:
        Tuple ``(X, y)`` where ``X`` is the concatenation of both samples
        along axis 0 and ``y`` has shape ``(len(X), 1)``.
    """
    frame = pd.read_pickle(path)
    labelled_columns = ((1, "vec_positive"), (0, "vec_negative"))

    feature_blocks = [
        get_training_data(frame, column, max_samples_per_class)
        for _, column in labelled_columns
    ]
    label_blocks = [
        np.full((block.shape[0], 1), label)
        for (label, _), block in zip(labelled_columns, feature_blocks)
    ]
    return np.concatenate(feature_blocks, axis=0), np.concatenate(label_blocks, axis=0)

## Model klasyfikacji

In [None]:
class training_set(torch.utils.data.Dataset):
    """In-memory dataset pairing feature rows with their labels as float tensors."""

    def __init__(self, X, Y):
        # Both numpy arrays are converted once, up front, to float tensors.
        features = torch.from_numpy(X)
        targets = torch.from_numpy(Y)
        self.X = features.float()
        self.Y = targets.float()

    def __len__(self):
        """Number of samples (length of the feature tensor)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``[features, label]`` for the requested index."""
        sample, target = self.X[idx], self.Y[idx]
        return [sample, target]

def to_dl(x, y):
    """Wrap feature/label arrays in a sequential (unshuffled) DataLoader with batches of 32."""
    return torch.utils.data.DataLoader(
        dataset=training_set(x, y),
        batch_size=32,
        shuffle=False,
    )

In [None]:
class BookingClassifier(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin):
    """Scikit-learn compatible base class for the PyTorch review classifiers.

    Subclasses build ``self.model`` in ``init_model`` and set ``self.epochs``
    in their constructor; ``adjust`` lets them reshape a batch before it is
    fed to the network. The network is trained with ``BCELoss``, so its output
    must be a probability with the same shape as the label batch.
    """

    def __init__(self, **kwargs):
        # Placeholder; the real network is created by init_model() inside fit().
        self.model = ...

    def init_model(self):
        """Create ``self.model``. No-op here; overridden by subclasses."""
        pass

    def _model_device(self):
        """Return the device holding the model's parameters (CPU if it has none)."""
        first_param = next(self.model.parameters(), None)
        if first_param is None:
            return torch.device("cpu")
        return first_param.device

    def _predict_scores(self, X):
        """Run the network on the numpy array ``X`` and return its output as a CPU tensor.

        The model is switched to eval mode so that dropout is disabled, and
        gradients are not tracked because the output is only used for scoring.
        """
        self.model.eval()
        with torch.no_grad():
            batch = torch.from_numpy(X).float().to(self._model_device())
            return self.model(self.adjust(batch)).detach().cpu()

    def fit(self, X, y):
        """Train a freshly initialised model with Adam and binary cross-entropy.

        Args:
            X: numpy array of input features.
            y: numpy array of labels, shaped like the model output.

        Returns:
            ``self``, as required by the scikit-learn estimator protocol.
        """
        self.init_model()
        # Reshuffle every epoch. The arrays built by load_embeddings are ordered
        # by class (all positives, then all negatives), so sequential batches
        # would each contain a single class. The loader is built here because
        # to_dl's loader is unshuffled.
        train_dl = torch.utils.data.DataLoader(training_set(X, y), shuffle=True, batch_size=32)
        device = self._model_device()
        optimiser = optim.Adam(self.model.parameters())
        loss_fn = torch.nn.BCELoss()
        bar = tqdm(range(self.epochs))
        for epoch in bar:
            self.model.train()
            for X_batch, y_batch in train_dl:
                X_batch = self.adjust(X_batch)
                y_pred = self.model(X_batch.to(device))
                loss = loss_fn(y_pred, y_batch.to(device))

                loss.backward()
                optimiser.step()
                optimiser.zero_grad()
                bar.set_description(f"Loss: {loss.item():.4f}")
        return self

    def predict(self, X):
        """Return the model outputs for ``X`` rounded to 0/1, as a numpy array."""
        return torch.round(self._predict_scores(X)).numpy()

    def predict_proba(self, X):
        """Return the raw model outputs for ``X`` as a numpy array.

        NOTE(review): this is the network's single output column, not the
        two-column ``(n_samples, n_classes)`` layout scikit-learn documents.
        """
        return self._predict_scores(X).numpy()

    def adjust(self, X):
        """Hook for reshaping a batch before the forward pass; identity by default."""
        return X

In [None]:
class MLP(BookingClassifier):
    """Feed-forward classifier applied to the mean over dim 1 of each sample.

    Args:
        epochs: Number of training epochs.
        in_dim: Size of the input feature vector.
        hid_dim: Width of every hidden layer.
        layers: Number of hidden (Linear + ReLU) layers.
    """

    def __init__(self, epochs=20, in_dim=100, hid_dim=50, layers=2, **kwargs):
        super(MLP, self).__init__(**kwargs)
        self.hid_dim = hid_dim
        self.layers = layers
        self.epochs = epochs
        self.in_dim = in_dim

    def init_model(self):
        """Build ``layers`` hidden blocks followed by a 1-unit sigmoid output, on the GPU."""
        self.model = nn.Sequential()
        curr_in = self.in_dim
        for i in range(self.layers):
            self.model.add_module(f"lin_{i}", torch.nn.Linear(curr_in, self.hid_dim))
            self.model.add_module(f"lin_{i}_act", torch.nn.ReLU())
            curr_in = self.hid_dim

        self.model.add_module("lin_last", torch.nn.Linear(curr_in, 1))
        self.model.add_module("last_sigm", torch.nn.Sigmoid())
        self.model.cuda()

    def adjust(self, X):
        """Average each sample over dim 1, keeping the batch axis.

        The mean already removes dim 1. A trailing ``squeeze()`` would also
        drop the batch axis for a batch of one, making the model output shape
        disagree with the label batch.
        """
        return torch.mean(X, dim=1)

    def fit(self, X, y):
        """Delegate to ``BookingClassifier.fit``."""
        return super().fit(X, y)

In [None]:
class LSTMReshape(nn.Module):
    """Adapter that keeps only the last step (index -1 along dim 1) of an LSTM output."""

    def forward(self, x):
        # x is the (output, hidden_state) pair; the hidden state is discarded.
        sequence_output, _ = x
        last_step = sequence_output[:, -1]
        return last_step


class RNN(BookingClassifier):
    """LSTM classifier: LSTM -> last time step -> tanh -> linear -> sigmoid.

    Args:
        epochs: Number of training epochs.
        in_dim: Input size of the LSTM.
        hid_dim: Hidden size of the LSTM.
        layers: Number of stacked LSTM layers.
        lin_hid: Stored only; the layer that used it is commented out.
        bi: Whether the LSTM is bidirectional.
        dropout: Dropout value passed to the LSTM.
    """

    def __init__(self, epochs=20, in_dim=100, hid_dim=50, layers=2, lin_hid=128, bi=True, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        self.epochs = epochs
        self.in_dim = in_dim
        self.hid_dim = hid_dim
        self.layers = layers
        self.lin_hid = lin_hid
        self.bi = bi
        self.dropout = dropout

    def init_model(self):
        """Assemble the network as a named ``nn.Sequential`` and move it to the GPU."""
        self.model = nn.Sequential()
        recurrent = torch.nn.LSTM(
            input_size=self.in_dim,
            hidden_size=self.hid_dim,
            num_layers=self.layers,
            bidirectional=self.bi,
            dropout=self.dropout,
            batch_first=True,
        )
        stages = [
            ("lstm", recurrent),
            ("reshape", LSTMReshape()),
            ("lstm_act", torch.nn.Tanh()),
            # An intermediate LazyLinear(lin_hid) + ReLU stage is currently disabled.
            ("lin_last", torch.nn.LazyLinear(1)),
            ("last_sigm", torch.nn.Sigmoid()),
        ]
        for stage_name, stage in stages:
            self.model.add_module(stage_name, stage)
        self.model.cuda()

    def fit(self, X, y):
        """Delegate to ``BookingClassifier.fit``."""
        return super().fit(X, y)

In [None]:
class Conv(BookingClassifier):
    """1-D convolutional classifier: conv stack -> max-pool -> flatten -> linear -> sigmoid.

    Args:
        epochs: Number of training epochs.
        in_dim: Number of input channels of the first convolution.
        hid_ch: Number of output channels of every convolution.
        layers: Number of (Conv1d + ReLU) blocks.
        kernel_size: Kernel size of the convolutions and of the max-pool.
        lin_hid: Stored only; the layer that used it is commented out.
    """

    def __init__(self, epochs=20, in_dim=100, hid_ch=50, layers=2, kernel_size=2, lin_hid=128, **kwargs):
        super().__init__(**kwargs)
        self.epochs = epochs
        self.in_dim = in_dim
        self.hid_ch = hid_ch
        self.layers = layers
        self.kernel_size = kernel_size
        self.lin_hid = lin_hid

    def init_model(self):
        """Assemble the network as a named ``nn.Sequential`` and move it to the GPU."""
        self.model = nn.Sequential()
        # Channel sizes at the boundaries of consecutive convolutions.
        channels = [self.in_dim] + [self.hid_ch] * self.layers
        for i, (ch_in, ch_out) in enumerate(zip(channels, channels[1:])):
            self.model.add_module(f"conv_{i}", torch.nn.Conv1d(ch_in, ch_out, self.kernel_size))
            self.model.add_module(f"conv_{i}_act", torch.nn.ReLU())

        tail = [
            ("pool", torch.nn.MaxPool1d(self.kernel_size)),
            ("flatten", torch.nn.Flatten()),
            # An intermediate LazyLinear(lin_hid) + ReLU stage is currently disabled.
            ("lin_last", torch.nn.LazyLinear(1)),
            ("last_sigm", torch.nn.Sigmoid()),
        ]
        for stage_name, stage in tail:
            self.model.add_module(stage_name, stage)
        self.model.cuda()

    def adjust(self, X):
        """Swap dims 1 and 2 so the feature axis becomes the Conv1d channel axis."""
        return X.permute(0, 2, 1)

    def fit(self, X, y):
        """Delegate to ``BookingClassifier.fit``."""
        return super().fit(X, y)

## Uczenie i wyniki

In [None]:
# X, y = load_embeddings(EMBEDDING_FILES[3][0], max_samples_per_class=5000)


In [None]:
# X.shape

In [None]:
# model = RNN(**MODELS_CONFIG["LSTM"][1])
# cv_results = cross_validate(
#             model, X, y, 
#             cv=StratifiedKFold(n_splits=10, random_state=1, shuffle=True), 
#             scoring={
#                 "accuracy": "accuracy",
#                 'f1_weighted':'f1_weighted',
#                 'recall':'recall',
#                 # "memory": forward_memory_scorer
#             },
#             n_jobs=-1
#         )
# model.fit(X, y)

# cv_results

In [None]:
# print(model.predict(X[0:2000]), y[0:2000])
# print(y.shape[0])
# len(y[y == 1])

In [None]:
# model.model

In [None]:
def forward_memory_scorer(clf, x, y):
    """Scorer returning the peak memory reported while running ``clf.predict(x)``.

    Args:
        clf: Estimator exposing ``predict``.
        x: Input passed to ``clf.predict``.
        y: Unused; present to match the scorer call signature.

    Returns:
        The maximum of the readings returned by ``memory_usage``, or ``0`` if
        profiling failed (best effort, so one failure does not abort the
        whole evaluation).
    """
    try:
        return max(memory_usage((clf.predict, [x], {})))
    except Exception as exc:
        # Only ordinary errors are swallowed; KeyboardInterrupt and SystemExit
        # still propagate so a long run can be cancelled.
        warnings.warn(f"forward_memory_scorer failed: {exc!r}", RuntimeWarning)
        return 0



In [None]:
# Result keys plotted per metric further down ("Forward memory" is currently disabled).
metrics = [
    "Accuracy",
    "F1",
    "Recall",
    "Forward time",
    "Fit time",
]

def run_test(model_cls, params, max_samples_per_class=5000):
    """Tune and cross-validate one model class on every embedding file.

    For each entry of ``EMBEDDING_FILES`` the hyper-parameters are selected
    with a successive-halving grid search (3-fold, accuracy), and the selected
    configuration is then evaluated with 5-fold cross-validation.

    Args:
        model_cls: Estimator class; instantiated without arguments for the
            search and with the selected parameters for the evaluation.
        params: Hyper-parameter grid (name -> list of candidate values). It is
            not modified; ``in_dim`` is added to a copy.
        max_samples_per_class: Number of samples loaded per class from each
            embedding file.

    Returns:
        Dict mapping the embedding display name to a dict with per-fold lists
        for ``"Accuracy"``, ``"F1"``, ``"Recall"``, ``"Forward time"`` and
        ``"Fit time"``, plus ``"params"`` holding the selected
        hyper-parameters.
    """
    results = {}
    for path, name in tqdm(EMBEDDING_FILES):
        X, y = load_embeddings(path, max_samples_per_class=max_samples_per_class)
        print("loaded ", X.shape)
        # Extend a copy of the grid so the caller's dict (shared across
        # datasets and stored in MODELS_CONFIG) is left untouched.
        search_grid = {**params, "in_dim": [X.shape[-1]]}
        best_params = HalvingGridSearchCV(
            model_cls(),
            search_grid,
            resource='n_samples',
            scoring='accuracy',
            random_state=1,
            cv=StratifiedKFold(n_splits=3, random_state=1, shuffle=True),
            verbose=1,
            n_jobs=-1,
            # Only the winning parameters are needed; the model is refit by
            # cross_validate below.
            refit=False,
        ).fit(X, y).best_params_

        model = model_cls(**best_params)
        cv_results = cross_validate(
            model, X, y,
            cv=StratifiedKFold(n_splits=5, random_state=2, shuffle=True),
            scoring={
                "accuracy": "accuracy",
                'f1_weighted': 'f1_weighted',
                'recall': 'recall',
                # "memory": forward_memory_scorer
            },
            n_jobs=3
        )
        print(cv_results)
        results[name] = {
            "Accuracy": cv_results["test_accuracy"].tolist(),
            "F1": cv_results["test_f1_weighted"].tolist(),
            "Recall": cv_results["test_recall"].tolist(),
            # "Forward memory": cv_results["test_memory"].tolist(),
            "Forward time": cv_results["score_time"].tolist(),
            "Fit time": cv_results["fit_time"].tolist(),
            # Record the configuration that was actually evaluated, not the
            # whole search grid.
            "params": dict(best_params),
        }
    return results

In [None]:
# Model name -> (estimator class, hyper-parameter grid handed to run_test).
# The "lin_hid" candidates ([64, 128]) are disabled for LSTM and Conv.
MODELS_CONFIG = {
    "LSTM": (
        RNN,
        dict(
            epochs=[20, 50],
            layers=[1, 3],
            hid_dim=[50, 100],
            bi=[True],
            dropout=[0.2, 0.4],
        ),
    ),
    "Conv": (
        Conv,
        dict(
            epochs=[20, 50],
            layers=[1, 3],
            hid_ch=[40, 60],
            kernel_size=[2, 4],
        ),
    ),
    "MLP": (
        MLP,
        dict(
            epochs=[20, 50],
            layers=[1, 2, 3, 5],
            hid_dim=[30, 50, 100],
        ),
    ),
}

# MODELS_CONFIG = {
#     "MLP": (MLP, {
#         "epochs": 50,
#         "layers":  3,
#         "hid_dim": 50
#     }),
#     "Conv": (Conv, {
#         "epochs": 50,
#         "layers": 1,
#         "hid_ch": 60,
#         "kernel_size": 4,
#         # "lin_hid": [64, 128],
#     }),
#     "LSTM": (RNN, {
#         "epochs": 50,
#         "layers": 1,
#         "hid_dim": 50,
#         # "lin_hid": [64, 128],
#         "bi": True, 
#         "dropout": 0.3
#     }),
    
# }

In [None]:
# results = {name: run_test(config[0], config[1]) for name, config in MODELS_CONFIG.items()}

# Run every configured model; the accumulated results are re-written to disk
# after each model finishes.
results = {}
# os.system(f'rm -rf {log_dir}*')
for name, config in MODELS_CONFIG.items():
    model_cls, param_grid = config[0], config[1]
    results[name] = run_test(model_cls, param_grid)
    with open(os.path.join(BASE_DIR, 'results_3.json'), 'w') as f:
        json.dump(results, f)

  0%|          | 0/5 [00:00<?, ?it/s]

loaded  (10000, 128, 100)
{'epochs': 50, 'layers': 3, 'hid_dim': 50, 'in_dim': 100}


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'fit_time': array([57.75106597, 58.1938951 , 58.23615885, 58.49484515, 58.37687635]), 'score_time': array([0.03720212, 0.03081107, 0.03192163, 0.03299403, 0.03438377]), 'test_accuracy': array([0.896, 0.836, 0.919, 0.901, 0.907]), 'test_f1_weighted': array([0.89501217, 0.83174638, 0.918554  , 0.90015491, 0.90629071]), 'test_recall': array([0.799, 0.677, 0.845, 0.809, 0.82 ])}
loaded  (10000, 128, 100)
{'epochs': 50, 'layers': 3, 'hid_dim': 50, 'in_dim': 100}


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'fit_time': array([58.41750717, 58.54174209, 58.43543196, 58.20845747, 60.99621844]), 'score_time': array([0.03271914, 0.02921176, 0.02915168, 0.02804685, 0.03352857]), 'test_accuracy': array([0.905 , 0.9115, 0.913 , 0.91  , 0.915 ]), 'test_f1_weighted': array([0.90422422, 0.91096523, 0.91243961, 0.90936045, 0.91449303]), 'test_recall': array([0.815, 0.834, 0.833, 0.826, 0.838])}
loaded  (10000, 128, 100)
{'epochs': 50, 'layers': 3, 'hid_dim': 50, 'in_dim': 100}


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'fit_time': array([67.20361066, 61.63337541, 62.86663437, 59.31912255, 59.28322363]), 'score_time': array([0.02690101, 0.03135157, 0.02685094, 0.03008795, 0.02682066]), 'test_accuracy': array([0.826 , 0.7975, 0.818 , 0.841 , 0.836 ]), 'test_f1_weighted': array([0.82125081, 0.79036394, 0.81238487, 0.83738989, 0.8318554 ]), 'test_recall': array([0.663, 0.613, 0.645, 0.692, 0.679])}
loaded  (10000, 128, 100)
{'epochs': 50, 'layers': 3, 'hid_dim': 50, 'in_dim': 100}


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'fit_time': array([58.93593454, 58.98741674, 59.30282021, 61.23019361, 62.14673257]), 'score_time': array([0.03446364, 0.03371763, 0.02782083, 0.03517985, 0.03107738]), 'test_accuracy': array([0.8335, 0.861 , 0.8455, 0.833 , 0.831 ]), 'test_f1_weighted': array([0.8292647 , 0.85889987, 0.84225161, 0.828556  , 0.82627021]), 'test_recall': array([0.676, 0.739, 0.702, 0.672, 0.666])}
loaded  (10000, 128, 100)
{'epochs': 50, 'layers': 3, 'hid_dim': 50, 'in_dim': 100}


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'fit_time': array([60.82814598, 59.63933301, 59.46607804, 59.09530354, 59.05023241]), 'score_time': array([0.0288074 , 0.03392267, 0.03332829, 0.02849174, 0.03546238]), 'test_accuracy': array([0.7915, 0.81  , 0.7345, 0.8   , 0.8045]), 'test_f1_weighted': array([0.78285768, 0.80420768, 0.71517146, 0.79345586, 0.79745503]), 'test_recall': array([0.592, 0.638, 0.474, 0.622, 0.618])}


  0%|          | 0/5 [00:00<?, ?it/s]

loaded  (10000, 128, 100)
{'epochs': 50, 'layers': 1, 'hid_ch': 60, 'kernel_size': 4, 'in_dim': 100}


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'fit_time': array([55.55399656, 54.81396866, 55.49070883, 55.02548933, 55.51215911]), 'score_time': array([0.04138255, 0.04730701, 0.04170561, 0.04394746, 0.04628086]), 'test_accuracy': array([0.863 , 0.8815, 0.8655, 0.853 , 0.867 ]), 'test_f1_weighted': array([0.8607541 , 0.88016622, 0.86351825, 0.85031221, 0.86508953]), 'test_recall': array([0.736, 0.776, 0.745, 0.719, 0.748])}
loaded  (10000, 128, 100)
{'epochs': 50, 'layers': 1, 'hid_ch': 60, 'kernel_size': 4, 'in_dim': 100}


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'fit_time': array([55.30697584, 56.18122792, 55.49352002, 55.28658247, 55.32499051]), 'score_time': array([0.04086161, 0.04014087, 0.04255724, 0.04220223, 0.04277873]), 'test_accuracy': array([0.894 , 0.905 , 0.8685, 0.898 , 0.8785]), 'test_f1_weighted': array([0.89301402, 0.90429214, 0.86646326, 0.89707107, 0.87694255]), 'test_recall': array([0.798, 0.819, 0.745, 0.803, 0.766])}
loaded  (10000, 128, 100)
{'epochs': 50, 'layers': 1, 'hid_ch': 60, 'kernel_size': 4, 'in_dim': 100}


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'fit_time': array([54.90241027, 54.77647042, 54.49123049, 54.20304441, 56.02287221]), 'score_time': array([0.04299998, 0.04518127, 0.04286528, 0.04009438, 0.04123449]), 'test_accuracy': array([0.8055, 0.745 , 0.787 , 0.7535, 0.803 ]), 'test_f1_weighted': array([0.80093092, 0.73062593, 0.7792817 , 0.74029388, 0.79662858]), 'test_recall': array([0.654, 0.514, 0.6  , 0.528, 0.626])}
loaded  (10000, 128, 100)
{'epochs': 50, 'layers': 1, 'hid_ch': 60, 'kernel_size': 4, 'in_dim': 100}


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'fit_time': array([56.64195848, 56.80411434, 56.76825857, 56.16349006, 56.23916912]), 'score_time': array([0.04307818, 0.044029  , 0.04452586, 0.04176235, 0.03899693]), 'test_accuracy': array([0.8355, 0.8485, 0.8545, 0.8115, 0.7985]), 'test_f1_weighted': array([0.83142378, 0.84526918, 0.85156995, 0.80485729, 0.79040715]), 'test_recall': array([0.68 , 0.704, 0.714, 0.627, 0.602])}
loaded  (10000, 128, 100)
{'epochs': 50, 'layers': 1, 'hid_ch': 60, 'kernel_size': 4, 'in_dim': 100}


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'fit_time': array([52.76034212, 34.05209684, 33.73370934, 34.62857056, 34.11960411]), 'score_time': array([0.04149628, 0.03711557, 0.03895593, 0.04199862, 0.04304338]), 'test_accuracy': array([0.7055, 0.5   , 0.5   , 0.5   , 0.5   ]), 'test_f1_weighted': array([0.6842364 , 0.33333333, 0.33333333, 0.33333333, 0.33333333]), 'test_recall': array([0.446, 1.   , 1.   , 1.   , 1.   ])}


  0%|          | 0/5 [00:00<?, ?it/s]

loaded  (10000, 128, 100)
{'epochs': 50, 'layers': 1, 'hid_dim': 50, 'bi': True, 'dropout': 0.3, 'in_dim': 100}


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'fit_time': array([82.74716473, 82.24817824, 82.3700738 , 83.17861676, 82.6752007 ]), 'score_time': array([0.05300355, 0.05265856, 0.05610347, 0.05535173, 0.05507278]), 'test_accuracy': array([0.929 , 0.9215, 0.927 , 0.917 , 0.928 ]), 'test_f1_weighted': array([0.92888622, 0.92127467, 0.92699117, 0.91663687, 0.9279262 ]), 'test_recall': array([0.889, 0.868, 0.916, 0.851, 0.896])}
loaded  (10000, 128, 100)
{'epochs': 50, 'layers': 1, 'hid_dim': 50, 'bi': True, 'dropout': 0.3, 'in_dim': 100}


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'fit_time': array([82.9241116 , 83.22846603, 84.49794698, 83.52485991, 84.15824366]), 'score_time': array([0.05283618, 0.05227304, 0.05471253, 0.05224991, 0.05832601]), 'test_accuracy': array([0.498 , 0.5005, 0.4995, 0.505 , 0.5165]), 'test_f1_weighted': array([0.33681308, 0.34140068, 0.34008216, 0.34943664, 0.37600806]), 'test_recall': array([0.005, 0.009, 0.008, 0.016, 0.042])}
loaded  (10000, 128, 100)
{'epochs': 50, 'layers': 1, 'hid_dim': 50, 'bi': True, 'dropout': 0.3, 'in_dim': 100}


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'fit_time': array([85.17907071, 87.63726687, 86.01248765, 84.1522212 , 84.11641693]), 'score_time': array([0.05231547, 0.05578613, 0.05991125, 0.05692649, 0.0598495 ]), 'test_accuracy': array([0.905 , 0.5005, 0.8765, 0.498 , 0.5025]), 'test_f1_weighted': array([0.90471175, 0.34478377, 0.8751887 , 0.33681308, 0.34488595]), 'test_recall': array([0.85 , 0.013, 0.774, 0.005, 0.012])}
loaded  (10000, 128, 100)
{'epochs': 50, 'layers': 1, 'hid_dim': 50, 'bi': True, 'dropout': 0.3, 'in_dim': 100}


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'fit_time': array([83.66513848, 84.04511189, 83.98623157, 84.57212281, 84.30518556]), 'score_time': array([0.05260682, 0.05551863, 0.05781579, 0.05427361, 0.05536342]), 'test_accuracy': array([0.5   , 0.501 , 0.5025, 0.499 , 0.5025]), 'test_f1_weighted': array([0.33510285, 0.34163307, 0.33973711, 0.33552967, 0.34403771]), 'test_recall': array([0.002, 0.009, 0.006, 0.003, 0.011])}
loaded  (10000, 128, 100)
{'epochs': 50, 'layers': 1, 'hid_dim': 50, 'bi': True, 'dropout': 0.3, 'in_dim': 100}


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'fit_time': array([84.46912384, 84.81758523, 84.05219436, 84.18989205, 83.94679832]), 'score_time': array([0.05664134, 0.05437088, 0.0549953 , 0.06041694, 0.05587411]), 'test_accuracy': array([0.497 , 0.4975, 0.501 , 0.4945, 0.5   ]), 'test_f1_weighted': array([0.3346244 , 0.34000357, 0.34333465, 0.3369133 , 0.33772641]), 'test_recall': array([0.003, 0.009, 0.011, 0.007, 0.005])}


In [None]:
# Reload previously saved results.
# NOTE(review): this reads 'results_2.json' while the training cell writes
# 'results_3.json' — confirm this is intended.
with open(os.path.join(BASE_DIR, 'results_2.json'), 'r') as f:
    results = json.load(f)
print(results)

{'MLP': {'Booking CBOW': {'Accuracy': [0.896, 0.836, 0.919, 0.901, 0.907], 'F1': [0.8950121695028523, 0.8317463802388174, 0.9185540017133825, 0.9001549111681271, 0.9062907144174255], 'Recall': [0.799, 0.677, 0.845, 0.809, 0.82], 'Forward time': [0.03720211982727051, 0.030811071395874023, 0.0319216251373291, 0.03299403190612793, 0.03438377380371094], 'Fit time': [57.75106596946716, 58.19389510154724, 58.23615884780884, 58.494845151901245, 58.37687635421753], 'params': {'epochs': 50, 'layers': 3, 'hid_dim': 50, 'in_dim': 100}}, 'Booking Skip-Gram': {'Accuracy': [0.905, 0.9115, 0.913, 0.91, 0.915], 'F1': [0.9042242161508217, 0.9109652349423725, 0.9124396135265701, 0.9093604473162636, 0.9144930291699486], 'Recall': [0.815, 0.834, 0.833, 0.826, 0.838], 'Forward time': [0.03271913528442383, 0.029211759567260742, 0.02915167808532715, 0.028046846389770508, 0.03352856636047363], 'Fit time': [58.41750717163086, 58.54174208641052, 58.43543195724487, 58.208457469940186, 60.99621844291687], 'params

In [None]:
# write_image below does not create missing directories, and FIGURES_DIR is
# only defined (never created) in the configuration cell.
os.makedirs(FIGURES_DIR, exist_ok=True)

# One figure per metric, with one box per (dataset, model) pair.
for metric in metrics:
    fig = go.Figure(
        layout=go.Layout(
            template='presentation+none',
            title=dict(text=f"{metric}"),
            barmode='overlay',
            # Extra bottom margin for the rotated tick labels.
            margin=dict(b=240),
            xaxis=dict(tickangle=90),
        )
    )
    traces = []
    for model, res in results.items():
        for name, data in res.items():
            n = f"{name}_{model}"
            traces.append((n, go.Box(y=data[metric], name=n, boxpoints='outliers', showlegend=False)))
    # Order the boxes alphabetically by label, grouping each dataset's models.
    traces.sort(key=lambda k: k[0])
    for _, trace in traces:
        fig.add_trace(trace)
    fig.show(renderer='colab')
    fig.write_image(os.path.join(FIGURES_DIR, f"L3_{metric}_2.png"))

In [None]:
# write_image below does not create missing directories, and FIGURES_DIR is
# only defined (never created) in the configuration cell.
os.makedirs(FIGURES_DIR, exist_ok=True)

# One table per model: a row per dataset, a column per hyper-parameter.
for model_name, res in results.items():
    # Collect parameter names in first-seen order across all datasets, so the
    # columns stay aligned by name even if the rows do not share the same keys.
    param_names = []
    for r in res.values():
        for n in r['params']:
            if n not in param_names:
                param_names.append(n)

    headers = ["Dataset"] + param_names
    # go.Table expects column-major data: one list per column.
    vals = [list(res.keys())] + [
        [r['params'].get(n, "") for r in res.values()]
        for n in param_names
    ]
    fig = go.Figure(
        data=[go.Table(header=dict(values=headers), cells=dict(values=vals))],
        layout=go.Layout(
            template='presentation+none',
            title=dict(text=f"Params: {model_name}"),
            barmode='overlay',
        )
    )
    fig.show(renderer='colab')
    fig.write_image(os.path.join(FIGURES_DIR, f"L3_{model_name}_params_2.png"))
