https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads/c24b93738bc036c1b66d0387555bf69a/hyperparameter_tuning_tutorial.ipynb#scrollTo=AJhAsV0FqHcc

https://docs.ray.io/en/latest/tune/api_docs/schedulers.html

In [1]:
from collections import Counter
from functools import partial
import numpy as np
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.stopper import ExperimentPlateauStopper
import random
from sklearn.metrics import f1_score, accuracy_score
import spacy
import copy
SEED = 42

In [2]:
def set_seed(seed):
    """
    Fixa semente aleatória para garantir que os resultados possam ser reproduzidos
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    np.random.seed(seed)
    random.seed(seed)
    
def tokenize (text, tok):
    return [token.text for token in tok.tokenizer(text)]

def encode_sentence(text, vocab2index, tok, N=1000):
    tokenized = tokenize(text, tok)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [3]:
class load_data(torch.utils.data.Dataset):
    """
    Classe auxiliar para utilização do DataLoader
    """
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [4]:
class LSTM(nn.Module):
    """
    Bidirectional LSTM running over word embeddings.
    """
    def __init__(self, embedding_matrix, word_embedding_dimension: int, hidden_dim: int, num_layers: int = 1, num_classes: int = 13, vocab_size: int = 0, dropout: float = 0, bidirectional: bool = True):
        nn.Module.__init__(self)
        self.config_keys = ['word_embedding_dimension', 'hidden_dim', 'num_layers', 'dropout', 'bidirectional']
        self.word_embedding_dimension = word_embedding_dimension
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.bidirectional = bidirectional
        self.vocab_size = vocab_size

        self.embeddings_dimension = hidden_dim
        if self.bidirectional:
            self.embeddings_dimension *= 2
            
        self.embeddings = nn.Embedding(vocab_size, word_embedding_dimension, padding_idx=0)
        self.embeddings.weight=nn.Parameter(torch.tensor(embedding_matrix,dtype=torch.float32))
        self.embeddings.weight.requires_grad=False
        self.dropout = nn.Dropout(dropout)
        self.encoder = nn.LSTM(word_embedding_dimension, int(word_embedding_dimension/2), num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(int(word_embedding_dimension/2), num_classes)

    def forward(self, features, sentence_length):
        x = self.embeddings(features)
        x = self.dropout(x)
        out_pack, (ht, ct) = self.encoder(x)
        ht[-1] = self.dropout(ht[-1])
        out = self.linear(ht[-1])
        return out

    """def get_word_embedding_dimension(self) -> int:
        return self.embeddings_dimension

    def tokenize(self, text: str) -> List[int]:
        raise NotImplementedError()

    def save(self, output_path: str):
        with open(os.path.join(output_path, 'lstm_config.json'), 'w') as fOut:
            json.dump(self.get_config_dict(), fOut, indent=2)

        torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin'))

    def get_config_dict(self):
        return {key: self.__dict__[key] for key in self.config_keys}

    @staticmethod
    def load(input_path: str):
        with open(os.path.join(input_path, 'lstm_config.json'), 'r') as fIn:
            config = json.load(fIn)

        weights = torch.load(os.path.join(input_path, 'pytorch_model.bin'))
        model = LSTM(**config)
        model.load_state_dict(weights)
        return model
    """

In [5]:
def split_data(df_data, fold):
    X = df_data.loc[df_data['fold'] == fold, 'four_pages_encoded'].values
    y = df_data.loc[df_data['fold'] == fold, 'label_int'].values
    
    return X, y

In [6]:
def encode_data(df_data, config):
    tok = spacy.load('pt_core_news_sm')

    # count frequency of each word
    
    counts = Counter()
    for index, row in df_data.loc[df_data['fold']=="train"].iterrows():
        counts.update(tokenize(row['four_pages_processed'], tok))

    #creating vocabulary
    vocab2index = {"":0, "UNK":1}
    words = ["", "UNK"]
    for word in counts:
        vocab2index[word] = len(words)
        words.append(word)
        
    vocab_size = len(words)

    # encoding
    df_data['four_pages_encoded'] = None
    df_data.loc[df_data['fold'] == 'train','four_pages_encoded'] = df_data.loc[df_data['fold'] == 'train','four_pages_processed'].apply(lambda x: np.array(encode_sentence(x,vocab2index, tok, config["num_terms"] )))
    df_data.loc[df_data['fold'] == 'val','four_pages_encoded'] = df_data.loc[df_data['fold'] == 'val','four_pages_processed'].apply(lambda x: np.array(encode_sentence(x,vocab2index, tok, config["num_terms"] )))
    df_data.loc[df_data['fold'] == 'test','four_pages_encoded'] = df_data.loc[df_data['fold'] == 'test','four_pages_processed'].apply(lambda x: np.array(encode_sentence(x,vocab2index, tok, config["num_terms"] )))
    
    return df_data, vocab_size, vocab2index

In [7]:
def calculate_metrics(y_true, y_pred):
    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_weighted = f1_score(y_true, y_pred, average='weighted')
    acc = accuracy_score(y_true, y_pred)
    
    return acc, f1_macro, f1_weighted

In [8]:
def data_load(config, df_data):
    # Split data
    X_train, y_train = split_data(df_data, "train")
    X_val, y_val = split_data(df_data, "val")
    X_test, y_test = split_data(df_data, "test")
    
    # Load dataset
    train_set = load_data(X_train, y_train)
    val_set = load_data(X_val, y_val)
    test_set = load_data(X_test, y_test)
    
    data_loaders = dict()

    data_loaders["train"] = torch.utils.data.DataLoader(
        train_set,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)
    data_loaders["val"] = torch.utils.data.DataLoader(
        val_set,
        batch_size=int(config["batch_size"]),
        shuffle=False,
        num_workers=8)
    data_loaders["test"] = torch.utils.data.DataLoader(
        test_set,
        batch_size=int(config["batch_size"]),
        shuffle=False,
        num_workers=8)
    
    return data_loaders

def create_embedding_matrix(word_index,dimension, emb_model):    
    emb_dir = "/dados01/workspace/ufmg.f01dcc/m03/business_understanding/notebooks/lstm_data/embeddings/"
    if emb_model == "wang2vec":
        df_emb = pd.read_csv(emb_dir+"wang2vec_skip_s600.txt", header=None, sep = " ", index_col=0, skiprows=1)
    elif emb_model == "w2v_jur":
        df_emb = pd.read_csv(emb_dir+"modelo_w2v_vec600_wd10_ct5_tec1.txt", header=None, sep = " ", index_col=0, skiprows=1)
    elif emb_model == "glove":
        df_emb = pd.read_csv(emb_dir+"glove_s600.txt", header=None, sep = " ", index_col=0, skiprows=1)
    elif emb_model == "fastTex":
        df_emb = pd.read_csv(emb_dir+"fasttext_skip_s600.txt", header=None, sep = " ", index_col=0, skiprows=1)
        
    df_emb = df_emb.loc[:, :600]
        

    embedding_dict = {key: val.values for key, val in df_emb.T.items()}
    embedding_matrix=np.zeros((len(word_index)+1,dimension))

    for word,index in word_index.items():
        if word in embedding_dict:
            embedding_matrix[index]=embedding_dict[word]
    return embedding_matrix

In [9]:
def eval_model(model, loader, fold=None, labels_dict = None, device="cpu"):
    model.eval()
    criterion = nn.CrossEntropyLoss()

    steps = 0
    sum_loss = 0.0
    y_true = []
    y_pred = []
    with torch.no_grad():
        for data in loader:
            inputs, labels, sentence_length = data
            inputs, labels = inputs.long().to(device), labels.long().to(device)
            outputs = model(inputs, sentence_length)
            loss = criterion(outputs, labels)
            _, predicted = torch.max(outputs.data, 1)
            
            steps += 1
            sum_loss += loss.cpu().detach().numpy()
            
            y_pred.extend(predicted.cpu().tolist())
            y_true.extend(labels.cpu().tolist())
    
    metrics = [(sum_loss/steps)]
    metrics.extend(calculate_metrics(y_true, y_pred))
    #if fold:
    #    plot_confusion_matrix(y_pred, y_true, fold, labels_dict)
    return metrics

In [10]:
def train_model(config, checkpoint_dir=None, df_data=None):
    df_data, vocab_size, vocab2index = encode_data(df_data, config)       
    config.update({
        "num_classes": df_data['label'].nunique(),
        "vocab_size": vocab_size
    })
    
    data_loaders = data_load(config, df_data)
    embedding_matrix=create_embedding_matrix(vocab2index,dimension=600, emb_model=config["embeddings"])
    
    train_loader = data_loaders["train"]
    val_loader = data_loaders["val"]
    
    model = LSTM(
        embedding_matrix = embedding_matrix,
        word_embedding_dimension = config["embedding_dim"], 
        hidden_dim = config["embedding_dim"], 
        num_classes = config["num_classes"], 
        vocab_size = config["vocab_size"],
        num_layers = config["num_layers"], 
        dropout = config["dropout"], 
        bidirectional = True
    )

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
    model.to(device)
    
    best_model = copy.deepcopy(model)
    best_loss = float("inf")

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config["lr"])

    """if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)"""

    for epoch in range(150):  # loop over the dataset multiple times
        model.train()
        training_loss = 0.0
        epoch_steps = 0
        y_pred = []
        y_true = []
        for i, data in enumerate(train_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels, sentence_length = data
            inputs, labels = inputs.long().to(device), labels.long().to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs, sentence_length)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            epoch_steps += 1
            training_loss += loss.cpu().detach().numpy()
            
            _, predicted = torch.max(outputs.data, 1)
            
            y_pred.extend(predicted.cpu().tolist())
            y_true.extend(labels.cpu().tolist())
            
        train_metrics = [(training_loss/epoch_steps)]
        train_metrics.extend(calculate_metrics(y_true, y_pred))

        val_metrics = eval_model(model, val_loader, device=device)
        
        if val_metrics[0] < best_loss:
            best_loss = val_metrics[0]
            with tune.checkpoint_dir(epoch) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "model.pth")
                torch.save((model.state_dict(), optimizer.state_dict()), path)
        elif val_metrics[0] == np.nan:
            val_metrics[0] = 10
        tune.report(loss=val_metrics[0], accuracy=val_metrics[1], F1_Macro=val_metrics[2], F1_Weighted=val_metrics[3])
    print("Finished Training")

In [None]:
def main(num_samples=10, max_num_epochs=10):
    set_seed(SEED)
    df_data = pd.read_csv("./lstm_data/preprocessed_data_v2_2.csv")

    #load_data(df_data)
    config = {
        "embedding_dim": tune.choice([600]),
        "hidden_dim": tune.choice([150, 300]),
        "dropout": tune.choice([0.2, 0.5]),
        "lr": tune.choice([0.00001, 0.00005]),
        "num_terms": tune.choice([300, 500, 1000]),
        "batch_size": tune.choice([24, 72]),
        "embeddings": tune.choice(["w2v_jur", "glove", "fastTex"]),
        "num_layers": tune.choice([1, 3]),
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=50,
        reduction_factor=2)
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "F1_Macro", "F1_Weighted", "training_iteration"])

    result = tune.run(
        partial(train_model, checkpoint_dir="./lstm_data/models/", df_data=df_data),
        resources_per_trial={"cpu": 0, "gpu": 1},
        stop=ExperimentPlateauStopper(metric="loss", mode='min', patience=10),
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        keep_checkpoints_num=1,
        checkpoint_score_attr="loss",
        verbose = 0
    )

    """best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Val:\\n loss %.3f, accuracy %.3f, F1-Macro %.3f, F1-Weighted %.3f, " % (
        best_trial.last_result["loss"], best_trial.last_result["accuracy"], best_trial.last_result["F1_Macro"], best_trial.last_result["F1_Weighted"]))

    df_data, vocab_size = encode_data(df_data, best_trial.config)
    num_classes = df_data['label'].nunique()
    best_trained_model = LSTM_fixed_len(vocab_size, best_trial.config["embedding_dim"], best_trial.config["embedding_dim"], num_classes, best_trial.config["dropout"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    try:
        model_state, optimizer_state = torch.load(os.path.join(
            best_checkpoint_dir, "checkpoint"))
        best_trained_model.load_state_dict(model_state)

        test_metrics = test_accuracy(best_trained_model, df_data, device)
        print("Test:\\n loss %.3f, accuracy %.3f, F1-Macro %.3f, F1-Weighted %.3f, " % (test_metrics[0], test_metrics[1], test_metrics[2], test_metrics[3]))
    except:
        print("Model not saved, BAD LOSS")"""


if __name__ == "__main__":
    # You can change the number of GPUs per trial here:
    main(num_samples=250, max_num_epochs=100)

2021-09-21 15:24:08,151	INFO registry.py:67 -- Detected unknown callable for trainable. Converting to class.
[2m[36m(pid=26335)[0m   "--node-ip-address",
[2m[36m(pid=26335)[0m   required=True,
[2m[36m(pid=26335)[0m   type=str,
[2m[36m(pid=26335)[0m   "num_layers={}".format(dropout, num_layers))


[2m[36m(pid=26333)[0m   "--node-ip-address",
[2m[36m(pid=26333)[0m   required=True,
[2m[36m(pid=26333)[0m   type=str,
