<a href="https://colab.research.google.com/github/Krankile/ensemble_forecasting/blob/main/notebooks/weight_net/2_fit_weight_net.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Setup

**Note:** Data set classes expect data to be normalized

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%%capture
!pip install wandb

Go here to find wandb API key:

[https://wandb.ai/settings](https://wandb.ai/settings)

In [3]:
import wandb as wb
wb.login()


[34m[1mwandb[0m: Currently logged in as: [33mkrankile[0m (use `wandb login --relogin` to force relogin)


True

In [4]:
%%capture
!git clone https://github.com/Krankile/ensemble_forecasting.git
!mv ensemble_forecasting ef

In [5]:
%%capture
!cd ef && git pull

In [6]:
import os
import copy
import math
import random
from multiprocessing import cpu_count
from pathlib import Path
from collections import namedtuple
from functools import partial

import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

import torch
from torch import nn, optim
import torch.nn.functional as F

from ef.models import weightnets
from ef.utils import loss_functions, activations, optimizers, schedulers, scalers

from ef.data import ensemble_loaders, ensemble_loaders_kfold

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [8]:
def artifact_to_path(run, art_name, *, root="krankile/data-processing/"):
    art = run.use_artifact(root + art_name); art.download()
    return art.file()

# Debug area

# Training loop

## Normal train-val split

In [9]:
def train_model(model, train_loader, val_loader, num_examples, conf):    
    batch_size = conf.batch_size

    optimizer = optimizers[conf.optimizer](model.parameters(), lr=conf.learning_rate, weight_decay=conf.weight_decay)
    scheduler = schedulers[conf.schedule](
        optimizer, conf.learning_rate,
        epochs=conf.epochs,
        steps_per_epoch=math.ceil(num_examples / batch_size),
    )

    loss_func = loss_functions[conf.loss_func]
    it = tqdm(range(1, conf.epochs+1))
    
    best_loss = float("inf")
    step = 0

    for epoch in it:

        #Each epoch has a training and validation phase
        train_losses = []
        val_losses = []
        for phase in ['train','val']:
            if phase == 'train':
                model.train()  # Set model to training mode
                batches = train_loader
            else:
                model.eval()  # Set model to evaluate mode
                batches = val_loader
            for i, tensors in enumerate(batches):
                cats, inputs, forecasts, actuals, *loss_args = map(lambda x: x.to(device), tensors)
                optimizer.zero_grad()

                y_pred = model(cats, inputs.float()).unsqueeze(2)

                prediction = torch.matmul(forecasts, y_pred).squeeze(2)
                loss = loss_func(prediction, actuals, *loss_args)
                if phase == 'train':
                    train_losses.append(loss.item())
                    loss.backward()

                    optimizer.step()
                    scheduler.step()
                    
                    step += 1
                else:
                    val_losses.append(loss.item())

        train_loss = np.mean(train_losses)
        val_loss = np.mean(val_losses)

        if val_loss < best_loss: 
            best_loss = val_loss
            best_model_wts = copy.deepcopy(model.state_dict())

            filepath = "model.pth"
            torch.save(best_model_wts, filepath)
            wb.save(filepath)

        wb.log({"train_loss": train_loss, "val_loss": val_loss, "epoch": epoch, "best_loss": best_loss, "n_examples":batch_size*step, "lr": optimizer.param_groups[0]["lr"]}, step=step)
        it.set_postfix({"train_loss": train_loss, "val_loss": val_loss, "best_loss": best_loss, "lr": f'{optimizer.param_groups[0]["lr"]:.2e}'})
        
    model.load_state_dict(best_model_wts)
    return model.eval()

In [10]:
def train(config=None, project=None, entity=None, enablewb=True):
    mode = "online" if enablewb else "online"
    with wb.init(config=config, project=project, entity=entity, job_type="training", mode=mode) as run:
        conf = run.config
        print(conf)

        datapath = artifact_to_path(run, conf.data)
        splitpath = artifact_to_path(run, conf.data_split)

        (
            train_loader,
            val_loader,
            emb_dims,
            num_cont,
            num_examples,
        ) = ensemble_loaders(
                    datapath=datapath, splitpath=splitpath,
                    batch_size=conf.batch_size,
                    feature_set=conf.feature_set,
                    n_models=conf.num_models,)
        
        model = weightnets[conf.architecture](
            num_cont=num_cont,
            out_size=conf.num_models,
            n_hidden=conf.n_hidden,
            hidden_dim=conf.hidden_dim,
            dropout=conf.dropout,
            bn=conf.bn,
            activation=conf.act,
            emb_dims=emb_dims,
        )

        print(f"Moving model to device: {device}")
        model = model.float().to(device)

        model = train_model(
            model,
            train_loader,
            val_loader,
            num_examples,
            conf=conf,
        )
    return model
    

## Train with k-fold cross-validation

In [32]:
def train_model_kfold(model, train_loader, val_loader, num_examples, conf, fold_num):    
    batch_size = conf.batch_size

    optimizer = optimizers[conf.optimizer](model.parameters(), lr=conf.learning_rate, weight_decay=conf.weight_decay)
    scheduler = schedulers[conf.schedule](
        optimizer, conf.learning_rate,
        epochs=conf.epochs,
        steps_per_epoch=math.ceil(num_examples / batch_size),
    )

    loss_func = loss_functions[conf.loss_func]
    it = tqdm(range(1, conf.epochs+1), desc=f"Fold {fold_num+1} of {conf.k}")
    
    for epoch in it:

        #Each epoch has a training and validation phase
        train_losses = []
        val_losses = []
        for phase in ['train','val']:
            if phase == 'train':
                model.train()  # Set model to training mode
                batches = train_loader
            else:
                model.eval()  # Set model to evaluate mode
                batches = val_loader
            for i, tensors in enumerate(batches):
                cats, inputs, forecasts, actuals, *loss_args = map(lambda x: x.to(device), tensors)
                optimizer.zero_grad()

                y_pred = model(cats, inputs.float()).unsqueeze(2)

                prediction = torch.matmul(forecasts, y_pred).squeeze(2)
                loss = loss_func(prediction, actuals, *loss_args)
                if phase == 'train':
                    train_losses.append(loss.item())
                    loss.backward()

                    optimizer.step()
                    scheduler.step()
                else:
                    val_losses.append(loss.item())

        train_loss = np.mean(train_losses)
        val_loss = np.mean(val_losses)

        wb.log({f"train_loss/{fold_num}": train_loss, f"val_loss/{fold_num}": val_loss, "epoch": epoch, "lr": optimizer.param_groups[0]['lr']})
        it.set_postfix({"train_loss": train_loss, "val_loss": val_loss, "lr": f"{optimizer.param_groups[0]['lr']:.2e}"})

    return val_loss

In [33]:
def standardize(df, scaler=None):
    feats = df.loc[:, "x_acf1":"lstm_31"]
    if scaler is None:
        scaler = StandardScaler().fit(feats)

    index, columns = feats.index, feats.columns
    df.loc[:, "x_acf1":"lstm_31"] = pd.DataFrame(scaler.transform(feats), index=index, columns=columns)

    return df, scaler

In [40]:
def train_kfold(config=None, project=None, entity=None, enablewb=True):
    mode = "online" if enablewb else "online"
    with wb.init(config=config, project=project, entity=entity, job_type="training", mode=mode) as run:
        conf = run.config
        print(conf)

        rnd_seed = np.random.randint(1e9)
        run.log({"random_seed": rnd_seed})
        datapath = artifact_to_path(run, conf.data)

        df = pd.read_feather(datapath).set_index("m4id")
        
        outer_losses = []
        for s, seed in enumerate([69, 420, 666]):
            df = shuffle(df, random_state=seed)
            folds = np.array_split(df, conf.k)
            losses = []
            for i, val in enumerate(folds, start=(s*conf.k)):
                data = pd.concat(folds[:i] + folds[(i+1):], axis=0)
                data, scaler = standardize(data, scaler=None)
                val, _ = standardize(val, scaler=scaler)

                (
                    train_loader,
                    val_loader,
                    emb_dims,
                    num_cont,
                    num_examples,
                ) = ensemble_loaders_kfold(
                            data=data, val=val,
                            batch_size=conf.batch_size,
                            feature_set=conf.feature_set,
                            n_models=conf.num_models,
                            cpus=None,)

                torch.manual_seed(rnd_seed)
                model = weightnets[conf.architecture](
                    num_cont=num_cont,
                    out_size=conf.num_models,
                    n_hidden=conf.n_hidden,
                    hidden_dim=conf.hidden_dim,
                    dropout=conf.dropout,
                    bn=conf.bn,
                    activation=conf.act,
                    emb_dims=emb_dims,
                )

                print(f"Moving model to device: {device}")
                model = model.float().to(device)

                loss = train_model_kfold(
                    model,
                    train_loader,
                    val_loader,
                    num_examples,
                    conf=conf,
                    fold_num=i,
                )

                losses.append(loss)

        overall_loss = np.mean(losses)
        run.log({"overall_loss": overall_loss})

    return overall_loss

## Run config

### Normal config

In [36]:
norm_config = dict(
    k=None,
    epochs=20,
    hidden_dim=256,
    n_hidden=2,
    learning_rate=2e-3,
    optimizer="adamw",
    architecture="WeightNetV4",
    data="ensemble_traval:standard",
    data_split="traval_split_80_20:v0",
    batch_size=1024,
    loss_func="owa",
    dropout=0.5,
    weight_decay=0.05,
    bn=False,
    feature_set="ma",
    act="leaky",
    num_models=14,
    schedule=None,
)

### K-fold config

In [37]:
kfold_config = dict(
    k=5,
    epochs=5,
    hidden_dim=256,
    n_hidden=2,
    learning_rate=2e-3,
    optimizer="adamw",
    architecture="WeightNetV4",
    data="ensemble_traval:non-standard",
    batch_size=1024,
    loss_func="owa",
    dropout=0.5,
    weight_decay=0.05,
    bn=False,
    feature_set="ma",
    act="leaky",
    num_models=14,
    schedule=None,
)

## Start run

In [None]:
sweepid = "krankile/weight-net/4r44tbf3"
enablewb = True
project = "weight-net"
usecv = True

train_func, config = (train_kfold, kfold_config) if usecv else (train, norm_config)

if sweepid:
    count = 500 # number of runs to execute
    wb.agent(sweepid, function=partial(train_func, config=config), count=count)
else:
    res = train_func(config=config, project=project, entity="krankile", enablewb=enablewb)

[34m[1mwandb[0m: Agent Starting Run: 2h2oz37b with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	dropout: 0.4495462534810624
[34m[1mwandb[0m: 	epochs: 11
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.0022238325304523507
[34m[1mwandb[0m: 	n_hidden: 2
[34m[1mwandb[0m: 	weight_decay: 0.032496945846470844


{'batch_size': 128, 'dropout': 0.4495462534810624, 'epochs': 11, 'hidden_dim': 64, 'learning_rate': 0.0022238325304523507, 'n_hidden': 2, 'weight_decay': 0.032496945846470844, 'k': 5, 'optimizer': 'adamw', 'architecture': 'WeightNetV4', 'data': 'ensemble_traval:non-standard', 'data_split': 'traval_split_80_20:v0', 'loss_func': 'owa', 'bn': False, 'feature_set': 'ma', 'act': 'leaky', 'num_models': 14, 'schedule': None}


[34m[1mwandb[0m: Downloading large artifact ensemble_traval:non-standard, 119.61MB. 1 files... Done. 0:0:0


CPU count: 2
Loaded df of shape (79996, 801)
Loaded df of shape (19999, 801)
Moving model to device: cuda


Fold 1 of 5: 100%|██████████| 11/11 [01:30<00:00,  8.25s/it, train_loss=0.767, val_loss=0.775, lr=2.22e-03]


CPU count: 2
Loaded df of shape (79996, 801)
Loaded df of shape (19999, 801)
Moving model to device: cuda


Fold 2 of 5: 100%|██████████| 11/11 [01:31<00:00,  8.36s/it, train_loss=0.771, val_loss=0.777, lr=2.22e-03]


CPU count: 2
Loaded df of shape (79996, 801)
Loaded df of shape (19999, 801)
Moving model to device: cuda


Fold 3 of 5: 100%|██████████| 11/11 [01:29<00:00,  8.15s/it, train_loss=0.773, val_loss=0.776, lr=2.22e-03]


CPU count: 2
Loaded df of shape (79996, 801)
Loaded df of shape (19999, 801)
Moving model to device: cuda


Fold 4 of 5: 100%|██████████| 11/11 [01:32<00:00,  8.44s/it, train_loss=0.773, val_loss=0.778, lr=2.22e-03]


CPU count: 2
Loaded df of shape (79996, 801)
Loaded df of shape (19999, 801)
Moving model to device: cuda


Fold 5 of 5: 100%|██████████| 11/11 [01:32<00:00,  8.39s/it, train_loss=0.77, val_loss=0.824, lr=2.22e-03]


CPU count: 2
Loaded df of shape (99995, 801)
Loaded df of shape (19999, 801)
Moving model to device: cuda


Fold 6 of 5: 100%|██████████| 11/11 [01:51<00:00, 10.14s/it, train_loss=0.767, val_loss=0.759, lr=2.22e-03]


CPU count: 2
Loaded df of shape (99995, 801)
Loaded df of shape (19999, 801)
Moving model to device: cuda


Fold 7 of 5:  73%|███████▎  | 8/11 [01:20<00:30, 10.13s/it, train_loss=0.774, val_loss=0.772, lr=2.22e-03]