## Machine Learning Pipeline
This notebook contains the machine learning pipeline of the project. The pipeline consists of the following steps:
1. Load the dataset
2. Split the dataset into train, validation and test set
3. Create the dataset class
4. Create the model class
5. Create the training loop
6. Create the test loop

For the hyperparameter search we use Weights & Biases sweeps.

In [6]:
# load packages
import numpy as np
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from torch.utils import data
import torch.nn as nn
import pandas as pd
import wandb
import pickle

# Log in to your W&B account.
# The API key must never be hard-coded in the notebook: with no arguments
# wandb.login() picks the key up from the WANDB_API_KEY environment variable or
# from the local netrc file, and prompts for it otherwise.
# NOTE(review): the key that used to be committed here should be revoked.
wandb.login()

# Run on the first CUDA device when one is available, otherwise fall back to CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\leona/.netrc


## Controlling the setup

In [7]:
# Display the installed PyTorch version
torch.__version__

'2.0.1'

In [8]:
# The flag below controls whether to allow TF32 on matmul.
torch.backends.cuda.matmul.allow_tf32 = True

# The flag below controls whether to allow TF32 on cuDNN.
torch.backends.cudnn.allow_tf32 = True

# Print the CUDA / cuDNN status of this environment
print(torch.cuda.is_available())
print(torch.backends.cudnn.enabled)

# Querying the device name raises when no CUDA device is present, so only do it
# when CUDA is actually available (the notebook falls back to CPU otherwise).
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("no CUDA device available, running on CPU")



!nvidia-smi

True
True
NVIDIA GeForce RTX 3090
Wed Jun  7 18:49:43 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.61                 Driver Version: 531.61       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090       WDDM | 00000000:01:00.0  On |                  N/A |
|  0%   46C    P8               25W / 350W|   1896MiB / 24576MiB |      4%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                  

In [9]:
# Debugging aid: sets the CUDA_LAUNCH_BLOCKING environment variable to 1 for this kernel.
# NOTE(review): this makes CUDA kernel launches synchronous, which slows training down;
# consider removing it once debugging is over.
%env CUDA_LAUNCH_BLOCKING=1

env: CUDA_LAUNCH_BLOCKING=1


## Load the dataset

In [10]:
# Read the normalized dataset from disk and remember how many rows it contains
df = pd.read_csv("dataset_normalized.csv")
total_rows = len(df)

In [11]:
# Split the dataset by row position into train, validation and test set (70%, 10%, 20%).
# Rows are NOT shuffled: the first 70% of the file is the train set, and so on.
train_end = int(total_rows*0.7)
val_end = int(total_rows*0.8)

# Categorical columns: they are fed to embedding layers instead of being used as numbers
categorical_columns = ['citizenship', 'current_club_id', 'position', 'sub_position', "competitions_id", "clubs_id"]

# `df` itself is left untouched (no re-binding after drop), so this cell can be
# re-run without reloading the CSV.
labels = df["market_value"].values
to_encode = df[categorical_columns]
numeric_df = df.drop(columns=["market_value"] + categorical_columns)

# Categorical part of each split
train_to_encode = to_encode.iloc[:train_end].values
val_to_encode = to_encode.iloc[train_end:val_end].values
test_to_encode = to_encode.iloc[val_end:].values

# Numerical part of each split
train_set = numeric_df.iloc[:train_end].values
val_set = numeric_df.iloc[train_end:val_end].values
test_set = numeric_df.iloc[val_end:].values

# Regression targets of each split
y_train = labels[:train_end]
y_val = labels[train_end:val_end]
y_test = labels[val_end:]

train_set_len = train_set.shape[0]
val_set_len = val_set.shape[0]
test_set_len = test_set.shape[0]

In [12]:
# Report how many rows ended up in each split
split_reports = (
    ("the len of train set is: {}", train_set_len),
    ("the len of validation set is: {}", val_set_len),
    ("the len of test set is: {}", test_set_len),
)
for template, split_len in split_reports:
    print(template.format(split_len))

the len of train set is: 95114
the len of validation set is: 13588
the len of test set is: 27176


## Create the pytorch Dataset

In [13]:
class Dataset(data.Dataset):
    """PyTorch dataset holding numerical features, categorical codes and targets.

    All three inputs are converted to tensors once, directly on the global
    `device`, so indexing never copies data between host and device.
    """
    def __init__(self, x, to_encode, y, length=None):
        """Build the tensors and validate that they describe the same samples.

        Args:
            x: numerical features, converted to a float32 tensor.
            to_encode: categorical codes, converted to an int32 tensor.
            y: regression targets, converted to a float32 tensor.
            length: number of samples exposed by the dataset. Optional: when
                omitted it is taken from the number of rows of `x`.

        Raises:
            ValueError: if `x`, `to_encode` and `y` do not have the same number
                of rows, or if `length` is larger than that number.
        """
        self.to_encode = torch.tensor(to_encode, device=device, dtype=torch.int32)
        self.y = torch.tensor(y, device=device, dtype=torch.float32)
        self.x = torch.tensor(x, device=device, dtype=torch.float32)

        n_rows = self.x.shape[0]
        # Misaligned inputs would silently pair features with the wrong target
        if self.to_encode.shape[0] != n_rows or self.y.shape[0] != n_rows:
            raise ValueError(
                "x, to_encode and y must have the same number of rows, got {}, {} and {}".format(
                    n_rows, self.to_encode.shape[0], self.y.shape[0]))

        if length is None:
            length = n_rows
        elif length > n_rows:
            # Fail here rather than with an IndexError in the middle of an epoch
            raise ValueError("length ({}) exceeds the number of rows ({})".format(length, n_rows))
        self.length = length

    def __len__(self):
        """Denotes the total number of samples"""
        return self.length

    def __getitem__(self, i):
        """Return the (numerical features, categorical codes, target) triple of sample i."""
        return self.x[i], self.to_encode[i], self.y[i]

In [14]:
# One Dataset per split (numerical features, categorical codes, targets, length)
dataset_train = Dataset(train_set, train_to_encode, y_train, train_set_len)
dataset_val = Dataset(val_set, val_to_encode, y_val, val_set_len)
dataset_test = Dataset(test_set, test_to_encode, y_test, test_set_len)

In [15]:
# Input parameters shared by the models and by the sweep configuration
num_input_size = train_set.shape[1]    # number of numerical feature columns
cat_input_size = to_encode.shape[1]    # number of categorical feature columns
total_input_size = num_input_size + cat_input_size

# Vocabulary size of a categorical feature = number of distinct values it takes
# over the whole dataset (all splits together).
# NOTE(review): the embedding layers index by the raw column value, so this presumably
# relies on every column being encoded as contiguous codes 0..n-1 - TODO confirm.
(
    voc_size_citizenship,
    voc_size_current_club_id,
    voc_size_position,
    voc_size_sub_position,
    voc_size_competitions,
    voc_size_clubs,
) = (
    len(to_encode[column].unique())
    for column in ('citizenship', 'current_club_id', 'position', 'sub_position', 'competitions_id', 'clubs_id')
)

# Print them all
size_reports = (
    ("input size of numerical features is: {}", num_input_size),
    ("input size of categorical features is: {}", cat_input_size),
    ("voc size citizenship is: {}", voc_size_citizenship),
    ("voc size current club id is: {}", voc_size_current_club_id),
    ("voc size position is: {}", voc_size_position),
    ("voc size sub position is: {}", voc_size_sub_position),
    ("voc size competitions is: {}", voc_size_competitions),
    ("voc size clubs is: {}", voc_size_clubs),
)
for template, size in size_reports:
    print(template.format(size))

input size of numerical features is: 18
input size of categorical features is: 6
voc size citizenship is: 162
voc size current club id is: 391
voc size position is: 4
voc size sub position is: 16
voc size competitions is: 2717
voc size clubs is: 8054


## Models

In [16]:
# Define a linear regression model class that inherits from nn.Module
class LinearRegression(nn.Module):
  """Linear regression on numerical features plus learned categorical embeddings.

  Each of the six categorical features gets its own embedding table whose width
  is the integer part of the fourth root of its vocabulary size. The embeddings
  are concatenated to the numerical features and fed to one linear layer with a
  single output.
  """

  def __init__(self, num_input_size, voc_size_citizenship, voc_size_current_club_id, voc_size_position, voc_size_sub_position, voc_size_competitions, voc_size_clubs, device):
    """Create the six embedding tables and the output layer on `device`.

    num_input_size is the number of numerical features; every voc_size_*
    argument is the vocabulary size of the corresponding categorical feature.
    """
    super().__init__()

    # (attribute name, vocabulary size) in the column order used by forward()
    embedding_specs = (
      ("emb_cit", voc_size_citizenship),
      ("emb_club", voc_size_current_club_id),
      ("emb_pos", voc_size_position),
      ("emb_sub_pos", voc_size_sub_position),
      ("emb_comp", voc_size_competitions),
      ("emb_clubs", voc_size_clubs),
    )

    # Width of the concatenated vector: numerical features + all embeddings
    total_input_size = num_input_size
    for attr_name, vocab_size in embedding_specs:
      emb_dim = int(vocab_size ** (1/4))
      setattr(self, attr_name, nn.Embedding(vocab_size, emb_dim, device=device))
      total_input_size += emb_dim

    # Single linear layer mapping the concatenated vector to one value
    self.fc1 = nn.Linear(total_input_size, 1, device=device)

  def forward(self, x, to_encode):
    """Return one prediction per sample as a 1-D tensor.

    x: tensor of shape (batch_size, num_input_size) with the numerical features
    to_encode: tensor of shape (batch_size, 6) with the categorical values,
      one column per embedding table
    """
    tables = (self.emb_cit, self.emb_club, self.emb_pos, self.emb_sub_pos, self.emb_comp, self.emb_clubs)

    # Column k of to_encode is looked up in the k-th embedding table
    embedded = [table(to_encode[:, column]) for column, table in enumerate(tables)]

    # Numerical features first, then the embeddings, along the feature dimension
    features = torch.cat([x] + embedded, dim=1)

    return self.fc1(features).flatten()

In [17]:
# Define a multilayer perceptron model class that inherits from nn.Module
class MLP(nn.Module):
  """Multilayer perceptron on numerical features plus learned categorical embeddings.

  Each of the six categorical features gets its own embedding table whose width
  is the integer part of the fourth root of its vocabulary size. The embeddings
  are concatenated to the numerical features and passed through three hidden
  stages (linear -> dropout -> batch norm -> ReLU) and a final linear layer with
  a single output.
  """

  def __init__(self, num_input_size, voc_size_citizenship, voc_size_current_club_id, voc_size_position, voc_size_sub_position, voc_size_competitions, voc_size_clubs, hidden_size1, hidden_size2, hidden_size3, dropout, device):
    """Create the embedding tables and the dense stack on `device`.

    hidden_size1..3 are the widths of the three hidden layers; `dropout` is the
    drop probability of the single dropout module shared by all hidden stages.
    """
    super().__init__()

    # (attribute name, vocabulary size) in the column order used by forward()
    embedding_specs = (
      ("emb_cit", voc_size_citizenship),
      ("emb_club", voc_size_current_club_id),
      ("emb_pos", voc_size_position),
      ("emb_sub_pos", voc_size_sub_position),
      ("emb_comp", voc_size_competitions),
      ("emb_clubs", voc_size_clubs),
    )

    # Width of the concatenated vector: numerical features + all embeddings
    total_input_size = num_input_size
    for attr_name, vocab_size in embedding_specs:
      emb_dim = int(vocab_size ** (1/4))
      setattr(self, attr_name, nn.Embedding(vocab_size, emb_dim, device=device))
      total_input_size += emb_dim

    # Dense stack: total_input_size -> hidden1 -> hidden2 -> hidden3 -> 1
    self.fc1 = nn.Linear(total_input_size, hidden_size1, device=device)
    self.relu = nn.ReLU()
    self.batchnorm1 = nn.BatchNorm1d(hidden_size1, device=device)
    self.fc2 = nn.Linear(hidden_size1, hidden_size2, device=device)
    self.batchnorm2 = nn.BatchNorm1d(hidden_size2, device=device)
    self.fc3 = nn.Linear(hidden_size2, hidden_size3, device=device)
    self.batchnorm3 = nn.BatchNorm1d(hidden_size3, device=device)
    self.fc4 = nn.Linear(hidden_size3, 1, device=device)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, to_encode):
    """Return one prediction per sample as a 1-D tensor.

    x: tensor of shape (batch_size, num_input_size) with the numerical features
    to_encode: tensor of shape (batch_size, 6) with the categorical values,
      one column per embedding table
    """
    tables = (self.emb_cit, self.emb_club, self.emb_pos, self.emb_sub_pos, self.emb_comp, self.emb_clubs)

    # Column k of to_encode is looked up in the k-th embedding table
    embedded = [table(to_encode[:, column]) for column, table in enumerate(tables)]

    # Numerical features first, then the embeddings, along the feature dimension
    hidden = torch.cat([x] + embedded, dim=1)

    # Every hidden stage applies: linear -> dropout -> batch norm -> ReLU
    stages = (
      (self.fc1, self.batchnorm1),
      (self.fc2, self.batchnorm2),
      (self.fc3, self.batchnorm3),
    )
    for linear, norm in stages:
      hidden = self.relu(norm(self.dropout(linear(hidden))))

    return self.fc4(hidden).flatten()

In [18]:
# Define a LSTM model class that inherits from nn.Module
class LSTM(nn.Module):
  """LSTM regressor on numerical features plus learned categorical embeddings.

  Each of the six categorical features gets its own embedding table whose width
  is the integer part of the fourth root of its vocabulary size. The embeddings
  are concatenated to the numerical features, treated as a sequence of length
  one, passed through a (possibly stacked) LSTM and mapped to a single value by
  a linear layer applied to the hidden state of the LAST LSTM layer.
  """

  def __init__(self, num_input_size, voc_size_citizenship, voc_size_current_club_id, voc_size_position, voc_size_sub_position, voc_size_competitions, voc_size_clubs, hidden_size1, num_layers, dropout, device):
    """Create the embedding tables, the LSTM and the output layer on `device`.

    hidden_size1 is the LSTM hidden size, num_layers the number of stacked LSTM
    layers and dropout the dropout probability passed to nn.LSTM.
    """
    # Call the parent constructor
    super(LSTM, self).__init__()

    # Compute the embedding sizes for each categorical feature using the fourth root of the vocabulary size
    emb_size_citizenship = int(voc_size_citizenship ** (1/4))
    emb_size_current_club_id = int(voc_size_current_club_id ** (1/4))
    emb_size_position = int(voc_size_position ** (1/4))
    emb_size_sub_position = int(voc_size_sub_position ** (1/4))
    emb_size_competitions = int(voc_size_competitions ** (1/4))
    emb_size_clubs = int(voc_size_clubs ** (1/4))

    # Define embedding layers for each categorical feature using the computed embedding sizes
    self.emb_cit = nn.Embedding(voc_size_citizenship, emb_size_citizenship, device=device)
    self.emb_club = nn.Embedding(voc_size_current_club_id, emb_size_current_club_id, device=device)
    self.emb_pos = nn.Embedding(voc_size_position, emb_size_position,  device=device)
    self.emb_sub_pos = nn.Embedding(voc_size_sub_position, emb_size_sub_position, device=device)
    self.emb_comp = nn.Embedding(voc_size_competitions, emb_size_competitions, device=device)
    self.emb_clubs = nn.Embedding(voc_size_clubs, emb_size_clubs, device=device)

    # Compute the total input size by adding the input size and the embedding sizes
    total_input_size = num_input_size + emb_size_citizenship + emb_size_current_club_id + emb_size_position + emb_size_sub_position + emb_size_competitions + emb_size_clubs

    # Stacked LSTM over the concatenated feature vector (batch dimension first)
    self.lstm1 = nn.LSTM(total_input_size, hidden_size1, num_layers=num_layers, batch_first=True, dropout=dropout, device=device)

    # Linear layer that maps the LSTM hidden state to a single value
    self.fc1 = nn.Linear(hidden_size1, 1, device=device)

  def forward(self, x, to_encode):
    """Return one prediction per sample as a 1-D tensor.

    x: tensor of shape (batch_size, num_input_size) with the numerical features
    to_encode: tensor of shape (batch_size, 6) with the categorical values,
      one column per embedding table
    """
    # Get the embeddings for each categorical feature using the corresponding embedding layer and indexing by the feature values
    cit_emb = self.emb_cit(to_encode[:, 0])
    club_emb = self.emb_club(to_encode[:, 1])
    pos_emb = self.emb_pos(to_encode[:, 2])
    sub_pos_emb = self.emb_sub_pos(to_encode[:, 3])
    comp_emb = self.emb_comp(to_encode[:, 4])
    clubs_emb = self.emb_clubs(to_encode[:, 5])

    # Concatenate the numerical input and the embeddings along the second dimension
    x = torch.cat((x, cit_emb, club_emb, pos_emb, sub_pos_emb, comp_emb, clubs_emb), dim=1)

    # Add a sequence dimension of length 1: (batch, features) -> (batch, 1, features)
    x = x[:, None, :]

    # h1 has shape (num_layers, batch, hidden_size1)
    out1, (h1, c1) = self.lstm1(x)

    # Use the hidden state of the LAST layer. Indexing h1[0] would read the first
    # layer, so with num_layers > 1 the upper layers would never influence the
    # prediction nor receive any gradient.
    output = self.fc1(h1[-1])

    # Return the output
    return output.flatten()

In [19]:
def build_model(model_type, num_input_size, voc_size_citizenship, voc_size_current_club_id, voc_size_position, voc_size_sub_position, voc_size_competitions, voc_size_clubs, dropout):
  """Instantiate the PyTorch model selected by `model_type` on the global `device`.

  model_type: "linear", "mlp" or "lstm"
  dropout: dropout probability, used by the "mlp" and "lstm" models only
  Raises Exception("wrong model") for any other model_type.
  """
  # Arguments shared by the constructors of the three model classes
  shared_args = (num_input_size, voc_size_citizenship, voc_size_current_club_id, voc_size_position, voc_size_sub_position, voc_size_competitions, voc_size_clubs)

  if model_type == "linear":
    return LinearRegression(*shared_args, device)

  if model_type == "mlp":
    # Fixed widths of the three hidden layers
    hidden_sizes = (176, 64, 16)
    return MLP(*shared_args, *hidden_sizes, dropout, device)

  if model_type == "lstm":
    # Fixed hidden size and number of stacked layers
    lstm_hidden_size, lstm_layers = 32, 2
    return LSTM(*shared_args, lstm_hidden_size, lstm_layers, dropout, device)

  raise Exception("wrong model")

In [20]:
def build_optimizer(model, opt, lr, eps):
  """Create the optimizer named by `opt` for the parameters of `model`.

  opt: "adam" (uses lr and eps) or "sgd" (uses lr with momentum 0.9; eps is ignored)
  Raises Exception("wrong optimizer") for any other name.
  """
  if opt == "adam":
    return torch.optim.Adam(model.parameters(), lr=lr, eps=eps)

  if opt == "sgd":
    return torch.optim.SGD(model.parameters(), lr, momentum=0.9)

  raise Exception("wrong optimizer")

In [21]:
def build_dataloaders(batch_size):
  """Return (train_loader, val_loader, test_loader) over the global datasets.

  Only the training loader shuffles its samples; validation and test keep the
  dataset order.
  """
  def make_loader(dataset, shuffle):
    return torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle)

  return (
    make_loader(dataset_train, True),
    make_loader(dataset_val, False),
    make_loader(dataset_test, False),
  )

In [22]:
def run(config=None):
  """Execute one W&B run: train the selected model and log its test metrics.

  The model to train is chosen by the GLOBAL variable `model_type` (not by the
  config). The hyperparameters are read from wandb.config, which is filled by
  the sweep controller when this function is called through wandb.agent.
  """
  # Initialize a new wandb run
  with wandb.init(config=config):

    # If called by wandb.agent, this config will be set by Sweep Controller
    config = wandb.config
    wandb.log({"model name": model_type})

    # Log zero placeholders for every validation and test metric
    metric_val = {"val_r2score": 0, "val_mse": 0, "val_rmse": 0, "val_mae": 0}
    wandb.log(metric_val)

    metric_test = {"test_r2score": 0, "test_mse": 0, "test_rmse": 0, "test_mae": 0}
    wandb.log(metric_test)

    criterion = nn.MSELoss()

    if model_type in ("rf", "gbr", "bagging"):
      # scikit-learn ensembles
      if model_type == "rf":
        model = RandomForestRegressor(n_estimators=config.n_est, max_features=config.max_features, max_depth=config.m_depth, n_jobs=4)

      elif model_type == "bagging":
        # setting max_features to 1.0 is equivalent to do bagging
        model = RandomForestRegressor(n_estimators=config.n_est, max_features=1.0, max_depth=config.m_depth, n_jobs=4)

      elif model_type == "gbr":
        model = GradientBoostingRegressor(n_estimators=config.n_est, learning_rate=config.lr, max_features=config.max_features, max_depth=config.m_depth)

      else:
        raise Exception("wrong model type")

      # launch_model_sk returns (predictions, targets), in this order
      all_predictions, all_targets = launch_model_sk(model, train_set, val_set, y_train, y_val)

    else:
      # PyTorch models: model, optimizer, dataloaders and scheduler
      model = build_model(model_type, num_input_size, voc_size_citizenship, voc_size_current_club_id, voc_size_position, voc_size_sub_position, voc_size_competitions, voc_size_clubs, dropout=config.dropout)
      optimizer = build_optimizer(model, config.optimizer, config.lr, config.eps)

      train_loader, val_loader, test_loader = build_dataloaders(config.batch_size)
      scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config.epochs, eta_min=0.000001)

      # Train and validation
      wandb.watch(model, criterion, log="all", log_freq=100)

      # launch_model_pytorch returns (targets, predictions), in this order
      all_targets, all_predictions = launch_model_pytorch(model, criterion, optimizer, scheduler, train_loader, val_loader, test_loader, config.epochs, config.batch_size, config.lr)

    # Test metrics computed on the predictions returned by the launcher
    test_mse = mean_squared_error(all_targets, all_predictions)
    metric_test = {
      "test_r2score": r2_score(all_targets, all_predictions),
      "test_mse": test_mse,
      "test_rmse": np.sqrt(test_mse),
      "test_mae": mean_absolute_error(all_targets, all_predictions),
    }

    print(metric_test)

    # Logging the metrics
    wandb.log(metric_test)

    # terminate the run
    wandb.finish()

In [23]:
def launch_model_sk(model, train_set, val_set, y_train, y_val):
    """Fit a scikit-learn regressor, log validation metrics and predict the test set.

    The categorical columns are prepended to the numerical ones as plain numeric
    features. Besides its arguments, the function reads the globals
    train_to_encode, val_to_encode, test_to_encode, test_set, y_test and
    model_type.

    Args:
        model: unfitted estimator exposing fit() and predict().
        train_set, val_set: numerical features of the train / validation split.
        y_train, y_val: targets of the train / validation split.

    Returns:
        (all_predictions, all_targets) for the TEST split, in this order.
    """
    import os

    #training
    x_train = np.concatenate((train_to_encode, train_set), axis=1)
    model.fit(x_train, y_train)

    #save the model; the output directory is created on demand and the file is
    #closed deterministically by the context manager
    os.makedirs("models", exist_ok=True)
    with open("models/best_{}".format(model_type), 'wb') as model_file:
        pickle.dump(model, model_file)

    #validation
    x_val = np.concatenate((val_to_encode, val_set), axis=1)
    all_predictions_val = model.predict(x_val)
    all_targets_val = y_val
    val_r2score = r2_score(all_targets_val, all_predictions_val)
    val_mse = mean_squared_error(all_targets_val, all_predictions_val)
    val_rmse = np.sqrt(val_mse)
    val_mae = mean_absolute_error(all_targets_val, all_predictions_val)
    metric_val = {"val_r2score": val_r2score, "val_mse": val_mse, "val_rmse": val_rmse, "val_mae": val_mae}
    wandb.log(metric_val)

    #testing
    x_test = np.concatenate((test_to_encode, test_set), axis=1)
    all_predictions = model.predict(x_test)
    all_targets = y_test

    return all_predictions, all_targets

In [24]:
def launch_model_pytorch(model, criterion, optimizer, scheduler, train_loader, val_loader, test_loader, epochs, batch_size, lr):
    """Train a PyTorch model, keep the best checkpoint and predict the test set.

    Every epoch the model is trained, the scheduler is stepped and the model is
    validated; the weights with the lowest validation loss are saved to
    'models/<ModelClass>_lr=<lr>_bs=<batch_size>_opt=<OptimizerClass>.pth'.
    After the last epoch those weights are reloaded, validated once more and
    used to predict the test set.

    Args:
        model, criterion, optimizer, scheduler: training components.
        train_loader, val_loader, test_loader: loaders of the three splits.
        epochs: number of training epochs.
        batch_size, lr: only used to build the checkpoint file name.

    Returns:
        (all_targets, all_predictions) for the test split, in this order.
    """
    import os

    checkpoint_path = 'models/{}_lr={}_bs={}_opt={}.pth'.format(model.__class__.__name__, lr, batch_size, type(optimizer).__name__)
    # torch.save does not create missing directories
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    best_val_loss = np.inf
    # Tracks whether THIS call wrote the checkpoint, so that a stale file left by
    # a previous run with the same name is never loaded by mistake
    checkpoint_saved = False

    for it in tqdm(range(epochs)):
        train_loss = train(model, criterion, optimizer, train_loader)
        scheduler.step()

        val_loss = validate(model, criterion, val_loader)
        wandb.log({
            'train_loss': train_loss,
            'val_loss': val_loss,
            'epochs': it+1
          })

        #We save the best model
        if val_loss < best_val_loss:
            torch.save(model.state_dict(), checkpoint_path)
            best_val_loss = val_loss
            checkpoint_saved = True

    #load the best model saved (skipped when no epoch produced a checkpoint,
    #e.g. epochs == 0 or a validation loss that was never a finite number)
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path))

    #Validate
    final_val_loss = validate(model, criterion, val_loader)
    wandb.log({
        'final_val_loss': final_val_loss,
      })

    #Testing
    all_targets, all_predictions = testing(model, test_loader)

    return all_targets, all_predictions

In [25]:
def train(model, criterion, optimizer, train_loader):
    """Train `model` for one epoch and return the SUM of the per-batch losses.

    Each batch yielded by `train_loader` is a (numerical inputs, categorical
    inputs, targets) triple; the model is called with the first two.
    """
    model.train()
    batch_losses = []

    for features, categorical, expected in train_loader:
        # Gradients accumulate by default: reset them before every batch
        optimizer.zero_grad()

        # Forward pass and loss
        batch_loss = criterion(model(features, categorical), expected)

        # Backward pass and parameter update
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return np.sum(batch_losses)

In [26]:
def validate(model, criterion, val_loader):
    """Evaluate `model` on `val_loader`, log the metrics and return the summed loss.

    The model is switched to evaluation mode and the forward passes run without
    gradient tracking. R2, MSE, RMSE and MAE over the whole loader are logged to
    W&B under the val_* keys.

    Returns:
        The SUM of the per-batch losses.
    """
    model.eval()
    test_loss = []
    all_targets = []
    all_predictions = []

    # No gradients are needed for evaluation: skip building the autograd graph
    with torch.no_grad():
        for inputs, to_encode, targets in val_loader:
            outputs = model(inputs, to_encode)
            loss = criterion(outputs, targets)
            test_loss.append(loss.item())
            all_targets.append(targets.cpu().numpy())
            all_predictions.append(outputs.cpu().numpy())

    all_targets = np.concatenate(all_targets)
    all_predictions = np.concatenate(all_predictions)

    val_r2score = r2_score(all_targets, all_predictions)
    val_mse = mean_squared_error(all_targets, all_predictions)
    val_rmse = np.sqrt(val_mse)
    val_mae = mean_absolute_error(all_targets, all_predictions)
    metric_val = {"val_r2score": val_r2score, "val_mse": val_mse, "val_rmse": val_rmse, "val_mae": val_mae}
    wandb.log(metric_val)

    return np.sum(test_loss)

In [27]:
def testing(model, test_loader):
  all_targets = []
  all_predictions = []

  for inputs, to_encode, targets in test_loader:
      # Forward pass
      outputs = model(inputs, to_encode)
      copy_prediction = outputs.clone()
      all_targets.append(targets.cpu().numpy())
      all_predictions.append(copy_prediction.detach().cpu().numpy())

  all_targets = np.concatenate(all_targets)
  all_predictions = np.concatenate(all_predictions)

  return all_targets, all_predictions

## Launching a Hyperparameter Search (a sweep of 10 runs) for every model

In [28]:
models = ["rf", "gbr", "bagging", "mlp", "lstm", "linear"]

# One random-search sweep (capped at 10 runs) per model.
# The loop variable has to be called `model_type`: run() and launch_model_sk()
# read it as a global to decide which model to build and how to name the file.
for model_type in models:

    # Search space of the current model
    if model_type == "bagging":
        parameters_dict = {
            'm_depth': {
                'distribution': 'int_uniform',
                'max': total_input_size,
                'min': 10,
                },
            'n_est': {
                'distribution': 'int_uniform',
                'max': 500,
                'min': 50,
                },
            }
    elif model_type == "rf":
        parameters_dict = {
            'm_depth': {
                'distribution': 'int_uniform',
                'max': total_input_size,
                'min': 10,
                },
            'n_est': {
                'distribution': 'int_uniform',
                'max': 500,
                'min': 50,
                },
            'max_features': {
                'distribution': 'int_uniform',
                'max': total_input_size,
                'min': 4,
                },
            }
    elif model_type == "gbr":
        parameters_dict = {
            'm_depth': {
                'distribution': 'int_uniform',
                'max': total_input_size,
                'min': 10,
                },
            'n_est': {
                'distribution': 'int_uniform',
                'max': 500,
                'min': 50,
                },
            'max_features': {
                'distribution': 'int_uniform',
                'max': total_input_size,
                'min': 4,
                },
            'lr': {
                'distribution': 'uniform',
                'max': 1.0,
                'min': 0.001,
                },
            }
    else:
        # PyTorch models ("mlp", "lstm", "linear")
        parameters_dict = {
            'epochs': {
                'value': 50
                },
            'optimizer': {
                'values': ['adam', 'sgd']
                },
            'dropout': {
                'values': [0.2]
                },
            'lr': {
                'distribution': 'uniform',
                'max': 0.01,
                'min': 0.0001,
                },
            'batch_size': {
                'values': [32, 128]
                },
            'eps': {
                'value': 1e-08
                },
            }

    # The model type is stored in the sweep configuration as a constant parameter
    parameters_dict['model_type'] = {
        'value': model_type
        }

    # Random search minimizing val_mse, with hyperband early termination
    sweep_config = {
        'method': 'random',
        'metric': {
            'goal': 'minimize',
            'name': 'val_mse'
        },
        'early_terminate': {
            'type': 'hyperband',
            'min_iter': 3,
            'eta': 2
        },
        'run_cap': 10,
        'parameters': parameters_dict,
      }

    sweep_id = wandb.sweep(sweep_config, project="BDproject2")
    wandb.agent(sweep_id, run, count=sweep_config["run_cap"])

In [31]:
# Manual single run, used to test and debug the pipeline outside of a sweep.

# run() selects the model through the global `model_type`
model_type = "rf"

class DebugConfig:
    """Hand-written stand-in for a sweep configuration.

    The constructor arguments cover the PyTorch training settings; the
    remaining hyperparameters (dropout, n_est, m_depth, lr, max_features) are
    fixed values. The class has its own CamelCase name so that the `config`
    instance created below does not shadow it.
    """
    def __init__(self, epochs, batch_size, eps, weight_decay, optimizer, model_type):
        self.epochs = epochs
        self.batch_size = batch_size
        self.eps = eps
        self.weight_decay = weight_decay
        self.optimizer = optimizer
        self.dropout = 0.3
        self.model_type = model_type
        self.n_est = 143
        self.m_depth = 20
        self.lr = 0.274
        self.max_features = 20

config = DebugConfig(epochs=30, batch_size=128, eps=1e-08, weight_decay=0.0, optimizer="sgd", model_type=model_type)

run(config)


{'test_r2score': 0.9561701986008412, 'test_mse': 0.04715347911198005, 'test_rmse': 0.21714851855810588, 'test_mae': 0.08774438604767303}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
test_mae,▁█
test_mse,▁█
test_r2score,▁█
test_rmse,▁█
val_mae,▁█
val_mse,▁█
val_r2score,▁█
val_rmse,▁█

0,1
model name,rf
test_mae,0.08774
test_mse,0.04715
test_r2score,0.95617
test_rmse,0.21715
val_mae,0.08465
val_mse,0.04687
val_r2score,0.95599
val_rmse,0.21649
