# CUP dataset

Members:
- Dieudunne
- Mostafa
- Matteo

In [None]:
# Load IPython's autoreload extension and set it to mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import os

# Register <current working directory>/src on the import path.
# The membership check keeps sys.path free of duplicates when the cell is re-run.
working_dir = os.getcwd()
src_path = os.path.join(working_dir, 'src')
if src_path not in sys.path:
    sys.path.append(src_path)


In [None]:
from src.activation_functions import *
from src.loss_functions import *
from src.random_search import *
from src.batch_normalization import *
from src.layer import *
from src.early_stopping import *
from src.utils import *
from src.optimizers import *
import itertools
import pandas as pd
import numpy as np
import sys
import os


## Data loading and splitting

In [None]:
# Paths to the two ML-CUP24 CSV files; only df_path is read in this cell.
df_path = "../ML_project/data/cup/ML-CUP24-TR.csv"
df_path2 = "../ML_project/data/cup/ML-CUP24-TS.csv"

# Skip the first 7 lines of the file and treat every remaining row as data
# (no header row), then show the first rows as a quick sanity check.
df = pd.read_csv(df_path, header=None, skiprows=7)
df.head()

In [None]:
# Name the columns of the raw frame (ID, 12 inputs, 3 targets) and drop the ID.
# The cell is guarded so that it can be re-run safely: once "ID" has been
# dropped the frame has one column fewer, and blindly re-assigning the full
# list of names would raise a length-mismatch error.
column_names = ["ID"] + [f'input_{i}' for i in range(12)] + ['target_x', 'target_y', 'target_z']
if df.shape[1] == len(column_names):
    # Raw frame as loaded from the CSV: label the columns, then discard the ID.
    df.columns = column_names
    df = df.drop(columns="ID")
elif list(df.columns) != column_names[1:]:
    # Neither the raw layout nor the already-processed one: fail loudly.
    raise ValueError(
        f"Expected {len(column_names)} raw columns or an already-renamed frame, "
        f"got columns {list(df.columns)}"
    )

Split the training set into training and validation sets.

In [None]:
# Split df into training/validation parts with the three target columns as
# outputs. The call also returns the two scaler objects produced by
# preprocess_data (z-score normalisation requested, 20% validation ratio).
target_columns = ["target_x", "target_y", "target_z"]
X_train, X_val, y_train, y_val, X_scaler, y_scaler = preprocess_data(
    df,
    target=target_columns,
    normalize_type="z-score",
    val_ratio=0.2,
    regression=True,
)

In [None]:
# Display the shapes of the four splits as one tuple.
tuple(split.shape for split in (X_train, y_train, X_val, y_val))

In [None]:
# Convert every split to a NumPy array (np.asarray avoids a copy when the
# input already is an ndarray).
X_train, y_train, X_val, y_val = (
    np.asarray(split) for split in (X_train, y_train, X_val, y_val)
)

## Random search

Define the distribution of candidate values for each hyperparameter, then run a random search over these distributions to select the best configuration.

In [None]:
# Search space: every key maps to the list of candidate values for that
# hyperparameter. Key order is kept as originally written.
param_distributions = dict(
    hidden_size=[8, 16, 32, 64],
    n_h_layers=[2, 3],
    hidden_activation=[
        Activation_Tanh,
        Activation_Leaky_ReLU,
        Activation_Sigmoid,
        Activation_ReLU,
    ],
    batch_norm=[True, False],
    # Log-spaced grids, converted to plain Python floats.
    learning_rate=np.logspace(-3, -1, num=5).tolist(),
    l1=np.logspace(-5, -1, num=5).tolist(),
    l2=np.logspace(-5, -1, num=5).tolist(),
    dropout_rate=np.logspace(-5, -1, num=20).tolist(),
    batch_size=[8, 16, 32, 64],
    n_epochs=[100, 200, 300],
    weight_decay=[0, 5e-2, 1e-2, 1e-3, 1e-4, 1e-5],
    patience=[0, 30, 50],
    weights_init=['gaussian', 'gaussian_scaled', 'xavier', 'he', 'random'],
    # Single fixed choices: an instantiated linear output activation and one
    # scheduler decay value.
    output_activation=[Activation_Linear()],
    sched_decay=[2],
)

n_combinations = count_permutations(param_distributions)
print(f"Number of possible combinations: {n_combinations}")

In [None]:
# Run the random search on the training split and keep the best configuration
# together with its performance. Adjust n_iters as needed.
best_hyperparams, best_performance = random_search(
    X_train=X_train,
    y_train=y_train,
    param_distributions=param_distributions,
    n_iters=500,
    regression=True,
    csv_path="cup_top5res.csv",
)


In [None]:
# Show the configuration returned by the random search.
print(best_hyperparams)

## Final model training

In [None]:
# Build the final model from the best configuration found by the search.
# 'CC' is not one of the keys declared in param_distributions, so it is looked
# up defensively: when the flag is absent the standard feed-forward network is
# used instead of failing with a KeyError.
if best_hyperparams.get('CC', False):
    # NOTE(review): this branch uses a sigmoid output activation while the data
    # was preprocessed with normalize_type="z-score" and regression=True.
    # If the targets are standardised too, a bounded output presumably cannot
    # reach them; confirm whether a linear output is intended here.
    model = CascadeCorrelation(
        input_size=12,
        output_size=3,
        activation=Activation_Leaky_ReLU,
        output_activation=Activation_Sigmoid,
    )
else:
    # 12 input features and 3 regression targets; everything else comes from
    # the selected hyperparameters.
    model = NN(
        l1=best_hyperparams['l1'],
        l2=best_hyperparams['l2'],
        input_size=12,
        hidden_size=best_hyperparams['hidden_size'],
        output_size=3,
        hidden_activation=best_hyperparams['hidden_activation'],
        dropout_rate=best_hyperparams['dropout_rate'],
        use_batch_norm=best_hyperparams['batch_norm'],
        weights_init=best_hyperparams['weights_init'],
        n_h_layers=best_hyperparams['n_h_layers'],
        output_activation=best_hyperparams['output_activation']
    )

# Train on the training split, evaluate on the validation split, then report
# the last recorded validation score and loss and plot the curves.
train = Train(best_hyperparams, model, regression=True)
train.train_and_evaluate(X_train, y_train, X_val, y_val)
print(f"Final Validation R² Score: {train.val_scores[-1]:.4f}; Loss {train.val_losses[-1]:.4f}")
train.plot(score=True)