## Libraries

In [1]:
# For handling data
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# For plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# For machine learning
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

from sklearn.multioutput import MultiOutputRegressor
from functions import mean_euclidean_distance_error, RandomizedSearchHO, SFSExtractor

In [2]:
# Folder holding the project data, and the two CSV files read from it
path = './DSL_Winter_Project_2024/'
df_dev = pd.read_csv(f'{path}development.csv')
df_evaluation = pd.read_csv(f'{path}evaluation.csv')

In [3]:
# Features: every column from 'pmax[0]' through the last column of the frame
X = df_dev.loc[:, 'pmax[0]':]
# Targets: the 'x' and 'y' columns, kept together as a two-column frame
y = df_dev.loc[:, ['x', 'y']]

In [4]:
# Split X, y into a train_val part (70%) and a test part (30%).
# The fixed random_state keeps the split reproducible; the targets are
# passed as a plain array via `.values`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y.values,
    test_size=0.3,
    random_state=8,
)

In [5]:
# Column names kept for modelling, generated as '<family>[<index>]'.
# The inner dict lists the retained indices per feature family; its insertion
# order (pmax, negpmax, tmax, area, rms) fixes the order of the resulting list.
features_selected = [
    f'{family}[{index}]'
    for family, indices in {
        'pmax': [1, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14, 15],
        'negpmax': [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14],
        'tmax': [1, 3, 4, 5, 8, 10, 11, 13, 14, 15, 16, 17],
        'area': [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14],
        'rms': [0, 1, 2, 4, 5, 6, 7, 9, 10, 11, 14, 17],
    }.items()
    for index in indices
]

In [6]:
# One-step preprocessing pipeline: the 'dim_red' step is an SFSExtractor built
# from features_selected (presumably it keeps only those columns — SFSExtractor
# lives in functions.py, confirm there).
preprocessing_pipeline = Pipeline(steps=[('dim_red', SFSExtractor(features_selected))])

In [7]:
# Fit the pipeline on the training split only, then apply the already-fitted
# pipeline to the test split. `.values` is taken from each result (presumably
# a DataFrame, giving a NumPy array — confirm against SFSExtractor).
X_train_SFS = preprocessing_pipeline.fit_transform(X_train).values
X_test_SFS = preprocessing_pipeline.transform(X_test).values

## Parameter grids for the different estimators

In [8]:
# Search space for RandomForestRegressor. The forest is wrapped in a
# MultiOutputRegressor, so every parameter name is given the 'estimator__'
# prefix; the prefix is applied once here instead of being repeated per key.
rf_param_grid = {
    f'estimator__{name}': candidates
    for name, candidates in {
        'n_estimators': [50, 100, 200, 400, 600, 1000],
        'max_features': ['sqrt', None],
        'min_samples_split': [2, 5, 10],
        'bootstrap': [True, False],
        'criterion': ["squared_error", "poisson"],
        'n_jobs': [-1],
    }.items()
}

# Search space for GradientBoostingRegressor. Keys get the 'estimator__'
# prefix because the model is wrapped in a MultiOutputRegressor.
gb_param_grid = {
    f'estimator__{name}': candidates
    for name, candidates in {
        'n_estimators': [50, 100, 200, 400, 600, 1000],
        'loss': ['huber', 'squared_error', 'absolute_error'],
        'learning_rate': [0.01, 0.05, 0.1, 0.25, 0.5],
        'max_depth': [3, 7, 13, 19, 29],
    }.items()
}

# Search space for KNeighborsRegressor. Keys get the 'estimator__' prefix
# because the model is wrapped in a MultiOutputRegressor.
knn_param_grid = {
    f'estimator__{name}': candidates
    for name, candidates in {
        'n_neighbors': [7, 9, 11, 13, 15],
        'weights': ['uniform', 'distance'],
        'p': [1, 2, 3, 4],
    }.items()
}

# Search space for SVR. Keys get the 'estimator__' prefix because the model
# is wrapped in a MultiOutputRegressor.
svr_param_grid = {
    f'estimator__{name}': candidates
    for name, candidates in {
        'C': [0.1, 1, 10, 100],
        'gamma': [0.001, 0.01, 0.1, 1, 'scale', 'auto'],
        'kernel': ['rbf', 'poly', 'sigmoid'],
        'degree': [3, 4, 5],
        'epsilon': [0.01, 0.1, 1, 10],
    }.items()
}

# Search space for MLPRegressor. The network is wrapped in a
# MultiOutputRegressor, so every parameter must be addressed through the
# 'estimator__' prefix. Applying the prefix in one place guarantees that all
# entries — max_iter included — are routed to the wrapped MLPRegressor; a bare
# 'max_iter' key would target the MultiOutputRegressor wrapper, which has no
# such parameter.
mlp_param_grid = {
    f'estimator__{name}': candidates
    for name, candidates in {
        'hidden_layer_sizes': [(7,), (7, 7, 7), (16, 16, 16), (64, 64, 64), (128, 128, 128)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.001, 0.05],
        'learning_rate': ['constant', 'adaptive'],
        'max_iter': [200, 1000, 10000],
    }.items()
}

## Random forest
- Using the Sequential Feature Selection (SFS) approach

In [9]:
# Randomized search over the random-forest grid (third argument: 10, matching
# the "1/10" counter in the saved log), scored with the mean Euclidean
# distance error. The forest is wrapped in a MultiOutputRegressor.
# NOTE(review): the saved output of this cell ends in a MemoryError after the
# first model — the grid's n_jobs=-1 together with large forests may be the
# cause; confirm before re-running.
rf_randomized_search = RandomizedSearchHO(
    MultiOutputRegressor(RandomForestRegressor()),
    rf_param_grid,
    10,
    scoring=mean_euclidean_distance_error,
)
rf_randomized_search.fit(X_train_SFS, y_train)

Training model 1/10
Hyperparameters: {'estimator__n_estimators': 100, 'estimator__max_features': None, 'estimator__min_samples_split': 2, 'estimator__bootstrap': True, 'estimator__criterion': 'squared_error', 'estimator__n_jobs': -1}
Score: (4.783390094179218, 3.5987518984947062)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

MemoryError: 

In [None]:
# Best hyperparameters found, alongside the error measured on the test split
(
    rf_randomized_search.best_params_,
    mean_euclidean_distance_error(rf_randomized_search, X_test_SFS, y_test),
)

NameError: name 'rf_randomized_search' is not defined

## Gradient Boosting
- Using the Sequential Feature Selection (SFS) approach

In [None]:
# Randomized search over the gradient-boosting grid (third argument: 10),
# scored with the mean Euclidean distance error. The booster is wrapped in a
# MultiOutputRegressor.
gb_randomized_search = RandomizedSearchHO(
    MultiOutputRegressor(GradientBoostingRegressor()),
    gb_param_grid,
    10,
    scoring=mean_euclidean_distance_error,
)
gb_randomized_search.fit(X_train_SFS, y_train)

In [None]:
# Best hyperparameters found, alongside the error measured on the test split
(
    gb_randomized_search.best_params_,
    mean_euclidean_distance_error(gb_randomized_search, X_test_SFS, y_test),
)

## Support Vector Regression
- Using the Sequential Feature Selection (SFS) approach

In [None]:
# Randomized search over the SVR grid (third argument: 10), scored with the
# mean Euclidean distance error. The SVR is wrapped in a MultiOutputRegressor.
svr_randomized_search = RandomizedSearchHO(
    MultiOutputRegressor(SVR()),
    svr_param_grid,
    10,
    scoring=mean_euclidean_distance_error,
)
svr_randomized_search.fit(X_train_SFS, y_train)

In [None]:
# Best hyperparameters found, alongside the error measured on the test split
(
    svr_randomized_search.best_params_,
    mean_euclidean_distance_error(svr_randomized_search, X_test_SFS, y_test),
)

## MLP
- Using the Sequential Feature Selection (SFS) approach

In [None]:
# Randomized search over the MLP grid (third argument: 10), scored with the
# mean Euclidean distance error. The MLP is wrapped in a MultiOutputRegressor.
mlp_randomized_search = RandomizedSearchHO(
    MultiOutputRegressor(MLPRegressor()),
    mlp_param_grid,
    10,
    scoring=mean_euclidean_distance_error,
)
mlp_randomized_search.fit(X_train_SFS, y_train)

In [None]:
# Best hyperparameters found, alongside the error measured on the test split
(
    mlp_randomized_search.best_params_,
    mean_euclidean_distance_error(mlp_randomized_search, X_test_SFS, y_test),
)

## KNN
- Using the Sequential Feature Selection (SFS) approach

In [None]:
# Randomized search over the KNN grid (third argument: 10), scored with the
# mean Euclidean distance error. The KNN model is wrapped in a
# MultiOutputRegressor.
knn_randomized_search = RandomizedSearchHO(
    MultiOutputRegressor(KNeighborsRegressor()),
    knn_param_grid,
    10,
    scoring=mean_euclidean_distance_error,
)
knn_randomized_search.fit(X_train_SFS, y_train)

In [None]:
# Best hyperparameters found, alongside the error measured on the test split
(
    knn_randomized_search.best_params_,
    mean_euclidean_distance_error(knn_randomized_search, X_test_SFS, y_test),
)

In [10]:
# Candidate layer counts: 2, 5, 8, 11, 14 (a range is materialised directly
# rather than through an identity comprehension).
nlayers = list(range(2, 17, 3))
# Candidate layer widths: powers of two from 2**5 (32) up to 2**9 (512).
lsize = [2 ** exponent for exponent in range(5, 10)]

In [11]:
# Every width/depth combination as a tuple holding `depth` copies of `width`;
# widths vary slowest, depths fastest (presumably intended as candidate
# hidden_layer_sizes values for the MLP — confirm).
[(width,) * depth for width in lsize for depth in nlayers]

[(32, 32),
 (32, 32, 32, 32, 32),
 (32, 32, 32, 32, 32, 32, 32, 32),
 (32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32),
 (32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32),
 (64, 64),
 (64, 64, 64, 64, 64),
 (64, 64, 64, 64, 64, 64, 64, 64),
 (64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64),
 (64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64),
 (128, 128),
 (128, 128, 128, 128, 128),
 (128, 128, 128, 128, 128, 128, 128, 128),
 (128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128),
 (128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128),
 (256, 256),
 (256, 256, 256, 256, 256),
 (256, 256, 256, 256, 256, 256, 256, 256),
 (256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256),
 (256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256),
 (512, 512),
 (512, 512, 512, 512, 512),
 (512, 512, 512, 512, 512, 512, 512, 512),
 (512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512),
 (512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512)]