## Libraries

In [1]:
# For handling data
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import csv

# For plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# For machine learning
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor

from functions import custom_error, custom_error3, save_submission

In [2]:
# Folder that holds the project CSV files (relative to the notebook).
path = './DSL_Winter_Project_2024/'
# Load the training split first, then the test split, from that folder.
df_train, df_test = (
    pd.read_csv(path + csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Columns the models have to predict.
TARGET_COLUMNS = ['x', 'y']


def split_features_targets(frame):
    """Split a data frame into a feature matrix and the (x, y) targets.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing the feature columns up to 'rms[17]' plus the
        'x' and 'y' target columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(features, targets)`` where ``targets`` holds exactly the
        columns listed in ``TARGET_COLUMNS``.
    """
    # .loc label slices are inclusive: every column from the first one up to
    # and including 'rms[17]' is selected.
    features = frame.loc[:, :'rms[17]']
    # Guard against target leakage: if 'x'/'y' are stored before 'rms[17]'
    # the slice above would hand the answers to the model. Dropping them is
    # a no-op when they are not part of the slice.
    features = features.drop(columns=TARGET_COLUMNS, errors='ignore')
    return features, frame[TARGET_COLUMNS]


X_train, y_train = split_features_targets(df_train)
X_test, y_test = split_features_targets(df_test)

In [4]:
# Standard scaler: a one-step pipeline that only applies StandardScaler.
scaler_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# PCA: the same scaling step followed by a PCA keeping 73 components.
pca_steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=73)),
]
pca_pipeline = Pipeline(steps=pca_steps)

In [5]:
# Scaled features: the pipeline is fitted on the training split and then
# re-used, unchanged, on the test split (tuple items run left to right).
X_train_scaled, X_test_scaled = (
    scaler_pipeline.fit_transform(X_train),
    scaler_pipeline.transform(X_test),
)

# Scaled + PCA features, built the same way with the PCA pipeline.
X_train_pca, X_test_pca = (
    pca_pipeline.fit_transform(X_train),
    pca_pipeline.transform(X_test),
)

## Parameter grids for the different estimators

In [6]:
# Candidate hyper-parameters for each model family. Every key starts with
# ``estimator__`` so that it addresses the regressor wrapped inside
# MultiOutputRegressor, which is how the RF and GB grids are used below.

# Random forest: number of trees.
rf_param_grid = dict(estimator__n_estimators=[10, 100, 500, 1000, 2000])

# Gradient boosting: number of boosting stages.
gb_param_grid = dict(estimator__n_estimators=[10, 100, 500, 1000, 2000])

# k-nearest neighbours: neighbourhood size.
knn_param_grid = dict(estimator__n_neighbors=[1, 10, 50, 100, 1000, 5000])

# Support vector regression: C and gamma.
svr_param_grid = dict(
    estimator__C=[0.1, 1, 10, 100],
    estimator__gamma=[0.001, 0.01, 0.1, 1],
)

## Random forest
- Using ```train.csv```

In [None]:
# Grid search over the random-forest grid on the raw features.
#
# GridSearchCV keeps the candidate with the HIGHEST score, so an error metric
# has to be negated before it is used for model selection; otherwise
# best_params_ would point at the worst candidate.
# NOTE(review): assumes custom_error3 is a lower-is-better error, like
# custom_error (mean Euclidean distance) in functions.py - confirm.
rf_grid_search = GridSearchCV(
    # Fixed seed so that repeated runs of this cell give the same forest.
    MultiOutputRegressor(RandomForestRegressor(random_state=42)),
    rf_param_grid,
    cv=5,
    scoring=lambda estimator, X, y: -custom_error3(estimator, X, y),
)
rf_grid_search.fit(X_train, y_train)

In [None]:
# Selected hyper-parameters, followed by custom_error3 on (X_test, y_test).
(
    rf_grid_search.best_params_,
    custom_error3(rf_grid_search, X_test, y_test),
)

- Using the PCA-transformed features (```X_train_pca```)

In [None]:
# Grid search over the random-forest grid on the PCA-transformed features.
#
# GridSearchCV keeps the candidate with the HIGHEST score, so an error metric
# has to be negated before it is used for model selection; otherwise
# best_params_ would point at the worst candidate.
# NOTE(review): assumes custom_error3 is a lower-is-better error, like
# custom_error (mean Euclidean distance) in functions.py - confirm.
rf_pca_grid_search = GridSearchCV(
    # Fixed seed so that repeated runs of this cell give the same forest.
    MultiOutputRegressor(RandomForestRegressor(random_state=42)),
    rf_param_grid,
    cv=5,
    scoring=lambda estimator, X, y: -custom_error3(estimator, X, y),
)
rf_pca_grid_search.fit(X_train_pca, y_train)

In [None]:
# Selected hyper-parameters, followed by custom_error3 on (X_test_pca, y_test).
(
    rf_pca_grid_search.best_params_,
    custom_error3(rf_pca_grid_search, X_test_pca, y_test),
)

- Using the standardized features (```X_train_scaled```)

In [None]:
# Grid search over the random-forest grid on the standardized features.
#
# GridSearchCV keeps the candidate with the HIGHEST score, so an error metric
# has to be negated before it is used for model selection; otherwise
# best_params_ would point at the worst candidate.
# NOTE(review): assumes custom_error3 is a lower-is-better error, like
# custom_error (mean Euclidean distance) in functions.py - confirm.
rf_scaled_grid_search = GridSearchCV(
    # Fixed seed so that repeated runs of this cell give the same forest.
    MultiOutputRegressor(RandomForestRegressor(random_state=42)),
    rf_param_grid,
    cv=5,
    scoring=lambda estimator, X, y: -custom_error3(estimator, X, y),
)
rf_scaled_grid_search.fit(X_train_scaled, y_train)

In [None]:
# Selected hyper-parameters, followed by custom_error3 on (X_test_scaled, y_test).
(
    rf_scaled_grid_search.best_params_,
    custom_error3(rf_scaled_grid_search, X_test_scaled, y_test),
)

## Gradient boosting
- Using ```train.csv```

In [7]:
# Grid search over the gradient-boosting grid on the raw features.
#
# GridSearchCV keeps the candidate with the HIGHEST score, so an error metric
# has to be negated before it is used for model selection; otherwise
# best_params_ would point at the worst candidate.
# NOTE(review): assumes custom_error3 is a lower-is-better error, like
# custom_error (mean Euclidean distance) in functions.py - confirm.
gb_grid_search = GridSearchCV(
    # Fixed seed so that repeated runs of this cell give the same model.
    MultiOutputRegressor(GradientBoostingRegressor(random_state=42)),
    gb_param_grid,
    cv=5,
    scoring=lambda estimator, X, y: -custom_error3(estimator, X, y),
)
gb_grid_search.fit(X_train, y_train)

Traceback (most recent call last):
  File "c:\Users\mateo\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mateo\Documents\TODO\Ingeniería\Ingeniería de Sistemas U. de A\PoliTo\SEMESTRES\Primer año\Data science lab - Process and methods\Data science lab - Laboratories\Final project\functions.py", line 5, in custom_error
    return np.mean(np.diag(euclidean_distances(y_test, y_pred)))
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mateo\anaconda3\Lib\site-packages\sklearn\metrics\pairwise.py", line 338, in euclidean_distances
    return _euclidean_distances(X, Y, X_norm_squared, Y_norm_squared, squared)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mateo\anaconda3\Lib\site-packages\sklearn\metrics\pairwise.py", line 379, in _euclidean_distances
    distance

In [None]:
# Selected hyper-parameters, followed by custom_error3 on (X_test, y_test).
(
    gb_grid_search.best_params_,
    custom_error3(gb_grid_search, X_test, y_test),
)

- Using the PCA-transformed features (```X_train_pca```)

In [None]:
# Grid search over the gradient-boosting grid on the PCA-transformed features.
#
# GridSearchCV keeps the candidate with the HIGHEST score, so an error metric
# has to be negated before it is used for model selection; otherwise
# best_params_ would point at the worst candidate.
# NOTE(review): assumes custom_error3 is a lower-is-better error, like
# custom_error (mean Euclidean distance) in functions.py - confirm.
gb_pca_grid_search = GridSearchCV(
    # Fixed seed so that repeated runs of this cell give the same model.
    MultiOutputRegressor(GradientBoostingRegressor(random_state=42)),
    gb_param_grid,
    cv=5,
    scoring=lambda estimator, X, y: -custom_error3(estimator, X, y),
)
gb_pca_grid_search.fit(X_train_pca, y_train)

In [None]:
# Selected hyper-parameters, followed by custom_error3 on (X_test_pca, y_test).
(
    gb_pca_grid_search.best_params_,
    custom_error3(gb_pca_grid_search, X_test_pca, y_test),
)

- Using the standardized features (```X_train_scaled```)

In [None]:
# Grid search over the gradient-boosting grid on the standardized features.
#
# GridSearchCV keeps the candidate with the HIGHEST score, so an error metric
# has to be negated before it is used for model selection; otherwise
# best_params_ would point at the worst candidate.
# NOTE(review): assumes custom_error3 is a lower-is-better error, like
# custom_error (mean Euclidean distance) in functions.py - confirm.
gb_scaled_grid_search = GridSearchCV(
    # Fixed seed so that repeated runs of this cell give the same model.
    MultiOutputRegressor(GradientBoostingRegressor(random_state=42)),
    gb_param_grid,
    cv=5,
    scoring=lambda estimator, X, y: -custom_error3(estimator, X, y),
)
gb_scaled_grid_search.fit(X_train_scaled, y_train)

In [None]:
# Selected hyper-parameters, followed by custom_error3 on (X_test_scaled, y_test).
(
    gb_scaled_grid_search.best_params_,
    custom_error3(gb_scaled_grid_search, X_test_scaled, y_test),
)