## Libraries

In [None]:
# For handling data
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer
import csv

# For plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# For machine learning
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor

from functions import custom_error, custom_error_for_large_datasets, save_submission

In [None]:
# Directory prefix of the project data; file names are appended to it.
path = './DSL_Winter_Project_2024/'

# Load the train split first, then the test split, into DataFrames.
df_train, df_test = (
    pd.read_csv(path + csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Features: every column up to and including 'rms[17]' (label slices in
# .loc are inclusive). Targets: the 'x' and 'y' columns.
X_train = df_train.loc[:, :'rms[17]']
y_train = df_train[['x', 'y']]

X_test = df_test.loc[:, :'rms[17]']
y_test = df_test[['x', 'y']]

In [None]:
# Standard scaler: a single-step pipeline that only standardizes the features.
scaler_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

# PCA: standardize first, then keep 73 principal components.
pca_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=73)),
])

In [None]:
# Each pipeline is fitted on the training features only and then re-used,
# already fitted, to transform the test features. Tuple elements are
# evaluated left to right, so the fit always happens before the transform.
X_train_scaled, X_test_scaled = (
    scaler_pipeline.fit_transform(X_train),
    scaler_pipeline.transform(X_test),
)

X_train_pca, X_test_pca = (
    pca_pipeline.fit_transform(X_train),
    pca_pipeline.transform(X_test),
)

## Param grid of the different estimators

In [None]:
# Keys carry the 'estimator__' prefix because every model is wrapped in a
# MultiOutputRegressor, whose inner model is exposed as `estimator`.

# Ensemble sizes tried for both tree-based models; each grid gets its own
# list so that editing one grid never changes the other.
_tree_counts = (10, 100, 500, 1000, 2000)

rf_param_grid = {'estimator__n_estimators': list(_tree_counts)}

gb_param_grid = {'estimator__n_estimators': list(_tree_counts)}

knn_param_grid = {'estimator__n_neighbors': [1, 10, 50, 100, 1000, 5000]}

svr_param_grid = {
    'estimator__C': [0.1, 1, 10, 100],
    'estimator__gamma': [0.001, 0.01, 0.1, 1],
}

## Custom score (mean of the euclidean distance)

In [None]:
# Wrap the project's error function as a scikit-learn scorer.
# greater_is_better=False marks it as a loss, so the scorer returns the
# negated value of the error function.
scorer = make_scorer(
    score_func=custom_error_for_large_datasets,
    greater_is_better=False,
)

## Random forest
- Using ```train.csv```

In [None]:
# 5-fold grid search over the forest size on the raw features.
# MultiOutputRegressor fits one random forest per target column.
# random_state pins the forests' random sampling so that a Restart-and-Run-All
# reproduces the same CV ranking, best_params_ and test score.
# Note: candidates are ranked by negative MSE here, whereas the test-set
# evaluation uses the custom `scorer`.
rf_grid_search = GridSearchCV(
    MultiOutputRegressor(RandomForestRegressor(random_state=42)),
    rf_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
rf_grid_search.fit(X_train, y_train)

In [None]:
# Best CV hyper-parameters and the custom score on the test split.
# `scorer` uses greater_is_better=False, so the number shown is the negated error.
(
    rf_grid_search.best_params_,
    scorer(rf_grid_search, X_test, y_test),
)

- Using the PCA-transformed features (`X_train_pca`)

In [None]:
# 5-fold grid search over the forest size on the PCA-transformed features.
# MultiOutputRegressor fits one random forest per target column.
# random_state pins the forests' random sampling so that a Restart-and-Run-All
# reproduces the same CV ranking, best_params_ and test score.
# Note: candidates are ranked by negative MSE here, whereas the test-set
# evaluation uses the custom `scorer`.
rf_pca_grid_search = GridSearchCV(
    MultiOutputRegressor(RandomForestRegressor(random_state=42)),
    rf_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
rf_pca_grid_search.fit(X_train_pca, y_train)

In [None]:
# Best CV hyper-parameters and the custom score on the PCA-transformed test split.
# `scorer` uses greater_is_better=False, so the number shown is the negated error.
(
    rf_pca_grid_search.best_params_,
    scorer(rf_pca_grid_search, X_test_pca, y_test),
)

- Using the standardized features (`X_train_scaled`)

In [None]:
# 5-fold grid search over the forest size on the standardized features.
# MultiOutputRegressor fits one random forest per target column.
# random_state pins the forests' random sampling so that a Restart-and-Run-All
# reproduces the same CV ranking, best_params_ and test score.
# Note: candidates are ranked by negative MSE here, whereas the test-set
# evaluation uses the custom `scorer`.
rf_scaled_grid_search = GridSearchCV(
    MultiOutputRegressor(RandomForestRegressor(random_state=42)),
    rf_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
rf_scaled_grid_search.fit(X_train_scaled, y_train)

In [None]:
# Best CV hyper-parameters and the custom score on the standardized test split.
# `scorer` uses greater_is_better=False, so the number shown is the negated error.
(
    rf_scaled_grid_search.best_params_,
    scorer(rf_scaled_grid_search, X_test_scaled, y_test),
)

## Gradient boosting
- Using ```train.csv```

In [None]:
# 5-fold grid search over the number of boosting stages on the raw features.
# MultiOutputRegressor fits one gradient-boosting model per target column.
# random_state fixes the seed used by the boosted trees so that a
# Restart-and-Run-All reproduces the same CV ranking and test score.
# Note: candidates are ranked by negative MSE here, whereas the test-set
# evaluation uses the custom `scorer`.
gb_grid_search = GridSearchCV(
    MultiOutputRegressor(GradientBoostingRegressor(random_state=42)),
    gb_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
gb_grid_search.fit(X_train, y_train)

In [None]:
# Best CV hyper-parameters and the custom score on the test split.
# `scorer` uses greater_is_better=False, so the number shown is the negated error.
(
    gb_grid_search.best_params_,
    scorer(gb_grid_search, X_test, y_test),
)

- Using the PCA-transformed features (`X_train_pca`)

In [None]:
# 5-fold grid search over the number of boosting stages on the
# PCA-transformed features.
# MultiOutputRegressor fits one gradient-boosting model per target column.
# random_state fixes the seed used by the boosted trees so that a
# Restart-and-Run-All reproduces the same CV ranking and test score.
# Note: candidates are ranked by negative MSE here, whereas the test-set
# evaluation uses the custom `scorer`.
gb_pca_grid_search = GridSearchCV(
    MultiOutputRegressor(GradientBoostingRegressor(random_state=42)),
    gb_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
gb_pca_grid_search.fit(X_train_pca, y_train)

In [None]:
# Best CV hyper-parameters and the custom score on the PCA-transformed test split.
# `scorer` uses greater_is_better=False, so the number shown is the negated error.
(
    gb_pca_grid_search.best_params_,
    scorer(gb_pca_grid_search, X_test_pca, y_test),
)

- Using the standardized features (`X_train_scaled`)

In [None]:
# 5-fold grid search over the number of boosting stages on the
# standardized features.
# MultiOutputRegressor fits one gradient-boosting model per target column.
# random_state fixes the seed used by the boosted trees so that a
# Restart-and-Run-All reproduces the same CV ranking and test score.
# Note: candidates are ranked by negative MSE here, whereas the test-set
# evaluation uses the custom `scorer`.
gb_scaled_grid_search = GridSearchCV(
    MultiOutputRegressor(GradientBoostingRegressor(random_state=42)),
    gb_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
gb_scaled_grid_search.fit(X_train_scaled, y_train)

In [None]:
# Best CV hyper-parameters and the custom score on the standardized test split.
# `scorer` uses greater_is_better=False, so the number shown is the negated error.
(
    gb_scaled_grid_search.best_params_,
    scorer(gb_scaled_grid_search, X_test_scaled, y_test),
)

## SVR
- Using ```train.csv```

In [None]:
# 5-fold grid search over C and gamma on the raw features.
# MultiOutputRegressor fits one SVR per target column; candidates are
# ranked by negative mean squared error.
svr_grid_search = GridSearchCV(
    estimator=MultiOutputRegressor(SVR()),
    param_grid=svr_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
svr_grid_search.fit(X_train, y_train)

In [None]:
# Best CV hyper-parameters and the custom score on the test split.
# `scorer` uses greater_is_better=False, so the number shown is the negated error.
(
    svr_grid_search.best_params_,
    scorer(svr_grid_search, X_test, y_test),
)

- Using the PCA-transformed features (`X_train_pca`)

In [None]:
# 5-fold grid search over C and gamma on the PCA-transformed features.
# MultiOutputRegressor fits one SVR per target column; candidates are
# ranked by negative mean squared error.
svr_pca_grid_search = GridSearchCV(
    estimator=MultiOutputRegressor(SVR()),
    param_grid=svr_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
svr_pca_grid_search.fit(X_train_pca, y_train)

In [None]:
# Best CV hyper-parameters and the custom score on the PCA-transformed test split.
# `scorer` uses greater_is_better=False, so the number shown is the negated error.
(
    svr_pca_grid_search.best_params_,
    scorer(svr_pca_grid_search, X_test_pca, y_test),
)

- Using the standardized features (`X_train_scaled`)

In [None]:
# 5-fold grid search over C and gamma on the standardized features.
# MultiOutputRegressor fits one SVR per target column; candidates are
# ranked by negative mean squared error.
svr_scaled_grid_search = GridSearchCV(
    estimator=MultiOutputRegressor(SVR()),
    param_grid=svr_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
svr_scaled_grid_search.fit(X_train_scaled, y_train)

In [None]:
# Best CV hyper-parameters and the custom score on the standardized test split.
# `scorer` uses greater_is_better=False, so the number shown is the negated error.
(
    svr_scaled_grid_search.best_params_,
    scorer(svr_scaled_grid_search, X_test_scaled, y_test),
)

## KNN
- Using ```train.csv```

In [None]:
# 5-fold grid search over the neighbourhood size on the raw features.
# MultiOutputRegressor fits one k-NN regressor per target column;
# candidates are ranked by negative mean squared error.
knn_grid_search = GridSearchCV(
    estimator=MultiOutputRegressor(KNeighborsRegressor()),
    param_grid=knn_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
knn_grid_search.fit(X_train, y_train)

In [None]:
# Best CV hyper-parameters and the custom score on the test split.
# `scorer` uses greater_is_better=False, so the number shown is the negated error.
(
    knn_grid_search.best_params_,
    scorer(knn_grid_search, X_test, y_test),
)

- Using the PCA-transformed features (`X_train_pca`)

In [None]:
# 5-fold grid search over the neighbourhood size on the PCA-transformed
# features. MultiOutputRegressor fits one k-NN regressor per target column;
# candidates are ranked by negative mean squared error.
knn_pca_grid_search = GridSearchCV(
    estimator=MultiOutputRegressor(KNeighborsRegressor()),
    param_grid=knn_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
knn_pca_grid_search.fit(X_train_pca, y_train)

In [None]:
# Best CV hyper-parameters and the custom score on the PCA-transformed test split.
# `scorer` uses greater_is_better=False, so the number shown is the negated error.
(
    knn_pca_grid_search.best_params_,
    scorer(knn_pca_grid_search, X_test_pca, y_test),
)

- Using the standardized features (`X_train_scaled`)

In [None]:
# 5-fold grid search over the neighbourhood size on the standardized
# features. MultiOutputRegressor fits one k-NN regressor per target column;
# candidates are ranked by negative mean squared error.
knn_scaled_grid_search = GridSearchCV(
    estimator=MultiOutputRegressor(KNeighborsRegressor()),
    param_grid=knn_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
knn_scaled_grid_search.fit(X_train_scaled, y_train)

In [None]:
# Best CV hyper-parameters and the custom score on the standardized test split.
# `scorer` uses greater_is_better=False, so the number shown is the negated error.
(
    knn_scaled_grid_search.best_params_,
    scorer(knn_scaled_grid_search, X_test_scaled, y_test),
)