## Libraries

In [1]:
# For handling data
import numpy as np
import pandas as pd

# For plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# For machine learning
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV
from functions import custom_error

In [2]:
path = './DSL_Winter_Project_2024/'
df_without_otiliers_pca = pd.read_csv(path + 'train_without_outliers_pca.csv')
df_without_scaler=pd.read_csv(path + 'train_without_outliers_scaled.csv')
df_without_pca=pd.read_csv(path + 'train_without_outliers.csv')

In [3]:
df_without_otiliers_pca

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC66,PC67,PC68,PC69,PC70,PC71,PC72,PC73,x,y
0,1.737486,-2.731021,1.003991,-0.749037,0.343120,0.137300,-0.636270,1.110771,0.716239,-0.910031,...,-0.434828,0.097037,-0.538475,-0.391294,0.255213,-0.158027,0.119743,0.068547,425.0,285.0
1,4.065744,-2.786316,2.362573,0.100823,-3.157451,-0.773902,-0.613166,1.171056,-1.146384,0.220550,...,1.305885,0.227704,0.129430,0.120253,0.704566,0.516766,-0.532212,0.085923,575.0,250.0
2,-1.150287,-4.763248,-2.610161,-2.203354,0.662713,-0.600236,-2.152416,-0.909347,-0.245220,1.599408,...,-0.407897,-0.104403,-0.314425,0.310318,0.068415,-0.420980,-0.171047,-0.072021,245.0,230.0
3,-1.861710,4.349421,-0.142349,0.402060,-1.094392,6.480412,-1.559656,-0.020604,-1.249346,-1.369321,...,0.687174,-0.034180,1.320186,-0.667891,0.400407,-0.347652,-0.463541,1.118421,395.0,555.0
4,-4.101982,1.267833,2.009555,-0.822039,0.393089,0.163441,0.653145,0.437987,1.574366,-0.093781,...,0.168010,-0.163811,0.932782,0.929322,-0.015037,-1.069594,-0.199652,-0.251304,275.0,490.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267606,4.308645,0.943745,0.173160,0.638737,-0.904907,-1.192973,2.258960,0.121981,-1.075240,-1.107762,...,0.055056,-0.511925,0.015117,0.091863,-0.000452,0.407204,0.567554,-0.640532,580.0,390.0
267607,4.802572,-2.544687,2.720127,2.083392,-3.946778,-3.476563,0.308218,0.347116,-1.853717,5.699646,...,0.450210,0.233386,1.010854,-0.238552,0.103711,1.291865,-0.655887,-0.371007,580.0,285.0
267608,-2.871135,3.626576,1.294119,0.568461,-0.734781,-0.577068,-0.642085,0.240285,-0.551313,-0.299072,...,0.670614,0.043020,0.249646,0.551378,0.226044,0.939201,-0.258860,-0.573653,360.0,550.0
267609,-0.552108,-1.180191,0.466246,1.071564,2.974300,-0.606148,-1.863909,-2.358189,1.045945,-0.040700,...,-0.140421,-0.528426,0.756357,0.186438,0.309575,0.860150,0.309954,-0.068060,325.0,355.0


In [4]:
#Split the datasets in X and y
X_train = df_without_otiliers_pca.loc[:,:'PC60']
y = df_without_otiliers_pca[['x', 'y']]

In [5]:
param_grid_rf = {
    'estimator__n_estimators': [10, 100, 500, 1000],
    'estimator__max_depth': [10, 100, 500, None],
}

In [6]:
X_train

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC51,PC52,PC53,PC54,PC55,PC56,PC57,PC58,PC59,PC60
0,1.737486,-2.731021,1.003991,-0.749037,0.343120,0.137300,-0.636270,1.110771,0.716239,-0.910031,...,0.349431,1.677243,0.045026,0.058968,0.644882,-0.334647,-0.525434,-0.078617,-0.148611,0.553067
1,4.065744,-2.786316,2.362573,0.100823,-3.157451,-0.773902,-0.613166,1.171056,-1.146384,0.220550,...,-0.331855,-0.110009,-0.802279,1.627539,-2.065427,1.019473,0.366251,1.418207,0.139856,-1.455523
2,-1.150287,-4.763248,-2.610161,-2.203354,0.662713,-0.600236,-2.152416,-0.909347,-0.245220,1.599408,...,0.342383,-0.123345,0.615184,2.297369,0.338536,-1.170694,0.126134,2.126381,0.759301,-1.156468
3,-1.861710,4.349421,-0.142349,0.402060,-1.094392,6.480412,-1.559656,-0.020604,-1.249346,-1.369321,...,-1.408002,0.677241,0.212914,-1.061706,0.611651,1.527259,1.032460,-0.630776,-0.544068,0.245981
4,-4.101982,1.267833,2.009555,-0.822039,0.393089,0.163441,0.653145,0.437987,1.574366,-0.093781,...,-0.271198,-0.397106,0.796185,-0.516639,0.071735,0.195821,1.177861,-0.474617,0.370597,-0.871449
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267606,4.308645,0.943745,0.173160,0.638737,-0.904907,-1.192973,2.258960,0.121981,-1.075240,-1.107762,...,-1.353034,0.674847,0.706876,-0.839135,0.818072,1.540475,-1.079323,-0.081620,0.514737,-0.517755
267607,4.802572,-2.544687,2.720127,2.083392,-3.946778,-3.476563,0.308218,0.347116,-1.853717,5.699646,...,-1.313277,0.566303,-0.120462,0.978422,-0.904991,3.412659,7.444010,0.410137,-1.003981,0.224835
267608,-2.871135,3.626576,1.294119,0.568461,-0.734781,-0.577068,-0.642085,0.240285,-0.551313,-0.299072,...,-0.017882,-0.066101,-0.611579,1.154850,-0.702385,-0.290137,-0.602963,0.142659,-2.428341,0.337367
267609,-0.552108,-1.180191,0.466246,1.071564,2.974300,-0.606148,-1.863909,-2.358189,1.045945,-0.040700,...,0.219699,-0.944044,0.452017,-0.432191,-0.268575,-0.658749,-1.058882,-0.977207,0.704415,-0.824268


In [7]:
MultiOutputRegressor(RandomForestRegressor(random_state=0)).fit(X_train, y)

In [8]:
rf_train = MultiOutputRegressor(RandomForestRegressor())
rf_grid_search = GridSearchCV(rf_train, param_grid_rf, cv=5, scoring=custom_error)

In [10]:
y_pred=rf_grid_search.fit(X_train, y)

Traceback (most recent call last):
  File "c:\Users\alejandrs\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\UdeA\Lab\Final project\functions.py", line 5, in custom_error
    return np.mean(np.diag(euclidean_distances(y_test, y_pred)))
  File "c:\Users\alejandrs\anaconda3\lib\site-packages\sklearn\metrics\pairwise.py", line 328, in euclidean_distances
    return _euclidean_distances(X, Y, X_norm_squared, Y_norm_squared, squared)
  File "c:\Users\alejandrs\anaconda3\lib\site-packages\sklearn\metrics\pairwise.py", line 369, in _euclidean_distances
    distances = -2 * safe_sparse_dot(X, Y.T, dense_output=True)
  File "c:\Users\alejandrs\anaconda3\lib\site-packages\sklearn\utils\extmath.py", line 152, in safe_sparse_dot
    ret = a @ b
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 21.3 GiB for an array with shape (53523, 53523) and data type float64

Traceback (most recent ca