In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score,confusion_matrix

# Preprocessed feature matrix, stored under the key 'X' of the .npz archive.
data = np.load('datos_preprocesados.npz')
X = data['X']

# Read the dataset CSV (skipping its first 322 lines) to obtain the
# regression target column, 'sy_pm'.
df = pd.read_csv('06_dataset.csv', skiprows=322)

# Missing target values are replaced by the most frequent value of the column.
# NOTE(review): imputing the *target* fabricates labels for rows without a
# measured value, and it is done before the split; consider dropping those
# rows from both X and y instead. This presumably relies on X being
# row-aligned with this CSV -- confirm against the preprocessing step.
target_mode = df['sy_pm'].mode()[0]
df['sy_pm'] = df['sy_pm'].fillna(target_mode)
y = df['sy_pm'].values

# Train/test split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Exhaustive grid search (5-fold CV, all available cores) over the random
# forest hyper-parameters.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
}
base_model = RandomForestRegressor(random_state=42)
grid = GridSearchCV(base_model, param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
print("Best params:", grid.best_params_)

# Evaluate on the held-out test set and persist the predictions to disk.
y_pred = grid.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE:", mse)
print("R2:", r2)
np.save('y_pred_model1.npy', y_pred)

# Binning for the confusion matrix.
#
# The quantile edges are computed ONCE, from the true values, and then applied
# to both the true values and the predictions. Binning each array with its own
# pd.qcut call would give every array its own edges, so bin k would cover a
# different value range for y_test than for y_pred; the matrix would then
# compare rank quartiles rather than actual value ranges, and with
# duplicates='drop' the two arrays could even end up with a different number
# of bins.
n_bins = 4
bin_edges = pd.qcut(y_test, q=n_bins, retbins=True, duplicates='drop')[1]

# Open up the outer edges so predictions falling outside the observed range of
# y_test are assigned to the first/last bin instead of becoming NaN.
bin_edges[0], bin_edges[-1] = -np.inf, np.inf

# duplicates='drop' may merge repeated edges, leaving fewer than n_bins bins.
n_actual_bins = len(bin_edges) - 1

y_test_binned = pd.cut(y_test, bins=bin_edges, labels=False)
y_pred_binned = pd.cut(y_pred, bins=bin_edges, labels=False)

# Confusion matrix; explicit labels keep the shape fixed at
# (n_actual_bins, n_actual_bins) even if some bin receives no samples.
cm = confusion_matrix(
    y_test_binned,
    y_pred_binned,
    labels=np.arange(n_actual_bins),
)
print(f"Matriz de confusion ({n_actual_bins} bins):\n", cm)

Best params: {'max_depth': 20, 'n_estimators': 200}
MSE: 406.47017211114115
R2: 0.9979863180605242
Confusion Matrix (with 4 bins):
 [[366   1   2   0]
 [  3 365   0   0]
 [  0   2 365   1]
 [  0   0   1 368]]
