In [1]:
# Data handling
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualization
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree

# Regression models and model evaluation
# -----------------------------------------------------------------------
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


from sklearn.model_selection import KFold,LeaveOneOut, cross_val_score


from sklearn.preprocessing import StandardScaler

from tqdm import tqdm


# Silence warnings
# -----------------------------------------------------------------------
# NOTE(review): this installs a blanket "ignore" filter, so every warning
# category is hidden for the rest of the session — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [32]:
# Read the CSV at the relative path below; its first column becomes the row index.
RUTA_DATOS = "../datos/online_shoppers_intention_clase_nonulls_estan_sinout_encoding.csv"
df = pd.read_csv(RUTA_DATOS, index_col=0)

In [8]:
# "PageValues" is the regression target; every other column of df is a predictor.
y = df["PageValues"]
X = df.drop("PageValues", axis=1)

# Shuffled hold-out split with a fixed seed so the partitions are repeatable.
# No test_size is given, so scikit-learn's default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    shuffle=True,
)

In [None]:
# Hyperparameter grid to explore when tuning the random forest:
# every combination of the values listed here is a candidate.
params_rf = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 4, 5],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_leaf_nodes=[4, 6, 8, 10, 20, 30],
)

In [None]:
# Base forest to be tuned; the seed is fixed so repeated runs build the same trees.
rf = RandomForestRegressor(bootstrap=True, random_state=42)

# Exhaustive search over params_rf with 3-fold cross-validation, ranking
# candidates by negated MSE and using every available core (n_jobs=-1).
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=params_rf,
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)

# Winning hyperparameter combination; it is the last expression of the cell
# so the notebook displays it.
best_params = grid_search_rf.best_params_
best_params

In [15]:
# Keep a handle on the estimator selected by the grid search
# (its `best_estimator_` attribute) for prediction and evaluation.
modelo_final_rf = grid_search_rf.best_estimator_

In [17]:
# No explicit `.fit` call here: `best_estimator_` only exists when GridSearchCV
# ran with refit enabled, and in that case scikit-learn has already refit the
# winning configuration on the full training data passed to the search.
# Fitting it again (same data, same fixed seed) would only repeat that work.
# NOTE(review): assumes X_train / y_train are still the data the grid search
# was fitted on — confirm if cells are ever run out of order.
y_train_pred = modelo_final_rf.predict(X_train)
y_test_pred = modelo_final_rf.predict(X_test)

In [29]:
# Constant "mean" baseline: for each split, predict the mean of that split's
# target for every row. It gives the model's errors a no-model reference.
# NOTE(review): the test baseline uses the mean of y_test itself, which would
# not be known at prediction time — consider y_mean_train if a deployable
# baseline is wanted.
#
# np.full with an explicit float dtype replaces np.full_like, which copies the
# dtype of its first argument and would silently truncate the mean whenever
# the target column has an integer dtype.

# Train split: mean of the target, then one copy of it per row.
y_mean_train = np.mean(y_train)
y_mean_pred_train = np.full(np.shape(y_train), y_mean_train, dtype=float)

# Test split: same construction on the held-out target.
y_mean_test = np.mean(y_test)
y_mean_pred_test = np.full(np.shape(y_test), y_mean_test, dtype=float)

In [30]:
# Score the model on both splits with the same set of metrics. Each entry pairs
# a split name with (true target, model predictions, constant-mean baseline).
# 'RMSE_mean' is the RMSE of the constant-mean baseline for that split.
metricas = {
    particion: {
        'r2_score': r2_score(reales, predichos),
        'MAE': mean_absolute_error(reales, predichos),
        'MSE': mean_squared_error(reales, predichos),
        'RMSE': np.sqrt(mean_squared_error(reales, predichos)),
        'RMSE_mean': np.sqrt(mean_squared_error(reales, base_media)),
    }
    for particion, reales, predichos, base_media in (
        ('train', y_train, y_train_pred, y_mean_pred_train),
        ('test', y_test, y_test_pred, y_mean_pred_test),
    )
}

# One row per split, one column per metric.
pd.DataFrame(metricas).T

Unnamed: 0,r2_score,MAE,MSE,RMSE,RMSE_mean
train,0.218712,6.877425,213.711228,14.618865,16.538953
test,0.092151,7.079813,233.272395,15.273258,16.02968


Probar eliminando 2 features: `InformationalDuration` e `Informational`

In [33]:
# Second experiment: same target, but "InformationalDuration" and
# "Informational" are removed from the predictors along with it.
y = df["PageValues"]
X = df.drop(["PageValues", "InformationalDuration", "Informational"], axis=1)

# Same shuffled, seeded hold-out split as before, now on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    shuffle=True,
)

In [34]:
# Definimos un espacio de parámetros para optimizar
params_rf_v2 = {
    'n_estimators': [90, 100, 110],
    'max_depth': [4,5,6,7],
    'max_features': [2,4,5,6,8,10,13],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [5,7,4], 
    'max_leaf_nodes': [4, 6, 8, 10, 20, 30]
}

In [None]:
# Fresh base forest for the second search, seeded so runs are repeatable.
rf = RandomForestRegressor(bootstrap=True, random_state=42)

# Exhaustive search over params_rf_v2 with 3-fold cross-validation, ranking
# candidates by negated MSE, using every available core and printing progress.
grid_search_rf_v2 = GridSearchCV(
    estimator=rf,
    param_grid=params_rf_v2,
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search_rf_v2.fit(X_train, y_train)

# Winning hyperparameter combination of the second search.
best_params_v2 = grid_search_rf_v2.best_params_


Fitting 3 folds for each of 4536 candidates, totalling 13608 fits


NameError: name 'best_params' is not defined

In [36]:
# Display the best hyperparameter combination found by the second grid search.
best_params_v2

{'max_depth': 7,
 'max_features': 5,
 'max_leaf_nodes': 30,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 110}

In [37]:
# Keep a handle on the estimator selected by the second grid search
# (its `best_estimator_` attribute) for prediction and evaluation.
modelo_final_rf_v2 = grid_search_rf_v2.best_estimator_

In [42]:
# No explicit `.fit` call here: `best_estimator_` only exists when GridSearchCV
# ran with refit enabled, and in that case scikit-learn has already refit the
# winning configuration on the full training data passed to the search.
# Fitting it again (same data, same fixed seed) would only repeat that work.
# NOTE(review): assumes X_train / y_train are still the data the grid search
# was fitted on — confirm if cells are ever run out of order.
y_train_pred = modelo_final_rf_v2.predict(X_train)
y_test_pred = modelo_final_rf_v2.predict(X_test)

In [43]:
# Constant "mean" baseline: for each split, predict the mean of that split's
# target for every row. It gives the model's errors a no-model reference.
# NOTE(review): the test baseline uses the mean of y_test itself, which would
# not be known at prediction time — consider y_mean_train if a deployable
# baseline is wanted.
#
# np.full with an explicit float dtype replaces np.full_like, which copies the
# dtype of its first argument and would silently truncate the mean whenever
# the target column has an integer dtype.

# Train split: mean of the target, then one copy of it per row.
y_mean_train = np.mean(y_train)
y_mean_pred_train = np.full(np.shape(y_train), y_mean_train, dtype=float)

# Test split: same construction on the held-out target.
y_mean_test = np.mean(y_test)
y_mean_pred_test = np.full(np.shape(y_test), y_mean_test, dtype=float)

In [47]:
# Score the second model on both splits with the same set of metrics. Each entry
# pairs a split name with (true target, model predictions, constant-mean baseline).
# 'MSE_mean' is the MSE of the constant-mean baseline for that split.
metricas = {
    particion: {
        'r2_score': r2_score(reales, predichos),
        'MAE': mean_absolute_error(reales, predichos),
        'MSE': mean_squared_error(reales, predichos),
        'MSE_mean': mean_squared_error(reales, base_media),
        'RMSE': np.sqrt(mean_squared_error(reales, predichos)),
    }
    for particion, reales, predichos, base_media in (
        ('train', y_train, y_train_pred, y_mean_pred_train),
        ('test', y_test, y_test_pred, y_mean_pred_test),
    )
}

# One row per split, one column per metric.
pd.DataFrame(metricas).T

Unnamed: 0,r2_score,MAE,MSE,MSE_mean,RMSE
train,0.22285,6.867287,212.579183,273.53696,14.580095
test,0.100369,7.097086,231.160815,256.950651,15.203974
