In [2]:
from sklearn.metrics import mean_squared_error, r2_score
import os
import pandas as pd

BASE_PATH = "C:/Users/aguil/OneDrive/Desktop/proyectos javier/regresion-lineal/models/excel"
TRAIN_PATHS = [
    "X_train_con_outliers.xlsx",
    "X_train_sin_outliers.xlsx",
    "X_train_con_outliers_norm.xlsx",
    "X_train_sin_outliers_norm.xlsx",
    "X_train_con_outliers_scal.xlsx",
    "X_train_sin_outliers_scal.xlsx"
]
TRAIN_DATASETS = []
for path in TRAIN_PATHS:
    TRAIN_DATASETS.append(
        # pd.read_excel(BASE_PATH + "/" + path)
        pd.read_excel(f"{BASE_PATH}/{path}")
        # pd.read_excel(os.path.join(BASE_PATH, path))
    )

TEST_PATHS = [
    "X_test_con_outliers.xlsx",
    "X_test_sin_outliers.xlsx",
    "X_test_con_outliers_norm.xlsx",
    "X_test_sin_outliers_norm.xlsx",
    "X_test_con_outliers_scal.xlsx",
    "X_test_sin_outliers_scal.xlsx"
]
TEST_DATASETS = []
for path in TEST_PATHS:
    TEST_DATASETS.append(
        pd.read_excel(f"{BASE_PATH}/{path}")
    )

y_train = pd.read_excel(f"{BASE_PATH}/y_train.xlsx")
y_test = pd.read_excel(f"{BASE_PATH}/y_test.xlsx")

In [3]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

results = []
for index, dataset in enumerate(TRAIN_DATASETS):
    print(f"Entrenando dataset {index}...")

    # Inicializar modelo de regresión
    model = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=None, positive=False)
    
    # Entrenar el modelo
    model.fit(dataset, y_train)
    
    # Predicciones
    y_pred_train = model.predict(dataset)
    y_pred_test = model.predict(TEST_DATASETS[index])

    # Calcular métricas de regresión
    train_mse = mean_squared_error(y_train, y_pred_train)
    test_mse = mean_squared_error(y_test, y_pred_test)
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)

    # Guardar resultados
    results.append(
        {
            "train_mse": train_mse,
            "test_mse": test_mse,
            "train_r2": train_r2,
            "test_r2": test_r2
        }
    )

# Mostrar resultados
results

Entrenando dataset 0...
Entrenando dataset 1...
Entrenando dataset 2...
Entrenando dataset 3...
Entrenando dataset 4...
Entrenando dataset 5...


[{'train_mse': 37005531.72811555,
  'test_mse': 35493102.611650534,
  'train_r2': 0.7297182858804965,
  'test_r2': 0.8068466322629111},
 {'train_mse': 121323209.24975131,
  'test_mse': 157906559.7384013,
  'train_r2': 0.11387721167135234,
  'test_r2': 0.14067293200683983},
 {'train_mse': 37005531.72811554,
  'test_mse': 35493102.61165054,
  'train_r2': 0.7297182858804965,
  'test_r2': 0.8068466322629111},
 {'train_mse': 121323209.24975131,
  'test_mse': 157906559.7384013,
  'train_r2': 0.11387721167135234,
  'test_r2': 0.14067293200683983},
 {'train_mse': 37005531.72811554,
  'test_mse': 35493102.6116505,
  'train_r2': 0.7297182858804965,
  'test_r2': 0.8068466322629113},
 {'train_mse': 121323209.24975131,
  'test_mse': 157906559.7384013,
  'train_r2': 0.11387721167135234,
  'test_r2': 0.14067293200683983}]

In [4]:
import numpy as np
df_results = pd.DataFrame(results)

# Calcular RMSE
df_results["train_rmse"] = np.sqrt(df_results["train_mse"])
df_results["test_rmse"] = np.sqrt(df_results["test_mse"])

# Seleccionar columnas relevantes para mostrar en tabla
df_results = df_results[["train_rmse", "test_rmse", "train_r2", "test_r2"]]

# Mostrar tabla
print(df_results)

     train_rmse     test_rmse  train_r2   test_r2
0   6083.217219   5957.608800  0.729718  0.806847
1  11014.681532  12566.087686  0.113877  0.140673
2   6083.217219   5957.608800  0.729718  0.806847
3  11014.681532  12566.087686  0.113877  0.140673
4   6083.217219   5957.608800  0.729718  0.806847
5  11014.681532  12566.087686  0.113877  0.140673


In [5]:
from sklearn.linear_model import Lasso
best_dataset_idx = 0
alpha = 0.1
lasso_model = Lasso(alpha = alpha, max_iter = 5000, random_state = 42)
lasso_model.fit(TRAIN_DATASETS[best_dataset_idx], y_train)
y_pred_train = lasso_model.predict(TRAIN_DATASETS[best_dataset_idx])
y_pred_test = lasso_model.predict(TEST_DATASETS[best_dataset_idx])
mse_train = mean_squared_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)
print(mse_train)
print(r2_train)
print(mse_test)
print(r2_test)

37005531.856725805
0.7297182849411504
35494032.00772592
0.8068415744920003


In [6]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
ridge_model = Ridge(alpha = 0.5, random_state= 42)
param_grid = {
    "alpha": [0.01, 0.1, 1, 10, 100],
    "max_iter": [100, 300, 1000, 2000, 4000, 8000, 12000],
    "tol": np.linspace(0.0001, 0.01, 100),
}
grid_ridge = GridSearchCV(estimator = ridge_model, param_grid = param_grid, scoring = 'r2', cv = 5, n_jobs = -1)
ridge_model.fit(TRAIN_DATASETS[best_dataset_idx], y_train)
y_pred_train = ridge_model.predict(TRAIN_DATASETS[best_dataset_idx])
y_pred_test = ridge_model.predict(TEST_DATASETS[best_dataset_idx])
results_ridge = []
results_ridge.append({
        "train_r2": r2_score(y_train, y_pred_train),
        "train_mse": mean_squared_error(y_train, y_pred_train),
        "test_r2": r2_score(y_test, y_pred_test),
        "test_mse": mean_squared_error(y_test, y_pred_test),
    })
df_ridge = pd.DataFrame(results_ridge)
print(df_ridge)

   train_r2     train_mse   test_r2      test_mse
0  0.729713  3.700626e+07  0.806376  3.557959e+07


In [7]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
elastic_model = ElasticNet(alpha = 0.5, random_state= 42)
param_grid_elastic = {
    "alpha": [0.01, 0.1, 1, 10],
    "l1_ratio": [0.1, 0.5],
    "max_iter": [100, 300, 1000, 2000],
    "tol": np.linspace(0.0001, 0.01, 10),
}
grid_elastic = GridSearchCV(estimator = elastic_model, param_grid = param_grid_elastic, scoring = 'r2', cv = 5, n_jobs = -1)
grid_elastic.fit(TRAIN_DATASETS[best_dataset_idx], y_train)
y_pred_train = grid_elastic.predict(TRAIN_DATASETS[best_dataset_idx])
y_pred_test = grid_elastic.predict(TEST_DATASETS[best_dataset_idx])
results_elastic = []
results_elastic.append({
        "train_r2": r2_score(y_train, y_pred_train),
        "train_mse": mean_squared_error(y_train, y_pred_train),
        "test_r2": r2_score(y_test, y_pred_test),
        "test_mse": mean_squared_error(y_test, y_pred_test),
    })
df_elastic = pd.DataFrame(results_elastic)
print(df_elastic)

   train_r2     train_mse   test_r2      test_mse
0  0.729143  3.708425e+07  0.801527  3.647071e+07
