In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, GridSearchCV, cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

data = pd.read_csv('/home/nivelrios/documentos/Mburicao Project/src/data/data_extraction.csv')
# Copia del DataFrame original
df = data.copy()

# Convertir 'fecha' a datetime y descartar esa columna para el modelo
df['fecha'] = pd.to_datetime(df['fecha'], errors='coerce')
X = df.drop(columns=["fecha", "global_peak"])
y = df["global_peak"]

num_features = X.shape[1] Definir un pipeline que primero selecciona las K mejores features usando f_regression
# y luego entrena un modelo de regresión lineal.
pipeline = Pipeline([
    ('select', SelectKBest(score_func=f_regression)),
    ('model', LinearRegression())
])

# Definir la rejilla de parámetros para la búsqueda:
# 'select__k' indica el número de features a seleccionar.
# 'model__fit_intercept' define si se ajusta el intercepto del modelo o no.
param_grid = {
    'select__k': list(range(1, num_features + 1)) + ['all'],
    'model__fit_intercept': [True, False]
}

# Configurar validación cruzada con KFold (5 pliegues, con shuffle)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Configurar GridSearchCV para buscar la mejor combinación de hiperparámetros
grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='r2', n_jobs=-1, refit=True)
grid_search.fit(X, y)
best_model = grid_search.best_estimator_

print("Mejores parámetros encontrados:", grid_search.best_params_)

# --- Obtener las características seleccionadas ---
# Se accede al paso 'select' del pipeline y se utiliza get_support() para obtener la máscara booleana.
mask = best_model.named_steps['select'].get_support()
selected_features = X.columns[mask]
print("Features seleccionadas:", list(selected_features))

# Utilizar cross_val_predict para obtener predicciones fuera de la muestra usando el modelo óptimo
y_pred = cross_val_predict(best_model, X, y, cv=kf)

# Calcular las métricas de rendimiento
r2 = r2_score(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))
mape = mean_absolute_percentage_error(y, y_pred)*100

print(f"R²: {r2:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"MAPE: {mape:.3f}")

# Graficar valores reales vs. predichos
plt.figure(figsize=(8,6))
plt.scatter(y, y_pred, color='blue', alpha=0.7, label='Predicciones')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2, label='Ideal')
plt.xlabel('Valor Real')
plt.ylabel('Valor Predicho')
plt.title('Valor Real vs. Valor Predicho')
plt.legend()
plt.show()

SyntaxError: invalid syntax (2925056311.py, line 20)

In [None]:
X = df.drop(columns=["global_peak"])
y = df["global_peak"].values.reshape(-1, 1)  
if len(y.shape) == 1:
    y = y.reshape(-1, 1)
scaler_X = MinMaxScaler()  
scaler_y = MinMaxScaler()
X_normalized = scaler_X.fit_transform(X)
y_normalized = scaler_y.fit_transform(y)
model = LinearRegression()
rfecv = RFECV(estimator=model, step=1, cv=KFold(n_splits=5, shuffle=True, random_state=42),
              scoring='r2')

rfecv.fit(X_normalized, y_normalized)
selected_features = X.columns[rfecv.support_]     
print("Número óptimo de features:", rfecv.n_features_)
selected_features = X.columns[rfecv.support_]
print("Features seleccionadas:", list(selected_features))