In [1]:
import pandas as pd
import numpy as np
import datetime
from itertools import chain, combinations
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error


# Load the extracted dataset from CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it — consider a configurable
# data directory.
data = pd.read_csv('/home/nivelrios/documentos/Mburicao Project/src/data/data_extraction.csv')
# Copy of the original DataFrame, so `data` keeps the frame exactly as loaded
df = data.copy()
def all_subsets(iterable):
    """Return an iterator over every non-empty combination of the given items.

    The input is materialised once, so any iterable (including a one-shot
    iterator) is accepted. Combinations are yielded as tuples, grouped by
    increasing size: all 1-tuples first, then all 2-tuples, and so on up to
    the single tuple holding every item. An empty input yields nothing.
    """
    pool = tuple(iterable)

    def of_size(size):
        # All combinations of exactly `size` items, in input order
        return combinations(pool, size)

    return chain.from_iterable(map(of_size, range(1, len(pool) + 1)))

# The DataFrame `df` is expected to contain:
# - "global_peak" (target, float)
# - precipitation columns: "sil_10", "sil_20", ... "sil_80"
# - accumulated precipitation columns: "sil_accumulated_10", "sil_accumulated_20", ... "sil_accumulated_70"
#
# Windows 1 to 7 are evaluated: window 1 uses the features ending in "_10",
# window 2 those ending in "_20", and so on up to window 7 (70 min delay).

# 5-fold cross-validation, shuffled with a fixed seed so the folds are the
# same for every feature subset and on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The target does not depend on the window or on the feature subset, so it is
# extracted and normalised once here instead of once per subset.
# NOTE(review): both scalers are fitted on every row before the folds are
# split, so held-out rows influence the scaling. Wrapping scaler + model in a
# Pipeline would avoid that — confirm whether it matters for this analysis.
y = df["global_peak"].values.reshape(-1, 1)
scaler_y = MinMaxScaler()
y_norm = scaler_y.fit_transform(y)
# Ground truth back in original units. The scale/unscale round trip is kept so
# the metrics are computed from exactly the same values as before the hoist.
y_true = scaler_y.inverse_transform(y_norm)

results = []

# Iterate over windows 1 to 7
for window in range(1, 8):
    # Suffix in minutes: window 1 -> "_10", 2 -> "_20", ... window 7 -> "_70"
    suffix = f"_{window*10}"

    # Keep the precipitation columns for this delay. "sil_accumulated_*" also
    # starts with "sil_", so a single prefix check covers both families.
    candidate_features = [col for col in df.columns
                          if col != "global_peak"
                          and col.startswith("sil_")
                          and col.endswith(suffix)]

    if len(candidate_features) == 0:
        print(f"No se encontraron features para la ventana {window}")
        continue

    best_r2 = -np.inf
    best_subset = None
    best_metrics = None

    # Try every non-empty combination of the candidate features
    for subset in all_subsets(candidate_features):
        X = df[list(subset)]

        # Normalise the selected features to [0, 1]
        scaler_X = MinMaxScaler()
        X_norm = scaler_X.fit_transform(X)

        # Model: linear regression, scored on out-of-fold predictions
        model = LinearRegression()
        y_pred_norm = cross_val_predict(model, X_norm, y_norm, cv=kf)

        # Bring the predictions back to the target's original units
        y_pred = scaler_y.inverse_transform(y_pred_norm)

        # Compute metrics
        r2_val = r2_score(y_true, y_pred)
        rmse_val = np.sqrt(mean_squared_error(y_true, y_pred))
        mape_val = mean_absolute_percentage_error(y_true, y_pred)

        # Strict ">" keeps the first subset seen when two subsets tie on R2
        if r2_val > best_r2:
            best_r2 = r2_val
            best_subset = subset
            best_metrics = {"R2": r2_val, "RMSE": rmse_val, "MAPE": mape_val}

    results.append({
        "window": window,
        "delay_min": window*10,
        "features": best_subset,
        "metrics": best_metrics
    })

# Show the results for each window
for res in results:
    print(f"Ventana {res['window']} (retardo = {res['delay_min']} min):")
    print(f"  Mejor combinación de features: {res['features']}")
    print(f"  Métricas: R² = {res['metrics']['R2']:.4f}, RMSE = {res['metrics']['RMSE']:.4f}, MAPE = {res['metrics']['MAPE']:.4f}")
    print("-"*60)


Ventana 1 (retardo = 10 min):
  Mejor combinación de features: ('sil_accumulated_10',)
  Métricas: R² = 0.8065, RMSE = 0.3647, MAPE = 0.2766
------------------------------------------------------------
Ventana 2 (retardo = 20 min):
  Mejor combinación de features: ('sil_accumulated_20',)
  Métricas: R² = 0.8059, RMSE = 0.3653, MAPE = 0.2772
------------------------------------------------------------
Ventana 3 (retardo = 30 min):
  Mejor combinación de features: ('sil_accumulated_30',)
  Métricas: R² = 0.7924, RMSE = 0.3778, MAPE = 0.2785
------------------------------------------------------------
Ventana 4 (retardo = 40 min):
  Mejor combinación de features: ('sil_40',)
  Métricas: R² = 0.8039, RMSE = 0.3672, MAPE = 0.3343
------------------------------------------------------------
Ventana 5 (retardo = 50 min):
  Mejor combinación de features: ('sil_50',)
  Métricas: R² = 0.6139, RMSE = 0.5152, MAPE = 0.3672
------------------------------------------------------------
Ventana 6 (ret

In [2]:
import pandas as pd
import numpy as np
import datetime
from itertools import chain, combinations
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

def all_subsets(iterable):
    """Return an iterator over every non-empty combination of the given features.

    The input is materialised once, so any iterable (including a one-shot
    iterator) is accepted. Combinations are yielded as tuples, grouped by
    increasing size: all 1-tuples first, then all 2-tuples, and so on up to
    the single tuple holding every item. An empty input yields nothing.
    """
    pool = tuple(iterable)

    def of_size(size):
        # All combinations of exactly `size` items, in input order
        return combinations(pool, size)

    return chain.from_iterable(map(of_size, range(1, len(pool) + 1)))

# The DataFrame `df` is expected to contain:
# - "global_peak" (target, float)
# - precipitation features: "sil_10", "sil_20", ..., "sil_80"
# - accumulated precipitation features: "sil_accumulated_10", "sil_accumulated_20", ..., "sil_accumulated_70"

# 5-fold cross-validation, shuffled with a fixed seed so the folds are the
# same for every feature subset and on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The target does not depend on the window or on the feature subset, so it is
# extracted and normalised once here instead of once per subset.
# NOTE(review): both scalers are fitted on every row before the folds are
# split, so held-out rows influence the scaling. Wrapping scaler + model in a
# Pipeline would avoid that — confirm whether it matters for this analysis.
y = df["global_peak"].values.reshape(-1, 1)
scaler_y = MinMaxScaler()
y_norm = scaler_y.fit_transform(y)
# Ground truth back in original units. The scale/unscale round trip is kept so
# the metrics are computed from exactly the same values as before the hoist.
y_true = scaler_y.inverse_transform(y_norm)

results = []

# Iterate over windows: window i corresponds to a minimum delay of i*10 minutes.
# For example:
# - Window 1: features whose numeric part is >= 10:
#    "sil_10", "sil_20", ..., "sil_80" and "sil_accumulated_10", ..., "sil_accumulated_70"
# - Window 2: features with value >= 20, i.e. "sil_20", ..., "sil_80" and "sil_accumulated_20", ..., "sil_accumulated_70"
# - And so on up to window 7 (>= 70 min).
for window in range(1, 8):
    min_delay = window * 10  # in minutes
    candidate_features = []

    for col in df.columns:
        if col == "global_peak":
            continue
        # Pick the token that should hold the delay and the largest delay
        # accepted for that column family. The accumulated prefix is tested
        # first because those names also start with "sil_".
        if col.startswith("sil_accumulated_"):
            delay_token, max_delay = col.split("_")[2], 70
        elif col.startswith("sil_"):
            delay_token, max_delay = col.split("_")[1], 80
        else:
            continue
        try:
            num = int(delay_token)
        except ValueError:
            # Not a numeric delay (e.g. "sil_foo"): skip the column. Only
            # ValueError is caught so unrelated errors are no longer hidden.
            continue
        if min_delay <= num <= max_delay:
            candidate_features.append(col)

    if not candidate_features:
        print(f"No se encontraron features para la ventana {window}")
        continue

    best_r2 = -np.inf
    best_subset = None
    best_metrics = None

    # Explore every non-empty combination of this window's candidates.
    # all_subsets yields 2**n - 1 combinations for n candidates, so the
    # low-numbered windows (most candidates) dominate the running time.
    for subset in all_subsets(candidate_features):
        X = df[list(subset)]

        # Normalise the selected features to [0, 1]
        scaler_X = MinMaxScaler()
        X_norm = scaler_X.fit_transform(X)

        # Linear regression model, scored on out-of-fold predictions
        model = LinearRegression()
        y_pred_norm = cross_val_predict(model, X_norm, y_norm, cv=kf)

        # Bring the predictions back to the target's original units
        y_pred = scaler_y.inverse_transform(y_pred_norm)

        # Compute metrics
        r2_val = r2_score(y_true, y_pred)
        rmse_val = np.sqrt(mean_squared_error(y_true, y_pred))
        mape_val = mean_absolute_percentage_error(y_true, y_pred)

        # Strict ">" keeps the first subset seen when two subsets tie on R2
        if r2_val > best_r2:
            best_r2 = r2_val
            best_subset = subset
            best_metrics = {"R2": r2_val, "RMSE": rmse_val, "MAPE": mape_val}

    results.append({
        "window": window,
        "min_delay": min_delay,
        "features": best_subset,
        "metrics": best_metrics
    })

# Show the results for each window
for res in results:
    print(f"Ventana {res['window']} (mínimo delay = {res['min_delay']} min):")
    print(f"  Mejor combinación de features: {res['features']}")
    print(f"  Métricas: R² = {res['metrics']['R2']:.4f}, RMSE = {res['metrics']['RMSE']:.4f}, MAPE = {res['metrics']['MAPE']:.4f}")
    print("-"*60)


Ventana 1 (mínimo delay = 10 min):
  Mejor combinación de features: ('sil_50', 'sil_70', 'sil_accumulated_30', 'sil_accumulated_70')
  Métricas: R² = 0.8470, RMSE = 0.3243, MAPE = 0.2819
------------------------------------------------------------
Ventana 2 (mínimo delay = 20 min):
  Mejor combinación de features: ('sil_50', 'sil_70', 'sil_accumulated_30', 'sil_accumulated_70')
  Métricas: R² = 0.8470, RMSE = 0.3243, MAPE = 0.2819
------------------------------------------------------------
Ventana 3 (mínimo delay = 30 min):
  Mejor combinación de features: ('sil_50', 'sil_70', 'sil_accumulated_30', 'sil_accumulated_70')
  Métricas: R² = 0.8470, RMSE = 0.3243, MAPE = 0.2819
------------------------------------------------------------
Ventana 4 (mínimo delay = 40 min):
  Mejor combinación de features: ('sil_40', 'sil_60', 'sil_accumulated_60', 'sil_accumulated_70')
  Métricas: R² = 0.8209, RMSE = 0.3509, MAPE = 0.3127
------------------------------------------------------------
Ventana 