# 1. Libraries

In [13]:
import pandas as pd
import numpy as np
import datetime
import time
import os
from itertools import chain, combinations
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

# 2. Data load

In [4]:
# Read the extracted dataset from disk. `data` keeps the frame exactly as it
# was loaded; `df` is an independent deep copy used by the cells that follow.
data = pd.read_csv(
    '/home/nivelrios/documentos/Mburicao Project/src/data/data_extraction.csv'
)
df = data.copy(deep=True)

# 3. Finding the best features for each window 

In [9]:
def all_subsets(iterable):
    """Lazily enumerate every non-empty combination of the given items.

    The input is materialised into a list once, at call time. Combinations are
    produced as tuples, grouped by increasing size (all singletons first, then
    all pairs, ... up to the full set); within each size they follow the order
    produced by ``itertools.combinations``.

    Parameters
    ----------
    iterable : iterable
        Items to combine. An empty iterable yields nothing.

    Returns
    -------
    itertools.chain
        Iterator over tuples, one per non-empty subset.
    """
    items = list(iterable)
    sizes = range(1, len(items) + 1)
    return chain.from_iterable(map(lambda size: combinations(items, size), sizes))

In [11]:
# Largest window index explored by the search loop (windows run 1..WINDOW_LIMIT).
WINDOW_LIMIT = 7

# Cross-validation scheme shared by every candidate model: K shuffled folds
# with a fixed seed so the fold assignment is the same on every run.
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=42)

# One accumulator per selection criterion; each receives one record per window.
results_r2, results_rmse, results_mape = [], [], []

# Upper bounds (minutes) on the delay encoded in a column name for the column
# to be accepted as a candidate feature, one bound per feature family.
MAX_DELAY_SIL = 80
MAX_DELAY_SIL_ACCUMULATED = 70


def _feature_delay(col):
    """Return the integer delay encoded in a ``sil_*`` column name, or ``None``.

    ``sil_<n>...`` yields ``n`` (second underscore-separated token) and
    ``sil_accumulated_<n>...`` yields ``n`` (third token). Columns that do not
    start with ``sil_``, non-string column labels, and names whose token is
    missing or not an integer all yield ``None``.
    """
    if not isinstance(col, str) or not col.startswith("sil_"):
        return None
    token_index = 2 if col.startswith("sil_accumulated_") else 1
    try:
        return int(col.split("_")[token_index])
    except (ValueError, IndexError):
        # Only malformed names are ignored; anything else (e.g. a
        # KeyboardInterrupt during a long run) must propagate.
        return None


def _max_delay_for(col):
    """Return the upper delay bound that applies to the column's family."""
    if col.startswith("sil_accumulated_"):
        return MAX_DELAY_SIL_ACCUMULATED
    return MAX_DELAY_SIL


def _evaluate_subset(frame, subset, y_norm, y_true, scaler_y, cv):
    """Cross-validate a linear regression on ``subset`` and return its metrics.

    Features are min-max scaled, out-of-fold predictions are obtained with
    ``cross_val_predict`` and mapped back to the target's original scale with
    ``scaler_y`` before computing R2, RMSE and MAPE against ``y_true``.

    NOTE(review): as in the original code, the scalers are fitted on all rows
    before cross-validation. For plain least squares with an intercept this is
    not expected to change the predictions, but move the scaling into a
    Pipeline if the estimator is ever replaced -- confirm before relying on it.
    """
    X_norm = MinMaxScaler().fit_transform(frame[list(subset)])
    y_pred_norm = cross_val_predict(LinearRegression(), X_norm, y_norm, cv=cv)
    y_pred = scaler_y.inverse_transform(y_pred_norm)
    return {
        "R2": r2_score(y_true, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "MAPE": mean_absolute_percentage_error(y_true, y_pred),
    }


def _result_row(window, entry, metric_order):
    """Build one result record; metrics are inserted in ``metric_order``."""
    row = {
        "window": window,
        "n_features": len(entry["subset"]),
        "best_features": entry["subset"],
    }
    row.update((name, entry["metrics"][name]) for name in metric_order)
    return row


# The target does not depend on the window or on the feature subset, so it is
# scaled once here instead of once per evaluated subset.
y_val = df["global_peak"].values.reshape(-1, 1)
scaler_y = MinMaxScaler()
y_norm = scaler_y.fit_transform(y_val)
y_true = scaler_y.inverse_transform(y_norm)

for window in range(1, WINDOW_LIMIT + 1):
    min_delay = window * 10  # in minutes
    print(f"Procesando ventana {window} (min_delay = {min_delay} min)...")
    start_time = time.time()

    # Candidate features: sil_* columns whose delay lies in
    # [min_delay, family bound]. Anchors are the candidates sitting exactly at
    # min_delay; every evaluated subset must contain at least one of them.
    candidate_features = []
    anchor_features = set()
    for col in df.columns:
        if col == "global_peak":
            continue
        delay = _feature_delay(col)
        if delay is None:
            continue
        if min_delay <= delay <= _max_delay_for(col):
            candidate_features.append(col)
            if delay == min_delay:
                anchor_features.add(col)

    if len(candidate_features) == 0:
        print(f"No se encontraron features para la ventana {window}")
        continue

    if not anchor_features:
        # Without a feature at exactly min_delay no subset qualifies; skip the
        # window instead of enumerating 2^n subsets and failing afterwards.
        print(f"No se encontraron features con retardo {min_delay} min para la ventana {window}")
        continue

    # Best subset found so far under each criterion (R2 is maximised, the two
    # error metrics are minimised). Strict comparisons keep the first subset
    # encountered on ties.
    best = {
        "R2": {"score": -np.inf, "subset": None, "metrics": None},
        "RMSE": {"score": np.inf, "subset": None, "metrics": None},
        "MAPE": {"score": np.inf, "subset": None, "metrics": None},
    }

    for subset in all_subsets(candidate_features):
        if anchor_features.isdisjoint(subset):
            continue

        metrics = _evaluate_subset(df, subset, y_norm, y_true, scaler_y, kf)

        for name, entry in best.items():
            score = metrics[name]
            improved = score > entry["score"] if name == "R2" else score < entry["score"]
            if improved:
                entry.update(score=score, subset=subset, metrics=metrics)

    elapsed_time = time.time() - start_time
    print(f"Ventana {window} procesada en {elapsed_time:.2f} segundos.")

    if any(entry["metrics"] is None for entry in best.values()):
        # Can only happen if no evaluated subset produced a comparable
        # (non-NaN) score; do not record a partial result for this window.
        print(f"No se pudo evaluar ningún subconjunto válido para la ventana {window}")
        continue

    results_r2.append(_result_row(window, best["R2"], ("R2", "RMSE", "MAPE")))
    results_rmse.append(_result_row(window, best["RMSE"], ("RMSE", "R2", "MAPE")))
    results_mape.append(_result_row(window, best["MAPE"], ("MAPE", "R2", "RMSE")))

# Tabulate the per-window winners. The three tables share the same leading
# columns and differ only in the order of the metric columns, which puts the
# metric used for selection first.
_LEADING_COLUMNS = ["window", "n_features", "best_features"]

df_results_r2 = pd.DataFrame(
    results_r2, columns=_LEADING_COLUMNS + ["R2", "RMSE", "MAPE"]
)
df_results_rmse = pd.DataFrame(
    results_rmse, columns=_LEADING_COLUMNS + ["RMSE", "R2", "MAPE"]
)
df_results_mape = pd.DataFrame(
    results_mape, columns=_LEADING_COLUMNS + ["MAPE", "R2", "RMSE"]
)

# Print each table under its heading, in the order R2, RMSE, MAPE.
for _titulo, _tabla in (
    ("\nResultados basados en mejor R²:", df_results_r2),
    ("\nResultados basados en mejor RMSE:", df_results_rmse),
    ("\nResultados basados en mejor MAPE:", df_results_mape),
):
    print(_titulo)
    print(_tabla)

Procesando ventana 1 (min_delay = 10 min)...
Ventana 1 procesada en 217.61 segundos.
Procesando ventana 2 (min_delay = 20 min)...
Ventana 2 procesada en 54.26 segundos.
Procesando ventana 3 (min_delay = 30 min)...
Ventana 3 procesada en 13.52 segundos.
Procesando ventana 4 (min_delay = 40 min)...
Ventana 4 procesada en 3.37 segundos.
Procesando ventana 5 (min_delay = 50 min)...
Ventana 5 procesada en 0.84 segundos.
Procesando ventana 6 (min_delay = 60 min)...
Ventana 6 procesada en 0.21 segundos.
Procesando ventana 7 (min_delay = 70 min)...
Ventana 7 procesada en 0.05 segundos.

Resultados basados en mejor R²:
   window  n_features                                      best_features  \
0       1           4  (sil_70, sil_80, sil_accumulated_10, sil_accum...   
1       2           4  (sil_50, sil_70, sil_accumulated_20, sil_accum...   
2       3           4  (sil_50, sil_70, sil_accumulated_30, sil_accum...   
3       4           4  (sil_40, sil_60, sil_accumulated_60, sil_accum...   
4 

# 4. Save datasets

In [14]:
# Write the three result tables as CSV files (without the index column) into
# the evaluation folder, creating the folder first if it does not exist.
ruta = "/home/nivelrios/documentos/Mburicao Project/src/evaluation"
os.makedirs(ruta, exist_ok=True)

for _nombre_archivo, _tabla in (
    ("LR_best_features_r2.csv", df_results_r2),
    ("LR_best_features_rmse.csv", df_results_rmse),
    ("LR_best_features_mape.csv", df_results_mape),
):
    archivo = os.path.join(ruta, _nombre_archivo)
    _tabla.to_csv(archivo, index=False)