# 1. Libraries

In [45]:
import pandas as pd
import numpy as np
import datetime
import time
import os
from itertools import chain, combinations
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# 2. Data load

In [46]:
data = pd.read_csv('/home/nivelrios/documentos/Mburicao Project/src/data/data_for_clustering_2.csv')
df = data.copy()

# 3. Finding the best features for each window 

In [47]:
def all_subsets(iterable):
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(1, len(s)+1))

In [48]:
WINDOW_LIMIT = 5

K = 5
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

results_precision = []
results_recall = []
results_f1 = []
results_false_negative = [] 

for window in range(0, WINDOW_LIMIT + 1):
    min_delay = window * 10  
    print(f"Procesando ventana {window} (min_delay = {min_delay} min)...")
    start_time = time.time()
    
    candidate_features = []
    for col in df.columns:
        if col in ["global_peak", "fecha", "event"]:
            continue
        if col.startswith("sil_") and not col.startswith("sil_accumulated_"):
            try:
                num = int(col.split("_")[1])
                if num >= min_delay and num <= 50:
                    candidate_features.append(col)
            except:
                pass
        elif col.startswith("sil_accumulated_"):
            try:
                num = int(col.split("_")[2])
                if num >= min_delay and num <= 50:
                    candidate_features.append(col)
            except:
                pass
    
    if len(candidate_features) == 0:
        print(f"No se encontraron features para la ventana {window}")
        continue

    best_precision = -np.inf
    best_precision_subset = None
    best_precision_metrics = None

    best_recall = -np.inf
    best_recall_subset = None
    best_recall_metrics = None

    best_f1 = -np.inf
    best_f1_subset = None
    best_f1_metrics = None

    best_FN = np.inf  
    best_FN_subset = None
    best_FN_metrics = None

    for subset in all_subsets(candidate_features):
        include_min = False
        for feat in subset:
            if feat.startswith("sil_") and not feat.startswith("sil_accumulated_"):
                try:
                    num = int(feat.split("_")[1])
                    if num == min_delay:
                        include_min = True
                        break
                except:
                    pass
            elif feat.startswith("sil_accumulated_"):
                try:
                    num = int(feat.split("_")[2])
                    if num == min_delay:
                        include_min = True
                        break
                except:
                    pass
        if not include_min:
            continue  

        X = df[list(subset)]
        y_class = df["event"].astype(int).values  

        scaler_X = MinMaxScaler()
        X_norm = scaler_X.fit_transform(X)
        
        clf = SVC(kernel='rbf', class_weight='balanced', random_state=42)
        y_pred = cross_val_predict(clf, X_norm, y_class, cv=skf)
        
        precision_val = precision_score(y_class, y_pred, zero_division=0)
        recall_val = recall_score(y_class, y_pred, zero_division=0)
        f1_val = f1_score(y_class, y_pred, zero_division=0)
        
        cm = confusion_matrix(y_class, y_pred)
        fn = cm[1, 0]
        
        if precision_val > best_precision:
            best_precision = precision_val
            best_precision_subset = subset
            best_precision_metrics = {"precision": precision_val, "recall": recall_val, "f1": f1_val}
        
        if recall_val > best_recall:
            best_recall = recall_val
            best_recall_subset = subset
            best_recall_metrics = {"recall": recall_val, "precision": precision_val, "f1": f1_val}
        
        if f1_val > best_f1:
            best_f1 = f1_val
            best_f1_subset = subset
            best_f1_metrics = {"f1": f1_val, "precision": precision_val, "recall": recall_val}
        
        if fn < best_FN:
            best_FN = fn
            best_FN_subset = subset
            best_FN_metrics = {"FN": fn, "precision": precision_val, "recall": recall_val, "f1": f1_val}
    
    elapsed_time = time.time() - start_time
    print(f"Ventana {window} procesada en {elapsed_time:.2f} segundos.")
    
    results_precision.append({
        "window": window,
        "n_features": len(best_precision_subset) if best_precision_subset is not None else 0,
        "best_features": best_precision_subset,
        "precision": best_precision_metrics["precision"],
        "recall": best_precision_metrics["recall"],
        "f1": best_precision_metrics["f1"]
    })
    
    results_recall.append({
        "window": window,
        "n_features": len(best_recall_subset) if best_recall_subset is not None else 0,
        "best_features": best_recall_subset,
        "recall": best_recall_metrics["recall"],
        "precision": best_recall_metrics["precision"],
        "f1": best_recall_metrics["f1"]
    })

    results_f1.append({
        "window": window,
        "n_features": len(best_f1_subset) if best_f1_subset is not None else 0,
        "best_features": best_f1_subset,
        "f1": best_f1_metrics["f1"],
        "precision": best_f1_metrics["precision"],
        "recall": best_f1_metrics["recall"]
    })
    
    results_false_negative.append({
        "window": window,
        "n_features": len(best_FN_subset) if best_FN_subset is not None else 0,
        "best_features": best_FN_subset,
        "FN": best_FN_metrics["FN"],
        "precision": best_FN_metrics["precision"],
        "recall": best_FN_metrics["recall"],
        "f1": best_FN_metrics["f1"]
    })

df_results_precision = pd.DataFrame(results_precision, columns=["window", "n_features", "best_features", "precision", "recall", "f1"])
df_results_recall = pd.DataFrame(results_recall, columns=["window", "n_features", "best_features", "recall", "precision", "f1"])
df_results_f1 = pd.DataFrame(results_f1, columns=["window", "n_features", "best_features", "f1", "precision", "recall"])
df_results_false_negative = pd.DataFrame(results_false_negative, 
    columns=["window", "n_features", "best_features", "FN", "precision", "recall", "f1"])

print("\nResultados basados en mejor Precision:")
print(df_results_precision)
print("\nResultados basados en mejor Recall:")
print(df_results_recall)
print("\nResultados basados en mejor F1:")
print(df_results_f1)
print("\nResultados basados en menor error Tipo II (Falsos Negativos):")
print(df_results_false_negative)

Procesando ventana 0 (min_delay = 0 min)...
Ventana 0 procesada en 53.81 segundos.
Procesando ventana 1 (min_delay = 10 min)...
Ventana 1 procesada en 13.42 segundos.
Procesando ventana 2 (min_delay = 20 min)...
Ventana 2 procesada en 3.40 segundos.
Procesando ventana 3 (min_delay = 30 min)...
Ventana 3 procesada en 0.84 segundos.
Procesando ventana 4 (min_delay = 40 min)...
Ventana 4 procesada en 0.21 segundos.
Procesando ventana 5 (min_delay = 50 min)...
Ventana 5 procesada en 0.06 segundos.

Resultados basados en mejor Precision:
   window  n_features                             best_features  precision  \
0       0           1                                  (sil_0,)   1.000000   
1       1           3      (sil_10, sil_30, sil_accumulated_20)   0.555556   
2       2           2  (sil_accumulated_20, sil_accumulated_40)   0.500000   
3       3           1                                 (sil_30,)   0.666667   
4       4           1                     (sil_accumulated_40,)   0.235

# 4. Save datasets

In [49]:
ruta = "/home/nivelrios/documentos/Mburicao Project/src/evaluation"
os.makedirs(ruta, exist_ok=True)
archivo = os.path.join(ruta, "SVM_best_features_precision_2.csv")
df_results_precision.to_csv(archivo, index=False)

ruta = "/home/nivelrios/documentos/Mburicao Project/src/evaluation"
os.makedirs(ruta, exist_ok=True)
archivo = os.path.join(ruta, "SVM_best_features_recall_2.csv")
df_results_recall.to_csv(archivo, index=False)

ruta = "/home/nivelrios/documentos/Mburicao Project/src/evaluation"
os.makedirs(ruta, exist_ok=True)
archivo = os.path.join(ruta, "SVM_best_features_f1_2.csv")
df_results_f1.to_csv(archivo, index=False)

ruta = "/home/nivelrios/documentos/Mburicao Project/src/evaluation"
os.makedirs(ruta, exist_ok=True)
archivo = os.path.join(ruta, "SVM_best_features_fn_2.csv")
df_results_false_negative.to_csv(archivo, index=False)