# 1. Libraries

In [26]:
import pandas as pd
import numpy as np
import os
import datetime

# 2. Data load

In [27]:
# Load the two input tables from the project's external-data folder.
# `merged` must provide the "fecha" and "sil" columns used further down;
# `picos` must provide "inicio", "fin", "max_global", "max_global_fecha"
# and "max_local_fecha".
merged = pd.read_csv(
    filepath_or_buffer='/home/nivelrios/documentos/Mburicao Project/data/external/sil_nivel_merged.csv',
)
picos = pd.read_csv(
    filepath_or_buffer='/home/nivelrios/documentos/Mburicao Project/data/external/peak_nivel_detection.csv',
)

# 3. Data extraction

In [28]:
# Parse the timestamp column of `merged`; values that cannot be parsed
# become NaT instead of raising (errors='coerce').
merged['fecha'] = pd.to_datetime(merged['fecha'], errors='coerce')

# Apply the same conversion to every timestamp column of `picos`.
for _ts_col in ('inicio', 'fin', 'max_global_fecha', 'max_local_fecha'):
    picos[_ts_col] = pd.to_datetime(picos[_ts_col], errors='coerce')

In [29]:
# Build one feature row per entry of `picos`, taking the "sil" values of
# `merged` at the entry's start time and at fixed look-back offsets before it.

# Index `merged` by its "fecha" timestamps so single instants can be looked up.
if "fecha" in merged.columns:
    merged.set_index("fecha", inplace=True)
else:
    # "fecha" is no longer a column (presumably it is already the index,
    # e.g. after re-running this cell); just make sure the index is datetime.
    merged.index = pd.to_datetime(merged.index, errors='coerce')

# An entry is flagged as an "event" when its global peak reaches this value.
CRITICAL_EVENT = 2
# Look-back offsets, in minutes before the entry's start: 0, 10, ..., 50.
LAG_MINUTES = range(0, 51, 10)
# Explicit output schema: keeps the column order stable and guarantees the
# columns exist even when no entry yields a feature row.
FEATURE_COLUMNS = (
    ["fecha", "global_peak", "event"]
    + [f"sil_{delay}" for delay in LAG_MINUTES]
    + [f"sil_accumulated_{delay}" for delay in LAG_MINUTES]
)

features_list = []

for _, row in picos.iterrows():
    base_time = row["inicio"]
    global_peak = row["max_global"]

    # "inicio" was parsed with errors='coerce', so it may be NaT. NaT minus a
    # timedelta is still NaT and could match unparseable (NaT) entries of the
    # index, producing a meaningless row -- skip such entries outright.
    if pd.isna(base_time):
        continue

    # Timestamp to look up for each offset, computed once per entry.
    time_points = {
        delay: base_time - datetime.timedelta(minutes=delay)
        for delay in LAG_MINUTES
    }
    # Only keep entries for which every look-back timestamp is present.
    if any(time_point not in merged.index for time_point in time_points.values()):
        continue

    sil_by_delay = {}
    for delay, time_point in time_points.items():
        sil_value = merged.loc[time_point, "sil"]
        if isinstance(sil_value, pd.Series):
            # Duplicated timestamp in the index: keep the first reading.
            sil_value = sil_value.iloc[0]
        sil_by_delay[delay] = sil_value

    feature_dict = {
        "fecha": base_time,
        "global_peak": global_peak,
        "event": global_peak >= CRITICAL_EVENT,
    }
    for delay in LAG_MINUTES:
        feature_dict[f"sil_{delay}"] = sil_by_delay[delay]

    # sil_accumulated_<d> is the sum of the readings from offset <d> up to the
    # largest offset (so sil_accumulated_0 covers the whole look-back window).
    for accum_delay in LAG_MINUTES:
        total = sum(sil_by_delay[d] for d in LAG_MINUTES if d >= accum_delay)
        feature_dict[f"sil_accumulated_{accum_delay}"] = round(total, 1)

    features_list.append(feature_dict)

# Passing `columns` avoids a KeyError in dropna() when features_list is empty.
data_for_clustering = pd.DataFrame(features_list, columns=FEATURE_COLUMNS)
data_for_clustering.dropna(subset=['global_peak'], inplace=True)

# 4. Save dataframe

In [30]:
# Destination folder and file for the feature table.
ruta = "/home/nivelrios/documentos/Mburicao Project/src/data"
archivo = os.path.join(ruta, "data_for_clustering_2.csv")

# Create the folder if it is missing, then write the table without the
# DataFrame index column.
os.makedirs(ruta, exist_ok=True)
data_for_clustering.to_csv(path_or_buf=archivo, index=False)