In [31]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBRegressor

In [None]:
#data cleaning

#Import data, sorting timestamps so it can be converted long properly
CSV = "SAIL2025_LVMA_data_3min_20August-25August2025_flow.csv"
df = pd.read_csv(CSV)

#Sorting variables/metadata
df['timestamp'] = pd.to_datetime(df["timestamp"])
df = df.sort_values("timestamp").reset_index(drop=True)

meta_cols = ["timestamp", "weekday", "is_weekend"]

df["weekday"] = df["weekday"].astype("int32")
df["is_weekend"] = df["is_weekend"].astype("int8")

#Converting from wide to long data
df_long = (df.melt(
    id_vars=meta_cols,
    var_name="sensor_id",
    value_name='value'
).sort_values(["sensor_id", "timestamp"]).reset_index(drop=True))

#Time features
df_long["hour"] = df_long["timestamp"].dt.hour
df_long["hour_sin"] = np.sin(2*np.pi*df_long["hour"]/24)
df_long["hour_cos"] = np.cos(2*np.pi*df_long["hour"]/24)

#Group Sorting before lags
df_long = df_long.sort_values(["sensor_id", "timestamp"]).reset_index(drop=True)

#lags (fix lags numbers if more or less steps needs to be remembered)
LAGS = [1, 2, 4, 8]
for lag in LAGS:
    df_long[f"value_lag_{lag}"] = df_long.groupby("sensor_id")["value"].shift(lag)

#Rolling mean
roll_window = max(LAGS)
df_long["roll_mean"] = (
    df_long.groupby("sensor_id")["value"]
    .shift(1)
    .rolling(roll_window, min_periods=1)
    .mean()
)

#Handling missing values (because the first 24 minutes of data dissapears without this)
drop_nas = False
if drop_nas:
    df_long = df_long.dropna().reset_index(drop=True)
else:
    cols = [f"value_lag_{l}" for l in LAGS] + ["roll_mean"]
    cols = [c for c in cols if c in df_long.columns]
    df_long[cols] = df_long[cols].fillna(method="bfill")

#Encoding the sensor IDs
cats = df_long["sensor_id"].astype("category")
df_long["sensor_id_code"] = cats.cat.codes
code_map = dict(enumerate(cats.cat.categories))

#Encoding weekday/is_weekend
def filter_date_variables(df_long, weekday: int | None = None, is_weekend: int| None = None):
     mask = pd.Series(True, index=df_long.index)
     if weekday is not None:
        mask &= df_long["weekday"] == int(weekday)
     if is_weekend is not None:
        mask &= df_long ["is_weekend"] == int(is_weekend)
     return df_long.loc[mask].sort_values("timestamp")

def peek(
          df_long,
          sensor_id: str | None = None,
          sensor_id_code: int | None = None,
          weekday: int | None = None,
          is_weekend: int | None = None,
          n: int = 20,
          cols: list[str] | None = None
):
    mask = pd.Series(True, index=df_long.index)
    if sensor_id is not None:
        mask &= df_long["sensor_id"] == sensor_id
    if sensor_id_code is not None:
        mask &= df_long["sensor_id_code"] == int(sensor_id_code)
    if weekday is not None:
        mask &= df_long["weekday"] == int(weekday)
    if is_weekend is not None:
        mask &= df_long["is_weekend"] == int(is_weekend)
    out = df_long.loc[mask].sort_values("timestamp")
    if cols is not None:
        out = out.loc[:, cols]
    return out.head(n)

#Feature Columns
feature_cols = [
    "sensor_id",
    "sensor_id_code",
    "hour_sin", "hour_cos", "weekday", "is_weekend",
    *[f"value_lag_{l}" for l in LAGS],
]
if "roll_mean" in df_long.columns:
        feature_cols.append("roll_mean")

# Feature (X) VS Target (Y) for later machine learning need later development
#X = df_long[feature_cols]
#y = df_long["value"]

# Train/validation split (change number to liking)
train_frac = 0.8
split_time = df_long["timestamp"].quantile(train_frac)
train_mask = df_long["timestamp"] < split_time
value_mask = ~train_mask

#printing 
#Change any variable (sensor_id, sensor_id_code, weekday, is_weekend, n)
# Put a # before any variable to disable it, or delete it to allow the program to search for specific data points
print(peek(
    df_long,
    #sensor_id = ,
    sensor_id_code=10,
    weekday=3,
    #is_weekend=0,
    n=20,
    cols=["timestamp", "sensor_id", "weekday", "is_weekend", "value"]
))

                      timestamp         sensor_id  weekday  is_weekend  value
24480 2025-08-21 00:00:00+02:00  CMSA-GAWW-15_210        3           0     40
24481 2025-08-21 00:03:00+02:00  CMSA-GAWW-15_210        3           0     43
24482 2025-08-21 00:06:00+02:00  CMSA-GAWW-15_210        3           0     31
24483 2025-08-21 00:09:00+02:00  CMSA-GAWW-15_210        3           0     49
24484 2025-08-21 00:12:00+02:00  CMSA-GAWW-15_210        3           0     38
24485 2025-08-21 00:15:00+02:00  CMSA-GAWW-15_210        3           0     31
24486 2025-08-21 00:18:00+02:00  CMSA-GAWW-15_210        3           0     44
24487 2025-08-21 00:21:00+02:00  CMSA-GAWW-15_210        3           0     33
24488 2025-08-21 00:24:00+02:00  CMSA-GAWW-15_210        3           0     42
24489 2025-08-21 00:27:00+02:00  CMSA-GAWW-15_210        3           0     25
24490 2025-08-21 00:30:00+02:00  CMSA-GAWW-15_210        3           0     43
24491 2025-08-21 00:33:00+02:00  CMSA-GAWW-15_210        3      

  df_long[cols] = df_long[cols].fillna(method="bfill")
