In [130]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split

from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import RidgeCV


from sklearn.metrics import mean_absolute_error

In [4]:
BASE = Path("../data/processed")
X_train_filtered = pd.read_csv(BASE / "X_train.csv", index_col=0,parse_dates=True)
X_test_filtered = pd.read_csv(BASE / "X_test.csv", index_col=0)
y_train = pd.read_csv(BASE / "y_train2.csv", index_col=0)


In [5]:
Xtr = X_train_filtered
Xte = X_test_filtered
Y   = y_train

In [6]:
holed_cols = [c for c in Xtr.columns if c.startswith("holed_")]
clean_cols = [c for c in Xtr.columns if c not in holed_cols]

In [7]:
len(holed_cols)

999

## 1. KNN Imputer

In [None]:
train_T = Xtr[clean_cols + holed_cols].T.astype(np.float32)

imputer = KNNImputer(
    n_neighbors=10,           # √† tuner (5, 10, 20‚Ä¶)
    weights="distance",       # pond√©ration par distance (souvent meilleur)
    metric="nan_euclidean"    # distance qui g√®re les NaN proprement
)

train_T_filled = imputer.fit_transform(train_T)
train_filled = pd.DataFrame(train_T_filled, index=train_T.index, columns=train_T.columns).T


In [12]:
Xtr.index = pd.to_datetime(Xtr.index, format="%Y-%m-%d %H:%M:%S", errors="coerce")
Y.index = pd.to_datetime(Y.index, format="%Y-%m-%d %H:%M:%S", errors="coerce")
train_filled.index = pd.to_datetime(train_filled.index, format="%Y-%m-%d %H:%M:%S", errors="coerce")


In [13]:
mask_missing = Xtr[holed_cols].isna()
Xhat = train_filled[holed_cols].loc[Y.index]    # aligner sur l'index de y_train
diff = (Xhat - Y).where(mask_missing[Y.columns])
mse = (diff.pow(2).sum().sum()) / mask_missing[Y.columns].sum().sum()
rmse = float(np.sqrt(mse))
print(f"RMSE KNN (sur trous train) = {rmse:.3f}")

RMSE KNN (sur trous train) = 196.243


## 2. Mod√®le de R√©gression 

In [19]:
# --- split des colonnes trou√©es 80/20 ---
train_cols, val_cols = train_test_split(holed_cols, test_size=0.2, random_state=42)
train_cols = clean_cols + train_cols

X = X_train_filtered[train_cols].copy()
# --- features temporelles ---
X_feat = X.copy()
X_feat["hour"] = X_feat.index.hour
X_feat["dayofweek"] = X_feat.index.dayofweek

# --- normalisation (par colonne) ---
scaler = StandardScaler()
for c in train_cols:
    X_feat[c] = scaler.fit_transform(X_feat[[c]])

In [None]:
X = X_train_filtered[clean_cols].copy()

# --- split des colonnes trou√©es 80/20 ---
train_cols, val_cols = train_test_split(holed_cols, test_size=0.2, random_state=42)
train_cols = clean_cols + train_cols

# --- features temporelles ---
X_feat = X.copy()
X_feat["hour"] = X_feat.index.hour
X_feat["dayofweek"] = X_feat.index.dayofweek

# --- normalisation (par colonne) ---
scaler = StandardScaler()
for c in train_cols:
    X_feat[c] = scaler.fit_transform(X_feat[[c]])

    # --- uniquement les lignes o√π les colonnes d‚Äôentra√Ænement sont connues ---
mask_train = ~X_feat[train_cols].isna().any(axis=1)
X_train = X_feat.loc[mask_train].drop(columns=holed_cols)
y_train = X_feat.loc[mask_train, train_cols]

    # --- mod√®le global multi-sorties ---
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))

model.fit(X_train, y_train)
print("Model trained on 80% of holed columns")

# --- validation : pr√©dire les colonnes jamais vues ---
mask_val = ~X_feat[val_cols].isna().any(axis=1)
X_val = X_feat.loc[mask_val].drop(columns=holed_cols)
y_val = X_feat.loc[mask_val, val_cols]

    # pr√©diction s√©par√©e pour chaque colonne de validation
y_pred = pd.DataFrame(index=y_val.index)
for col in val_cols:
    y_pred[col] = model.estimator.predict(X_val)[:, train_cols.index(col)] \
if col in train_cols else np.nan

    # --- m√©trique ---
common_idx = y_val.index.intersection(y_pred.index)
rmse = np.sqrt(mean_squared_error(y_val.loc[common_idx], y_pred.loc[common_idx]))
print(f"RMSE (validation on 20% unseen columns) = {rmse:.3f}")

## 3. Feature Engineering + LightGBM

In [103]:
def normalize_lags(df):
    for l in range(1, 6):
        if f'lag_{l}' in df.columns:
            df[f'lag_{l}'] = (df[f'lag_{l}'] - df['mean']) / df['std']
        if f'lead_{l}' in df.columns:
            df[f'lead_{l}'] = (df[f'lead_{l}'] - df['mean']) / df['std']
    return df


In [105]:
PATH_X_TRAIN = "../data/preprocessed/X_train_features.csv"
PATH_X_VAL = "../data/preprocessed/X_val_features.csv"
PATH_Y_VAL = "../data/preprocessed/Y_val_true.csv"

X_train = pd.read_csv(PATH_X_TRAIN)
X_val = pd.read_csv(PATH_X_VAL)
y_val = pd.read_csv(PATH_Y_VAL)

In [None]:
X_train = normalize_lags(X_train)
X_val = normalize_lags(X_val)

In [107]:
le = LabelEncoder()
le.fit(list(X_train["col"].unique()) + list(X_val["col"].unique()))

X_train["col_id"] = le.transform(X_train["col"])
X_val["col_id"] = le.transform(X_val["col"])


In [108]:
# Colonnes √† ignorer
drop_cols = ["timestamp", "col", "y"] if "y" in X_train.columns else ["timestamp", "col"]

features = [c for c in X_train.columns if c not in drop_cols]

# Features et cible du training
X_train_feat = X_train[features]
y_train = X_train["y"]  # cible du mod√®le

# Features du jeu de validation
X_val_feat = X_val[features]

print(f"Features utilis√©es : {len(features)}")


Features utilis√©es : 16


In [109]:
X_val_feat

Unnamed: 0,weekday,mean,std,lag_1,lag_2,lag_3,lag_4,lag_5,lead_1,lead_2,lead_3,lead_4,lead_5,hour_sin,hour_cos,col_id
0,0,843.695513,403.572811,2.240747,1.708005,2.111898,2.426587,-0.571633,,1.172786,1.138096,0.895265,0.255975,0.866025,5.000000e-01,12245
1,0,843.695513,403.572811,,2.240747,1.708005,2.111898,2.426587,1.172786,1.138096,0.895265,0.255975,0.729248,0.923880,3.826834e-01,12245
2,0,843.695513,403.572811,0.627655,-0.291634,0.129604,0.270842,0.206417,-1.027561,0.533496,0.434381,0.327833,0.310488,0.258819,-9.659258e-01,12245
3,0,843.695513,403.572811,0.213851,-0.465085,0.974556,0.310488,0.327833,0.065179,0.786239,-1.178711,-0.410572,-0.373404,-0.793353,-6.087614e-01,12245
4,1,843.695513,403.572811,0.592965,1.755085,0.801106,0.288187,1.175264,0.915088,1.076149,0.233674,0.595443,-0.365970,0.965926,-2.588190e-01,12245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128848,0,295.560859,101.077649,0.152745,0.202212,0.756242,-0.361711,0.390187,,0.281359,2.062168,2.705238,3.328522,0.965926,-2.588190e-01,12248
128849,0,295.560859,101.077649,,0.152745,0.202212,0.756242,-0.361711,0.281359,2.062168,2.705238,3.328522,0.320933,0.923880,-3.826834e-01,12248
128850,0,295.560859,101.077649,0.479227,-0.015442,-0.104483,0.291253,1.092617,-0.707979,0.380293,0.439653,0.083492,-0.015442,-1.000000,-1.836970e-16,12248
128851,0,295.560859,101.077649,0.548481,0.014238,-0.015442,0.083492,0.439653,,-1.430196,0.667201,-1.074034,,-0.500000,8.660254e-01,12248


In [110]:
y_val.drop(columns = ['timestamp','col'], inplace=True)

In [111]:
scaler = StandardScaler()

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_feat), columns=features)
X_val_scaled = pd.DataFrame(scaler.transform(X_val_feat), columns=features)

In [129]:
models = [
    ("lgbm", LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=80,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=42,
        n_jobs=-1
    )),
    ("hgb", HistGradientBoostingRegressor(max_depth=20, learning_rate=0.05)),
]

for name, model in models:
    print(f"\n Entra√Ænement du mod√®le : {name}")
    model.fit(X_train_scaled, y_train)
    y_pred_train = model.predict(X_train_scaled)
    mae_train = mean_absolute_error(y_train, y_pred_train)
    print(f" MAE train ({name}) = {mae_train:.4f}")
    y_pred = model.predict(X_val_scaled)
    mae = mean_absolute_error(y_val, y_pred)
    print(f" MAE ({name}) = {mae:.4f}")



 Entra√Ænement du mod√®le : lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0,000577 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3396
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 16
[LightGBM] [Info] Start training from score 367,262500
 MAE train (lgbm) = 30.3147
 MAE (lgbm) = 114.5453

 Entra√Ænement du mod√®le : hgb
 MAE train (hgb) = 65.1171
 MAE (hgb) = 114.0486


In [133]:
model_xgb = XGBRegressor(
    n_estimators=800,        # nombre d'arbres (compense un peu le LR)
    learning_rate=0.05,      # petit pas d'apprentissage
    max_depth=6,             # profondeur mod√©r√©e
    subsample=0.8,           # al√©atoire sur les lignes
    colsample_bytree=0.8,    # al√©atoire sur les colonnes
    random_state=42,
    n_jobs=-1
)

print("‚öôÔ∏è Entra√Ænement du mod√®le XGBoost...")
model_xgb.fit(X_train_scaled, y_train)

y_pred_train = model_xgb.predict(X_train_scaled)
y_pred_val = model_xgb.predict(X_val_scaled)

mae_train = mean_absolute_error(y_train, y_pred_train)
mae_val = mean_absolute_error(y_val, y_pred_val)

print(f"üìâ MAE train (XGB) : {mae_train:.4f}")
print(f"üìà MAE val   (XGB) : {mae_val:.4f}")
print(f"üîç Ratio val/train : {mae_val / mae_train:.2f}")

‚öôÔ∏è Entra√Ænement du mod√®le XGBoost...
üìâ MAE train (XGB) : 32.1135
üìà MAE val   (XGB) : 133.5604
üîç Ratio val/train : 4.16


In [113]:
# Pr√©diction sur le jeu de validation
y_val_pred = model.predict(X_val_scaled)

mae = mean_absolute_error(y_val, y_val_pred)
print(f" MAE sur le jeu de validation : {mae:.4f}")


 MAE sur le jeu de validation : 115.3869


In [114]:
y_val

Unnamed: 0,y_true
0,1568.0
1,1088.0
2,492.0
3,726.0
4,1174.0
...,...
128848,186.0
128849,315.0
128850,331.0
128851,232.0


In [115]:
y_val_pred

array([1137.56525779, 1119.11416555,  721.21859721, ...,  298.17894634,
        362.51921481,  199.3969745 ], shape=(128853,))