In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import matplotlib.dates as mdates
import seaborn as sns

In [22]:
# Directory holding the raw CSV files.
BASE = Path("../data/raw")

# Shared read options: the first CSV column becomes the index and pandas is
# asked to parse that index as dates.
_csv_opts = {"index_col": 0, "parse_dates": True}

X_train = pd.read_csv(BASE.joinpath("X_train_78VdSWL.csv"), **_csv_opts)
y_train = pd.read_csv(BASE.joinpath("y_train_u0UkKEh.csv"), **_csv_opts)
y_train2 = pd.read_csv(BASE.joinpath("new_output_sample.csv"), **_csv_opts)

In [23]:
# Test features, read with the same options as the training files
# (first column as index, index parsed as dates).
X_test = pd.read_csv(
    BASE.joinpath("X_test_XKVc4no.csv"),
    parse_dates=True,
    index_col=0,
)

In [24]:
def _split_holed(columns):
    """Partition column labels into ``(holed, clean)`` lists.

    A label goes to ``holed`` when it starts with ``"holed_"`` and to
    ``clean`` otherwise; the original column order is preserved in both lists.
    """
    holed, clean = [], []
    for label in columns:
        (holed if label.startswith("holed_") else clean).append(label)
    return holed, clean


holed_cols, clean_cols = _split_holed(X_train.columns)
holed_cols_test, clean_cols_test = _split_holed(X_test.columns)


In [None]:
# Keep only the "clean" columns whose standard deviation is at least 10;
# every "holed_" column is kept unconditionally.
#
# X_train is overwritten at the end of this cell, so on a second run some
# names in clean_cols no longer exist in the frame and X_train[clean_cols]
# would raise a KeyError. Restricting the selection to the columns that are
# still present makes the cell re-runnable and yields the same result.
present_clean_cols = [c for c in clean_cols if c in X_train.columns]
stds = X_train[present_clean_cols].std()
keep_clean_cols = stds[stds >= 10].index
final_cols = list(keep_clean_cols) + list(holed_cols)
X_train = X_train[final_cols]

1. Suppression des colonnes aberrantes

In [44]:
# Per-column mean and standard deviation of the retained "clean" columns.
clean_block = X_train[keep_clean_cols]
means = clean_block.mean()
stds = clean_block.std()

# "Flat" curves: columns whose standard deviation is exactly zero.
cols_std0 = stds.loc[stds.eq(0)].index

# "Flat and high" curves.
# NOTE(review): the mask is an OR, so a column is flagged when its mean is
# above 2000 or when its std is below 10 -- confirm this is the intended rule.
mask_flat_high = means.gt(2000) | stds.lt(10)
cols_flat_high = means.loc[mask_flat_high].index

# Union of both groups is the set of columns to remove.
aberrant_index = cols_std0.union(cols_flat_high)
cols_to_drop = list(aberrant_index)
print(f"Suppression de {len(cols_to_drop)} colonnes aberrantes.")


Suppression de 39 colonnes aberrantes.


In [45]:
# Remove the flagged columns from the training features; a flagged column
# missing from X_train raises (default errors="raise").
X_train_filtered = X_train.drop(cols_to_drop, axis="columns")

# Same removal on the test features, silently skipping any flagged column
# that X_test does not contain.
X_test_filtered = X_test.drop(cols_to_drop, axis="columns", errors="ignore")

In [46]:
# Global clipping bounds: the 0.1% and 99.9% quantiles of all training values
# pooled together (the frame is stacked into a single Series first).
global_bounds = X_train_filtered.stack().quantile([0.001, 0.999])
lower_global = global_bounds.iloc[0]
upper_global = global_bounds.iloc[1]

# Apply the training-derived bounds to both train and test.
# NOTE(review): within the visible cells these clipped frames are not
# referenced again -- confirm whether they are meant to be the saved output.
X_train_clipped = X_train_filtered.clip(lower_global, upper_global)
X_test_clipped = X_test_filtered.clip(lower_global, upper_global)


In [47]:
# Fraction of missing values per column, computed on X_train.
missing_ratio = X_train.isna().mean(axis=0)

# Columns that are almost entirely NaN (threshold: more than 99% missing).
cols_mostly_nan = missing_ratio[missing_ratio > 0.99].index

print(f"{len(cols_mostly_nan)} colonnes avec plus de 99% de valeurs manquantes.")
print(cols_mostly_nan[:5])  # preview

# Remove them from the training features...
X_train_filtered = X_train_filtered.drop(columns=cols_mostly_nan, errors="ignore")

# ...and from the test features, for consistency.
X_test_filtered = X_test_filtered.drop(columns=cols_mostly_nan, errors="ignore")

# Remove them from the target frames as well, where they appear as columns.
y_train_filtered = y_train.drop(columns=cols_mostly_nan.intersection(y_train.columns), errors="ignore")
# Fix: the second target frame must be derived from y_train2 (it was previously
# built from y_train, so y_train_filtered2 was just a copy of the first targets).
y_train_filtered2 = y_train2.drop(columns=cols_mostly_nan.intersection(y_train2.columns), errors="ignore")

1 colonnes avec plus de 99% de valeurs manquantes.
Index(['holed_192'], dtype='object')


In [48]:
# Final frames to persist.
# NOTE(review): the original comment said "after filtering + clipping", but
# the frames assigned here are the *_filtered ones, not X_train_clipped /
# X_test_clipped -- confirm whether the clipped versions should be saved.
X_train_preprocessed = X_train_filtered
X_test_preprocessed = X_test_filtered

# Make sure the output directory exists; to_csv raises if it is missing
# (e.g. on a fresh checkout).
PROCESSED_DIR = Path("../data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Output file name -> frame; all files share the same timestamp format and
# keep the index as the first column.
processed_outputs = {
    "X_train.csv": X_train_preprocessed,
    "X_test.csv": X_test_preprocessed,
    "y_train.csv": y_train_filtered,
    "y_train2.csv": y_train_filtered2,
}
for filename, frame in processed_outputs.items():
    frame.to_csv(PROCESSED_DIR / filename, date_format="%Y-%m-%d %H:%M:%S", index=True)
