In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import matplotlib.dates as mdates
import seaborn as sns

In [28]:
# Folder holding the raw CSV inputs, relative to the notebook's working directory.
BASE = Path("../data/raw")

# Read the three training-side files with identical options: the first CSV
# column becomes the index and pandas attempts to parse it as dates.
X_train, y_train, y_train2 = (
    pd.read_csv(BASE / csv_name, index_col=0, parse_dates=True)
    for csv_name in (
        "X_train_78VdSWL.csv",
        "y_train_u0UkKEh.csv",
        "new_output_sample.csv",
    )
)

In [31]:
# Test-side feature file: first CSV column used as the index, with pandas
# attempting to parse it as dates.
X_test = pd.read_csv(
    BASE / "X_test_XKVc4no.csv",
    index_col=0,
    parse_dates=True,
)

In [32]:
# Partition each frame's column labels by the "holed_" name prefix.
# Both halves are derived from the prefix test itself rather than from
# `c not in holed_cols`: membership in a list is a linear scan, which made the
# original partition quadratic in the number of columns. The resulting lists
# are the same as before.
holed_cols = [c for c in X_train.columns if c.startswith("holed_")]
clean_cols = [c for c in X_train.columns if not c.startswith("holed_")]
holed_cols_test = [c for c in X_test.columns if c.startswith("holed_")]
clean_cols_test = [c for c in X_test.columns if not c.startswith("holed_")]


## Filtre par taux de valeurs manquantes

In [33]:
# Share of missing values in each y_train column (mean of the boolean NaN mask).
missing_ratio = y_train.isna().mean()
# Columns whose missing share reaches this value are discarded.
threshold = 0.95
cols_to_drop = missing_ratio[missing_ratio >= threshold].index.tolist()
# Strict "<" so the two lists can never overlap: with "<=" a column sitting
# exactly on the threshold was listed both as dropped and as kept, and
# selecting it from X_train after the drop would raise a KeyError.
cols_to_keep = missing_ratio[missing_ratio < threshold].index.tolist()

print(f"Nb de colonnes supprimés : {len(cols_to_drop)}")

Nb de colonnes supprimés : 1


In [34]:
# Remove the too-sparse columns from the features and from both target frames.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell re-runnable: the original raised KeyError on a second execution because
# the columns were already gone. errors="ignore" also means a label missing
# from one of the frames is skipped rather than raising.
X_train = X_train.drop(columns=cols_to_drop, errors="ignore")
y_train = y_train.drop(columns=cols_to_drop, errors="ignore")
y_train2 = y_train2.drop(columns=cols_to_drop, errors="ignore")

## Filtre par moyenne et écart-type

In [35]:
# Per-column mean and standard deviation of the non-"holed_" training columns,
# laid out with one row per column.
col_stats = X_train[clean_cols].describe().loc[['mean', 'std']].T

# Keep the columns whose std lies between 5 and 800 (both bounds inclusive)
# and whose mean does not exceed 2000. A NaN statistic fails every comparison,
# so such columns are left out.
filtered_cols = col_stats.loc[
    (col_stats['std'] >= 5) & (col_stats['std'] <= 800) & (col_stats['mean'] <= 2000)
].index.tolist()

# Filtered training frame: the retained feature columns followed by the
# y_train column names that passed the missing-value filter.
Xt_filtered = X_train.loc[:, filtered_cols + cols_to_keep]

print(f"Colonnes conservées : {len(filtered_cols)} / {X_train.shape[1]}")


Colonnes conservées : 18844 / 20999


In [36]:
# Per-column mean and standard deviation of the non-"holed_" test columns,
# one row per column. This rebinds col_stats, replacing the training
# statistics computed earlier.
col_stats = X_test[clean_cols_test].describe().loc[['mean', 'std']].T

# Same thresholds as for the training frame: mean at most 2000 and std between
# 5 and 800 inclusive. filtered_cols is rebound to the test-side selection.
filtered_cols = col_stats.loc[
    (col_stats['std'] >= 5) & (col_stats['std'] <= 800) & (col_stats['mean'] <= 2000)
].index.tolist()

# Filtered test frame: the retained feature columns followed by every
# "holed_" column of X_test.
Xtest_filtered = X_test.loc[:, filtered_cols + holed_cols_test]

print(f"Colonnes conservées : {len(filtered_cols)} / {X_test.shape[1]}")


Colonnes conservées : 34865 / 38140


In [37]:
# Rows of col_stats whose standard deviation is exactly zero. col_stats is
# whatever the most recently executed statistics cell assigned; when the
# notebook runs top to bottom, that is the X_test statistics.
const_cols = col_stats.loc[col_stats['std'] == 0]
print(f" Colonnes constantes : {len(const_cols)}")

 Colonnes constantes : 1396


In [38]:
# Write the filtered frames and both target frames to CSV, index included,
# with datetime values formatted as "%Y-%m-%d %H:%M:%S".
# to_csv does not create missing parent folders, so make sure the output
# directory exists first; without this the cell fails on a fresh checkout.
Path("../data/processed").mkdir(parents=True, exist_ok=True)

Xt_filtered.to_csv("../data/processed/X_train.csv", date_format="%Y-%m-%d %H:%M:%S", index=True)
Xtest_filtered.to_csv("../data/processed/X_test.csv", date_format="%Y-%m-%d %H:%M:%S", index=True)
y_train.to_csv("../data/processed/y_train.csv", date_format="%Y-%m-%d %H:%M:%S", index=True)
y_train2.to_csv("../data/processed/y_train2.csv", date_format="%Y-%m-%d %H:%M:%S", index=True)
