# Génération de jeux de données pour comparer différentes méthodes d'imputation et leur impact

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


from utils.data_standardize import standardize, inverse_standardize
from utils.fill_data import mixed_directional_fill, linear_interpolation
from utils.knn_impute import impute_group 
import os
# Print the current working directory; the CSV paths used below are relative to it.
print(os.getcwd())

c:\Users\julie\Documents\Python\Machine Learning - Projet FInal


In [7]:
# Load the training features/labels and the test features/labels.
# Paths use forward slashes so the notebook also runs outside Windows
# (a backslash is not a path separator on POSIX systems).
# The feature frames are ordered by "ID"; sort_values returns a new frame, so
# no in-place mutation is needed.
X = pd.read_csv('input_training.csv').sort_values(by="ID")

y = pd.read_csv('output/output_training_gmEd6Zt.csv')

data_test = pd.read_csv('input_test.csv').sort_values(by="ID")

y_test = pd.read_csv('output/output_test_random.csv')


In [8]:
# Features and labels are joined on "ID" so they can be filtered together and
# split again after the treatments.
X_train = X.merge(y, on="ID").copy()
data_test.sort_values(by="ID", inplace=True)
X_test = data_test.merge(y_test, on="ID").copy()

# Minimum number of non-NaN values a row needs in order to be kept.
# NOTE(review): the count is taken over every column of the merged frame, not
# only the r* feature columns -- confirm this is the intended 70% rule.
threshold = int(0.7 * X_train.shape[1])

# Row filtering for both splits; the same threshold is applied to train and test.
X_train_70 = X_train.dropna(thresh=threshold).copy(deep=True)
X_test_70 = X_test.dropna(thresh=threshold).copy(deep=True)

# Targets aligned with the rows that survived the filter.
y_70 = X_train_70["reod"].copy(deep=True)
y_test_70 = X_test_70["reod"].copy(deep=True)

# Remaining row counts: training set first, then test set.
print(f"Nombre de lignes après suppression des lignes avec moins de {threshold} valeurs non NaN : {len(X_train_70)}")
print(f"Nombre de lignes après suppression des lignes avec moins de {threshold} valeurs non NaN : {len(X_test_70)}")

Nombre de lignes après suppression des lignes avec moins de 39 valeurs non NaN : 730784
Nombre de lignes après suppression des lignes avec moins de 39 valeurs non NaN : 857641


In [9]:
# Non-feature columns versus the 53 feature columns r0 ... r52.
id_cols = ["ID", "day", "equity", "reod"]
features_cols = ["r" + str(idx) for idx in range(53)]

# Independent deep copies of the feature columns for each split.
X_train_70_features = X_train_70.loc[:, features_cols].copy(deep=True)
X_test_70_features = X_test_70.loc[:, features_cols].copy(deep=True)


## KNN IMPUTATION

In [5]:
# Global KNN imputation of the missing values on the training set.
# The features go through standardize() before imputation and through
# inverse_standardize() afterwards (presumably scaling and un-scaling -- the
# helpers live in utils.data_standardize).
X_train_70_features_std, global_stats = standardize(X_train_70_features)
imputer = KNNImputer(n_neighbors=5)
# fit_transform returns a bare NumPy array, so the row labels of X_train_70 are
# reattached here. With a default RangeIndex the axis=1 concat below would align
# the imputed rows against the wrong identifiers, because dropna() upstream left
# gaps in the index of X_train_70.
X_train_70_features_std = pd.DataFrame(
    imputer.fit_transform(X_train_70_features_std),
    columns=features_cols,
    index=X_train_70.index,
)

# Stored under a new name so X_train_70_features keeps its NaNs for the other
# imputation methods of this notebook (re-running the notebook top to bottom
# would otherwise feed them already-imputed data).
X_train_knn_features = inverse_standardize(X_train_70_features_std, global_stats)
# Positional re-labelling, in case the helper does not carry the index through.
X_train_knn_features.index = X_train_70.index

X_global_inmputed = pd.concat([X_train_70[id_cols], X_train_knn_features], axis=1)
X_global_inmputed.to_csv("training_global_knn_imputed.csv", index=False)

In [6]:
# Global KNN imputation of the missing values on the test set.
# NOTE(review): the test set is standardized and imputed with statistics and
# neighbours fitted on the test set itself, not those fitted on the training
# set -- confirm this is intended.
X_test_70_features_std, global_stats = standardize(X_test_70_features)
imputer = KNNImputer(n_neighbors=5)
# fit_transform returns a bare NumPy array, so the row labels of X_test_70 are
# reattached here. With a default RangeIndex the axis=1 concat below would align
# the imputed rows against the wrong identifiers, because dropna() upstream left
# gaps in the index of X_test_70.
X_test_70_features_std = pd.DataFrame(
    imputer.fit_transform(X_test_70_features_std),
    columns=features_cols,
    index=X_test_70.index,
)

# Stored under a new name so X_test_70_features keeps its NaNs for the other
# imputation methods of this notebook.
X_test_knn_features = inverse_standardize(X_test_70_features_std, global_stats)
# Positional re-labelling, in case the helper does not carry the index through.
X_test_knn_features.index = X_test_70.index

# The test identifiers are paired with the imputed TEST features (the previous
# version concatenated the training features here).
X_global_inmputed_test = pd.concat([X_test_70[id_cols], X_test_knn_features], axis=1)
X_global_inmputed_test.to_csv("test_global_knn_imputed.csv", index=False)

## KNN Imputation par groupe (ne sera pas fait)

In [46]:
# Per-equity KNN imputation (abandoned; this cell currently raises a TypeError).
# NOTE(review): impute_group(...) is *called* here with a features_cols keyword
# instead of being handed to .apply() as a callable, and its signature does not
# accept that keyword. The intended call is presumably .apply(impute_group) --
# confirm against utils.knn_impute before re-enabling this cell.
df_group_imputed = X_train_70_features.groupby(X_train_70["equity"], group_keys=False).apply(impute_group(features_cols=features_cols))
# index=False drops the row labels when writing the CSV.
df_group_imputed.to_csv("training_group_knn_imputed.csv",index=False)

TypeError: impute_group() got an unexpected keyword argument 'features_cols'

## Mixed fill (Backward / Forward fill - Forward / Backward fill)

In [10]:
# ==================== 1. FORWARD-BACKWARD FILL ====================
# Using axis=1 to fill across time periods within each row
X_train_ffbf = mixed_directional_fill(X_train_70, features_cols, 'ffill_then_bfill', axis=1)
X_test_ffbf = mixed_directional_fill(X_test_70, features_cols, 'ffill_then_bfill', axis=1)

# Export to CSV. The output directory is created first because to_csv fails
# when the target directory does not exist.
os.makedirs('processed_data', exist_ok=True)
X_train_ffbf.to_csv('processed_data/X_train_ffbf.csv', index=False)
X_test_ffbf.to_csv('processed_data/X_test_ffbf.csv', index=False)

In [11]:
# ==================== 2. BACKWARD-FORWARD FILL ====================
# Same helper as the forward-backward fill, with the two passes in the opposite
# order; axis=1 fills across the feature columns within each row.
X_train_bfff = mixed_directional_fill(X_train_70, features_cols, 'bfill_then_ffill', axis=1)
X_test_bfff = mixed_directional_fill(X_test_70, features_cols, 'bfill_then_ffill', axis=1)

# Export to CSV. The output directory is created first because to_csv fails
# when the target directory does not exist.
os.makedirs('processed_data', exist_ok=True)
X_train_bfff.to_csv('processed_data/X_train_bfff.csv', index=False)
X_test_bfff.to_csv('processed_data/X_test_bfff.csv', index=False)

## Interpolation linéaire par ligne 

In [12]:
# ==================== 3. LINEAR INTERPOLATION ====================
print("Applying linear interpolation...")

# axis=1: interpolate across the feature columns within each row.
X_train_interp = linear_interpolation(X_train_70, features_cols, axis=1)
X_test_interp = linear_interpolation(X_test_70, features_cols, axis=1)

# Export to CSV. The output directory is created first because to_csv fails
# when the target directory does not exist.
os.makedirs('processed_data', exist_ok=True)
X_train_interp.to_csv('processed_data/X_train_interp.csv', index=False)
X_test_interp.to_csv('processed_data/X_test_interp.csv', index=False)

Applying linear interpolation...


## MICE

In [13]:
# MICE-style imputation with scikit-learn's IterativeImputer.
# The experimental flag must be imported before IterativeImputer itself.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Process training data
mice_train_imputer = IterativeImputer(
    max_iter=10,
    random_state=42,
    initial_strategy='mean'
)

X_train_mice = X_train_70.copy()
# The features are sliced from X_train_70 directly rather than read from
# X_train_70_features: the KNN cell rebinds that name to already-imputed
# values, which would leave nothing for this imputer to fill after a full
# top-to-bottom run.
X_train_mice_features = mice_train_imputer.fit_transform(X_train_70[features_cols])
X_train_mice[features_cols] = X_train_mice_features

# Process test data separately
# NOTE(review): a second imputer is fitted on the test set itself instead of
# reusing the one fitted on the training set -- confirm this is intended.
mice_test_imputer = IterativeImputer(
    max_iter=10,
    random_state=42,
    initial_strategy='mean'
)

X_test_mice = X_test_70.copy()
# Same reasoning as for the training split: read the raw test features.
X_test_mice_features = mice_test_imputer.fit_transform(X_test_70[features_cols])
X_test_mice[features_cols] = X_test_mice_features

# Export to CSV. The output directory is created first because to_csv fails
# when the target directory does not exist.
os.makedirs('processed_data', exist_ok=True)
X_train_mice.to_csv('processed_data/X_train_mice.csv', index=False)
X_test_mice.to_csv('processed_data/X_test_mice.csv', index=False)