Ce notebook est un "brouillon" pour tenter d'améliorer le modèle. (globalement je vais faire ça comme un gros sagouin)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import datetime

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.decomposition import PCA

from tqdm import tqdm




from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Configuration de l'affichage
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_context("notebook", font_scale=1.2)
plt.rcParams['figure.figsize'] = (12, 8)

from utils.data_registry import DATASETS
from utils.features import add_features
from utils.benchmarks import get_models
from utils.experiment_runner import run_experiment, display_experiment_result, add_result
from utils.load_data import load_datasets
from utils.graphic import analyze_distributions, compare_column_stats, analyze_normalization
from utils.variance_analysis import perform_pca_analysis, perform_tsne_analysis, analyze_correlations, evaluate_feature_sets, analyze_feature_importance, select_best_features, summarize_feature_analysis
from utils.preprocess_data import precompute_datasets_with_features

# Empty results table; rows are appended later through `add_result`.
_TRACKER_COLUMNS = [
    "dataset",
    "dataset_description",
    "model",
    "model_description",
    "features_added",
    "feature_sets",
    "accuracy",
    "precision_weighted",
    "recall_weighted",
    "f1_weighted",
]
results_tracker = pd.DataFrame(columns=_TRACKER_COLUMNS)


In [None]:

# Load the datasets through the project helper `load_datasets`
# (imported from utils.load_data at the top of the notebook).
imputed_datasets = load_datasets()

On va se concentrer sur un seul jeu de données : on se fiche un peu de la méthode d'imputation, la différence est probablement négligeable.

In [2]:
# Load the base data without imputation (the "70" files, 70% per the original
# note) only to have a reference for the dimensions.
# Forward slashes keep these relative paths valid on every OS, Windows included.
# Training data
X_train_70 = pd.read_csv("processed_data/X_train_70.csv")

# Test data
X_test_70 = pd.read_csv("processed_data/X_test_70.csv")

# Quick overview of the loaded frames
print("\n--- Aperçu des données d'entraînement ---")
print(f"Nombre de lignes train: {X_train_70.shape[0]}, Nombre de colonnes: {X_train_70.shape[1]}")
print(f"Nombre de lignes test: {X_test_70.shape[0]}, Nombre de colonnes: {X_test_70.shape[1]}")
print("\nPremières lignes:")
display(X_train_70.head())


--- Aperçu des données d'entraînement ---
Nombre de lignes train: 730784, Nombre de colonnes: 57
Nombre de lignes test: 857641, Nombre de colonnes: 57

Premières lignes:


Unnamed: 0,ID,day,equity,r0,r1,r2,r3,r4,r5,r6,...,r44,r45,r46,r47,r48,r49,r50,r51,r52,reod
0,1,272,107,-9.76,0.0,-12.21,46.44,34.08,0.0,41.24,...,-16.92,-4.84,4.84,0.0,7.26,-9.68,-19.38,9.71,26.68,0
1,2,323,1063,49.85,0.0,0.0,-26.64,-23.66,-22.14,49.12,...,1.59,6.37,-49.32,-9.59,-6.4,22.41,-6.39,7.99,15.96,-1
2,4,123,1465,-123.84,-115.18,-26.44,0.0,42.42,10.56,0.0,...,-21.44,-21.48,10.78,-21.55,-5.4,-10.81,5.41,-32.47,43.43,-1
3,5,343,1279,-26.91,4.76,9.52,-5.55,-7.14,-1.59,-7.14,...,0.8,-3.19,3.99,-3.19,-4.79,-5.59,6.39,-6.38,-5.59,0
4,6,212,185,0.0,-30.67,4.4,-13.19,13.2,26.37,4.38,...,0.0,6.62,13.23,0.0,0.0,4.4,13.2,-4.4,4.4,-1


In [2]:
# Load the dataset used in the rest of the notebook ("ffbf" files).
# Forward slashes keep these relative paths valid on every OS, Windows included.
x_train_ffbf = pd.read_csv("processed_data/X_train_ffbf.csv")
x_test_ffbf = pd.read_csv("processed_data/X_test_ffbf.csv")

# Copy the target before it is sliced out of the feature frames below.
# NOTE(review): "ffbg" in these names looks like a typo for "ffbf"; the names are
# kept unchanged because other cells may reference them.
y_train_ffbg = x_train_ffbf["reod"].copy()
y_test_ffbg = x_test_ffbf["reod"].copy()

# Keep only the columns from "ID" to "r52" (label-based, inclusive slice).
x_train_ffbf = x_train_ffbf.loc[:, "ID":"r52"]
x_test_ffbf = x_test_ffbf.loc[:, "ID":"r52"]

print("\n--- Aperçu des données d'entraînement ---")
print(f"Nombre de lignes train: {x_train_ffbf.shape[0]}, Nombre de colonnes: {x_train_ffbf.shape[1]}")
print(f"Nombre de lignes test: {x_test_ffbf.shape[0]}, Nombre de colonnes: {x_test_ffbf.shape[1]}")
print("\nPremières lignes:")
display(x_train_ffbf.head())


--- Aperçu des données d'entraînement ---
Nombre de lignes train: 730784, Nombre de colonnes: 56
Nombre de lignes test: 857641, Nombre de colonnes: 56

Premières lignes:


Unnamed: 0,ID,day,equity,r0,r1,r2,r3,r4,r5,r6,...,r43,r44,r45,r46,r47,r48,r49,r50,r51,r52
0,1,272,107,-9.76,0.0,-12.21,46.44,34.08,0.0,41.24,...,-4.83,-16.92,-4.84,4.84,0.0,7.26,-9.68,-19.38,9.71,26.68
1,2,323,1063,49.85,0.0,0.0,-26.64,-23.66,-22.14,49.12,...,-6.37,1.59,6.37,-49.32,-9.59,-6.4,22.41,-6.39,7.99,15.96
2,4,123,1465,-123.84,-115.18,-26.44,0.0,42.42,10.56,0.0,...,-5.36,-21.44,-21.48,10.78,-21.55,-5.4,-10.81,5.41,-32.47,43.43
3,5,343,1279,-26.91,4.76,9.52,-5.55,-7.14,-1.59,-7.14,...,-0.8,0.8,-3.19,3.99,-3.19,-4.79,-5.59,6.39,-6.38,-5.59
4,6,212,185,0.0,-30.67,4.4,-13.19,13.2,26.37,4.38,...,-4.41,0.0,6.62,13.23,0.0,0.0,4.4,13.2,-4.4,4.4


In [3]:
# Load the "ffbf" dataset with its precomputed features.
# Forward slashes keep these relative paths valid on every OS, Windows included.
x_train_ffbf_with_feature = pd.read_csv("processed_data/preprocessed/X_train_ffbf_with_features.csv")
x_test_ffbf_with_feature = pd.read_csv("processed_data/preprocessed/X_test_ffbf_with_features.csv")

# Copy of the target; the "reod" column is NOT removed from the frames here.
y_train_ffbg_with_features = x_train_ffbf_with_feature["reod"].copy()
y_test_ffbg_with_features = x_test_ffbf_with_feature["reod"].copy()

# Total number of missing cells in each frame
train_na_count = x_train_ffbf_with_feature.isna().sum().sum()
test_na_count = x_test_ffbf_with_feature.isna().sum().sum()

print("\n--- Aperçu des données d'entraînement ---")
print(f"Nombre de lignes train: {x_train_ffbf_with_feature.shape[0]}, Nombre de colonnes: {x_train_ffbf_with_feature.shape[1]}")
print(f"Nombre de lignes test: {x_test_ffbf_with_feature.shape[0]}, Nombre de colonnes: {x_test_ffbf_with_feature.shape[1]}")

print("\n--- Contient des Na ? ---")
print(f"Nombre de NA train: {train_na_count}")
# Fixed label: this line reports the TEST count (it used to say "train").
print(f"Nombre de NA test: {test_na_count}")

print("\nPremières lignes:")
display(x_train_ffbf_with_feature.head())


--- Aperçu des données d'entraînement ---
Nombre de lignes train: 730784, Nombre de colonnes: 75
Nombre de lignes test: 857641, Nombre de colonnes: 75

--- Contient des Na ? ---
Nombre de NA train: 0
Nombre de NA train: 0

Premières lignes:


Unnamed: 0,ID,day,equity,r0,r1,r2,r3,r4,r5,r6,...,r_pos_sum,r_neg_sum,r_roll_mean_5,r_roll_std_5,r_roll_mean_10,r_roll_std_10,r_roll_mean_20,r_roll_std_20,r_momentum_5,r_momentum_10
0,1,272,107,-9.76,0.0,-12.21,46.44,34.08,0.0,41.24,...,411.59,-303.83,2.918,17.927125,-0.716,13.660327,3.0335,11.110787,26.68,24.26
1,2,323,1063,49.85,0.0,0.0,-26.64,-23.66,-22.14,49.12,...,402.29,-523.8,6.714,13.011273,-2.375,19.591007,-8.4885,23.371939,25.55,84.04
2,4,123,1465,-123.84,-115.18,-26.44,0.0,42.42,10.56,0.0,...,531.34,-939.58,0.032,27.909429,-5.889,21.85648,-3.45,29.044273,64.98,64.82
3,5,343,1279,-26.91,4.76,9.52,-5.55,-7.14,-1.59,-7.14,...,122.82,-202.09,-3.192,5.385919,-1.835,4.357074,-0.2795,5.405468,-2.4,-5.59
4,6,212,185,0.0,-30.67,4.4,-13.19,13.2,26.37,4.38,...,193.74,-204.49,3.52,6.526255,3.304,6.337378,3.4195,6.772368,4.4,6.61


In [5]:
# List every dataset registered in DATASETS together with its description.
print("\nDatasets disponibles:")
for key, info in DATASETS.items():
    print(f"- {key}: {info['description']}")

# List every model returned by get_models(); the description is optional.
print("\nModèles disponibles:")
models = get_models()
for key, info in models.items():
    line = f"- {key}: {info['description']}" if 'description' in info else f"- {key}"
    print(line)


Datasets disponibles:
- raw: Données brutes
- ffbf: Données forward filled puis backward
- bfff: Données backward filled puis forward
- interp: Données interpolation linéaire puis bffff
- mice: Données MICE imputer puis bfff
- knn: Données knn imputer puis bfff
- raw_with_features: Données brutes avec features
- ffbf_with_features: Données forward filled puis backward avec features
- bfff_with_features: Données backward filled puis forward avec features
- interp_with_features: Données interpolation linéaire puis bffff avec features
- mice_with_features: Données MICE imputer puis bfff avec features
- knn_with_features: Données knn imputer puis bfff avec features

Modèles disponibles:
- xgboost_baseline: XGBoost de base
- xgboost_tuned: XGBoost avec paramètres 
- rf_baseline: Baseline Random Forest model
- logistic: Multinomial Logistic Regression


In [7]:
print("\n--- Comparaison des stratégies d'imputation avec XGBoost ---")

# Imputation strategies to evaluate (only "ffbf" in this draft)
imputation_strategies = ["ffbf"]
imputation_results = []
scalers = {
    'Standard': StandardScaler(),
    'MinMax': MinMaxScaler(),
    'Robust': RobustScaler(),
    'Quantile': QuantileTransformer(output_distribution='normal')
}

# Reference results keyed by (strategy, scaler name). Each comparison below is
# made against the run that used the SAME scaler; looking up "the first result
# for this dataset" would silently compare different scalers.
standard_results = {}
feature_results = {}

# 1) Standard datasets (no engineered features)
for strategy in imputation_strategies:
    for scaler_name, scaler in scalers.items():
        print(f"\n Using {scaler} - Testing {strategy} without feature engineering...")
        print(f"\nTesting {strategy} imputation...")
        try:
            result_without_features = run_experiment(dataset_key=strategy, model_key="xgboost_baseline", add_feat=False, scaler=scaler)
            results_tracker = add_result(results_tracker, result_without_features)
            imputation_results.append(result_without_features)
            standard_results[(strategy, scaler_name)] = result_without_features
            print(f"Accuracy without feature engineering: {result_without_features['accuracy']:.4f}")
        except Exception as e:
            # Best-effort notebook run: report the failure and try the next scaler.
            print(f"Error processing standard {strategy} dataset: {e}")

# 2) Preprocessed datasets that already contain the engineered features
for strategy in imputation_strategies:
    print(f"\nTesting {strategy} with precomputed features...")
    preprocessed_key = f"{strategy}_with_features"

    if preprocessed_key not in DATASETS:
        print(f"Preprocessed dataset '{preprocessed_key}' not found in registry.")
        continue

    for scaler_name, scaler in scalers.items():
        print(f"\n Using {scaler} - Testing normalisation on {strategy} with precomputed features...")
        try:
            # The features are precomputed, so add_feat stays False.
            result_with_features = run_experiment(dataset_key=preprocessed_key, model_key="xgboost_baseline", add_feat=False, scaler=scaler)
            results_tracker = add_result(results_tracker, result_with_features)
            imputation_results.append(result_with_features)
            feature_results[(strategy, scaler_name)] = result_with_features
            print(f"Accuracy with precomputed features: {result_with_features['accuracy']:.4f}")

            # Improvement relative to the standard dataset with the same scaler
            standard_result = standard_results.get((strategy, scaler_name))
            if standard_result is not None:
                improvement = result_with_features['accuracy'] - standard_result['accuracy']
                print(f"Improvement from feature engineering: {improvement:.4f} ({improvement*100:.2f}%)")
        except Exception as e:
            print(f"Error processing preprocessed {strategy} dataset: {e}")

# 3) Same preprocessed datasets, with row normalization enabled
print("\n--- Testing with row normalization ---")
for strategy in imputation_strategies:
    preprocessed_key = f"{strategy}_with_features"

    if preprocessed_key not in DATASETS:
        print(f"Preprocessed dataset '{preprocessed_key}' not found in registry.")
        continue

    for scaler_name, scaler in scalers.items():
        print(f"\n Using {scaler} - Testing row normalisation on {strategy} with precomputed features...")
        try:
            result_normalized = run_experiment(
                dataset_key=preprocessed_key,
                model_key="xgboost_baseline",
                add_feat=False,
                normalize_by_row=True,  # enable row normalization
                scaler=scaler
            )
            results_tracker = add_result(results_tracker, result_normalized)
            imputation_results.append(result_normalized)
            print(f"{strategy} with row normalization: Accuracy = {result_normalized['accuracy']:.4f}")

            # Impact relative to the non-normalized run with the same scaler
            non_normalized = feature_results.get((strategy, scaler_name))
            if non_normalized is not None:
                diff = result_normalized['accuracy'] - non_normalized['accuracy']
                print(f"Impact of row normalization: {diff:.4f} ({diff*100:.2f}%)")
        except Exception as e:
            print(f"Error with row normalization on {strategy}: {e}")


--- Comparaison des stratégies d'imputation avec XGBoost ---

 Using StandardScaler() - Testing normalisation on ffbf with precomputed features...

Testing ffbf imputation...
Aucune valeur manquante détectée.
Accuracy without feature engineering: 0.3136

 Using MinMaxScaler() - Testing normalisation on ffbf with precomputed features...

Testing ffbf imputation...
Aucune valeur manquante détectée.
Accuracy without feature engineering: 0.3136

 Using RobustScaler() - Testing normalisation on ffbf with precomputed features...

Testing ffbf imputation...
Aucune valeur manquante détectée.
Accuracy without feature engineering: 0.3136

 Using QuantileTransformer(output_distribution='normal') - Testing normalisation on ffbf with precomputed features...

Testing ffbf imputation...
Aucune valeur manquante détectée.
Accuracy without feature engineering: 0.3127

Testing ffbf with precomputed features...

 Using StandardScaler() - Testing normalisation on ffbf with precomputed features...
Aucune v

In [14]:
# Number of distinct values in the "day" column of the training features
print(x_train_ffbf["day"].nunique())

503


On remarque que, malgré la baisse de performance avec l'utilisation des features, on a une augmentation de l'accuracy avec le StandardScaler et la normalisation par ligne. On va essayer d'analyser les données avec les lignes normalisées, puis de réduire la dimension en retirant les features non pertinentes, et potentiellement d'en rajouter de nouvelles.

In [4]:
def analyze_normalized_dataset(df, feature_groups=None):
    """
    Analyze a normalized dataset, focusing on different groups of features.

    Empty groups are ignored by every section of the analysis, so all result
    dictionaries expose the same group names (except the correlation
    matrices, which additionally need at least two columns).

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame to analyze
    feature_groups : dict, optional
        Dictionary mapping feature group names to lists of columns to analyze.
        When omitted, groups are detected from the column names:
        'r<digits>' columns, 'r_*' columns without 'roll'/'momentum',
        columns containing 'roll', and columns containing 'momentum'.

    Returns:
    --------
    dict
        Dictionary with analysis results:
        'stats', 'target_correlations' (only when df has a 'reod' column),
        'figures' (histogram figures per group) and 'correlation_matrices'.
    """
    if feature_groups is None:
        # Auto-detect feature groups from the column naming scheme
        feature_groups = {
            'rendements': [col for col in df.columns if col.startswith('r') and col[1:].isdigit()],
            'basic_stats': [col for col in df.columns if col.startswith('r_') and not any(x in col for x in ['roll', 'momentum'])],
            'roll_features': [col for col in df.columns if 'roll' in col],
            'momentum_features': [col for col in df.columns if 'momentum' in col]
        }

    # Drop empty groups once so that every section treats them identically.
    active_groups = {name: columns for name, columns in feature_groups.items() if columns}

    results = {}

    # 1. Basic statistics for each feature group
    stats = {}
    for group_name, columns in active_groups.items():
        group_df = df[columns]
        stats[group_name] = {
            'mean': group_df.mean().describe(),
            'std': group_df.std().describe(),
            'missing': group_df.isna().sum().sum(),
            'n_features': len(columns)
        }

    results['stats'] = stats

    # 2. Correlation with target (if available)
    if 'reod' in df.columns:
        target_correlations = {}
        for group_name, columns in active_groups.items():
            correlations = df[columns].corrwith(df['reod'])
            target_correlations[group_name] = {
                'strongest_positive': correlations.nlargest(3),
                'strongest_negative': correlations.nsmallest(3),
                'mean_abs_corr': correlations.abs().mean()
            }

        results['target_correlations'] = target_correlations

    # 3. Plot distribution of feature values for each group
    fig_dict = {}
    for group_name, columns in active_groups.items():
        # Only the first five columns of a group are plotted
        sample_cols = columns[:5]

        # squeeze=False always yields a 2-D array of axes, even for one subplot
        fig, axes = plt.subplots(len(sample_cols), 1, figsize=(12, 3*len(sample_cols)), squeeze=False)
        fig.suptitle(f'Distribution of {group_name} features')

        for ax, col in zip(axes.ravel(), sample_cols):
            sns.histplot(df[col].dropna(), kde=True, ax=ax)
            ax.set_title(f'{col} (mean={df[col].mean():.2f}, std={df[col].std():.2f})')
            ax.axvline(df[col].mean(), color='r', linestyle='--')

        plt.tight_layout()
        fig_dict[group_name] = fig

    results['figures'] = fig_dict

    # 4. Feature correlations within groups
    correlation_matrices = {}
    for group_name, columns in active_groups.items():
        if len(columns) < 2:  # Need at least 2 features for correlation
            continue

        # Only the first ten columns of a group enter the matrix
        sample_cols = columns[:10]
        corr_matrix = df[sample_cols].corr()
        correlation_matrices[group_name] = corr_matrix

        # Plot the strict lower triangle (upper triangle and diagonal are masked)
        plt.figure(figsize=(10, 8))
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(corr_matrix, mask=mask, cmap='coolwarm', center=0,
                    annot=True,  # at most 10 columns, so annotations stay readable
                    fmt='.2f', square=True)
        plt.title(f'Correlation Matrix: {group_name}')
        plt.tight_layout()

    results['correlation_matrices'] = correlation_matrices

    return results

In [5]:
from utils.experiment_runner import normalize_rendements_by_row
# Per the original note, experiment_runner applies this function automatically;
# it is called by hand here to obtain a normalized copy of the training set.
x_train_ffbf_with_feature_normalized = normalize_rendements_by_row(x_train_ffbf_with_feature)

In [None]:
# Run the grouped analysis (stats, target correlations, plots) on the normalized training set
stats_results = analyze_normalized_dataset(x_train_ffbf_with_feature_normalized)

In [None]:
def find_important_features(df, target_col='reod', threshold=0.05):
    """
    Return the features whose correlation with the target exceeds a threshold.

    Parameters:
    -----------
    df : pandas.DataFrame
        Data containing both the features and the target column
    target_col : str
        Name of the target column
    threshold : float
        Minimum ABSOLUTE correlation with the target for a feature to be kept

    Returns:
    --------
    list
        Names of the retained features, ordered by signed correlation
        (most positive first, most negative last). Empty list when the
        target column is missing.
    """
    if target_col not in df.columns:
        print(f"Target column '{target_col}' not found in DataFrame")
        return []

    # Every column except the target and the 'ID' column is a candidate
    feature_cols = [col for col in df.columns if col not in (target_col, 'ID')]
    correlations = df[feature_cols].corrwith(df[target_col])

    # Keep features by absolute correlation, then order them by signed value
    important_features = correlations[correlations.abs() > threshold].sort_values(ascending=False)

    print(f"Found {len(important_features)} features with |correlation| > {threshold}")
    # Filter by sign so each list only contains what its title announces
    print("\nTop positively correlated features:")
    print(important_features[important_features > 0].head(10))
    print("\nTop negatively correlated features:")
    print(important_features[important_features < 0].sort_values().head(10))

    return important_features.index.tolist()

def compare_normalization_impact(original_df, normalized_df, sample_cols=None):
    """
    Plot, side by side, how normalization changed a few columns.

    For each sampled column a pair of histograms (original vs normalized) is
    shown, followed by both correlation matrices and their difference.

    Parameters:
    -----------
    original_df : pandas.DataFrame
        Data before normalization
    normalized_df : pandas.DataFrame
        Data after normalization
    sample_cols : sequence of str, optional
        Columns to compare. When omitted, up to five columns named
        'r<digits>' that exist in both frames are drawn at random.
    """
    # Columns present in both frames, in the order of the original frame
    shared_columns = [name for name in original_df.columns if name in normalized_df.columns]

    if sample_cols is None:
        # Restrict the random draw to the 'r<digits>' columns
        return_columns = [name for name in shared_columns if name.startswith('r') and name[1:].isdigit()]
        n_picked = min(5, len(return_columns))
        sample_cols = np.random.choice(return_columns, n_picked, replace=False)

    # One figure per column: left panel = original, right panel = normalized
    panels = (('Original', original_df), ('Normalized', normalized_df))
    for col in sample_cols:
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        for ax, (label, frame) in zip(axes, panels):
            sns.histplot(frame[col].dropna(), kde=True, ax=ax)
            ax.set_title(f'{label}: {col}')
            ax.axvline(frame[col].mean(), color='r', linestyle='--',
                       label=f'Mean: {frame[col].mean():.2f}')
            ax.legend()

        plt.tight_layout()
        plt.show()

    print("Analyzing correlation structure changes...")

    orig_corr = original_df[sample_cols].corr()
    norm_corr = normalized_df[sample_cols].corr()

    # Both correlation matrices next to each other
    fig, axes = plt.subplots(1, 2, figsize=(16, 7))
    heatmaps = (
        (orig_corr, 'Original Correlation Matrix'),
        (norm_corr, 'Normalized Correlation Matrix'),
    )
    for ax, (matrix, title) in zip(axes, heatmaps):
        sns.heatmap(matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
        ax.set_title(title)

    plt.tight_layout()
    plt.show()

    # Element-wise change introduced by the normalization
    diff_corr = norm_corr - orig_corr

    plt.figure(figsize=(10, 8))
    sns.heatmap(diff_corr, annot=True, cmap='coolwarm', center=0, fmt='.2f')
    plt.title('Change in Correlation Matrix (Normalized - Original)')
    plt.tight_layout()
    plt.show()

In [None]:
def analyze_distributions(df, sample_cols=5):
    """
    Plot the distribution of the first return columns and summarize them all.

    NOTE: defining this function in the notebook shadows the
    `analyze_distributions` imported from `utils.graphic` at the top.

    Parameters:
    -----------
    df : pandas.DataFrame
        Data whose return columns are named 'r<digits>' (r0, r1, ...)
    sample_cols : int
        Number of leading return columns to plot

    Returns:
    --------
    pandas.DataFrame
        One row per return column with its 'mean', 'std', 'min' and 'max';
        an empty frame with those columns when df has no return column.
    """
    stat_names = ['mean', 'std', 'min', 'max']

    # Select the return columns (r0, r1, ...)
    r_cols = [col for col in df.columns if col.startswith('r') and col[1:].isdigit()]
    if not r_cols:
        # Nothing to plot or describe
        return pd.DataFrame(columns=stat_names)

    # Plot only the first `sample_cols` columns
    sample = r_cols[:sample_cols]

    if sample:
        # squeeze=False keeps `axes` a 2-D array even with a single subplot,
        # which would otherwise not be indexable.
        fig, axes = plt.subplots(len(sample), 1, figsize=(12, 4*len(sample)), squeeze=False)

        for ax, col in zip(axes.ravel(), sample):
            # Histogram with density curve
            sns.histplot(df[col], kde=True, ax=ax)
            ax.set_title(f'Distribution de {col} après normalisation')
            ax.axvline(0, color='r', linestyle='--')  # reference line at the theoretical mean (0)

        plt.tight_layout()
        plt.show()

    # Basic statistics over ALL return columns, to check the normalization
    stats = df[r_cols].describe().T[stat_names]
    return stats

# Correlation of every feature with the target
def analyze_target_correlations(df, target_col='reod'):
    """
    Print and plot the correlation of each feature with the target.

    Parameters:
    -----------
    df : pandas.DataFrame
        Data containing the features and the target column
    target_col : str
        Name of the target column

    Returns:
    --------
    pandas.Series
        Correlation with the target, indexed by feature name
    """
    # 'ID' and the target itself are not features
    excluded = ('ID', target_col)
    predictors = [name for name in df.columns if name not in excluded]

    correlations = df[predictors].corrwith(df[target_col])

    # Strongest correlations in each direction
    print("Top features par corrélation positive:")
    print(correlations.nlargest(10))

    print("\nTop features par corrélation négative:")
    print(correlations.nsmallest(10))

    # Bar chart of the 20 largest absolute correlations
    plt.figure(figsize=(12, 8))
    ranked_abs = correlations.abs().sort_values(ascending=False)
    ranked_abs.head(20).plot(kind='bar')
    plt.title('Top 20 features par corrélation absolue avec la cible')
    plt.tight_layout()
    plt.show()

    return correlations

In [None]:
# Features of the normalized training set whose correlation with the target passes the default threshold
important_features = find_important_features(x_train_ffbf_with_feature_normalized)

In [None]:
# Visual comparison of the training set before and after row normalization
compare_normalization_impact(x_train_ffbf_with_feature, x_train_ffbf_with_feature_normalized)

In [None]:

def select_features_with_various_methods(X, y, methods=('correlation', 'f_value', 'mutual_info', 'pca')):
    """
    Select features using various methods and compare the results.

    Parameters:
    -----------
    X : pandas.DataFrame
        Feature matrix
    y : pandas.Series
        Target variable
    methods : sequence of str
        Feature selection methods to use, among 'correlation', 'f_value',
        'mutual_info' and 'pca'. Unknown names are ignored.

    Returns:
    --------
    dict
        One entry per executed method. Ranking methods expose 'features'
        (best first) and 'scores'; 'pca' exposes 'explained_variance',
        'cumulative_variance' and 'components_needed', where a threshold maps
        to None when the fitted components never reach it.
    """
    results = {}
    feature_names = X.columns.tolist()

    # Method 1: Correlation with target
    if 'correlation' in methods:
        # Convert target to numeric if needed
        y_numeric = pd.to_numeric(y, errors='coerce')

        # Pearson correlation of each feature with the target
        correlations = pd.Series(
            [np.corrcoef(X[col].values, y_numeric.values)[0, 1] for col in feature_names],
            index=feature_names
        )

        # Rank features by absolute correlation
        abs_corr = correlations.abs().sort_values(ascending=False)

        results['correlation'] = {
            'features': abs_corr.index.tolist(),
            'scores': abs_corr.values,
            'top_positive': correlations.nlargest(10),
            'top_negative': correlations.nsmallest(10)
        }

        plt.figure(figsize=(12, 6))
        correlations.abs().sort_values().tail(20).plot(kind='barh')
        plt.title('Top 20 Features by Absolute Correlation with Target')
        plt.tight_layout()
        plt.show()

    # Method 2: ANOVA F-value
    if 'f_value' in methods:
        selector = SelectKBest(score_func=f_classif, k='all')
        selector.fit(X, y)

        f_scores = pd.Series(selector.scores_, index=feature_names)
        p_values = pd.Series(selector.pvalues_, index=feature_names)

        # Rank features by F-score
        ranked_f = f_scores.sort_values(ascending=False)
        top_f_features = ranked_f.index.tolist()

        results['f_value'] = {
            'features': top_f_features,
            'scores': ranked_f.values,
            'p_values': p_values[top_f_features]
        }

        plt.figure(figsize=(12, 6))
        f_scores.sort_values().tail(20).plot(kind='barh')
        plt.title('Top 20 Features by F-score')
        plt.tight_layout()
        plt.show()

    # Method 3: Mutual Information
    if 'mutual_info' in methods:
        selector = SelectKBest(score_func=mutual_info_classif, k='all')
        selector.fit(X, y)

        mi_scores = pd.Series(selector.scores_, index=feature_names)

        # Rank features by mutual information
        ranked_mi = mi_scores.sort_values(ascending=False)

        results['mutual_info'] = {
            'features': ranked_mi.index.tolist(),
            'scores': ranked_mi.values
        }

        plt.figure(figsize=(12, 6))
        mi_scores.sort_values().tail(20).plot(kind='barh')
        plt.title('Top 20 Features by Mutual Information')
        plt.tight_layout()
        plt.show()

    # Method 4: PCA
    if 'pca' in methods:
        # Standardize the data
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Apply PCA
        pca = PCA(n_components=min(X.shape[1], 100))  # Limit to 100 components max
        pca.fit(X_scaled)

        explained_variance = pca.explained_variance_ratio_
        cumulative_variance = np.cumsum(explained_variance)

        # Number of components needed for different variance thresholds
        variance_thresholds = [0.7, 0.8, 0.9, 0.95]
        components_needed = {}

        for threshold in variance_thresholds:
            reached = cumulative_variance >= threshold
            # np.argmax returns 0 on an all-False array, which would wrongly
            # report "1 component" when the threshold is never reached.
            components_needed[threshold] = np.argmax(reached) + 1 if reached.any() else None

        results['pca'] = {
            'explained_variance': explained_variance,
            'cumulative_variance': cumulative_variance,
            'components_needed': components_needed
        }

        plt.figure(figsize=(12, 6))
        plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, 'o-')
        plt.title('Cumulative Explained Variance')
        plt.xlabel('Number of Components')
        plt.ylabel('Cumulative Explained Variance')

        # Add lines for thresholds
        for threshold in variance_thresholds:
            needed = components_needed[threshold]
            if needed is None:
                label = f"{threshold*100}%: not reached"
            else:
                label = f"{threshold*100}%: {needed} components"
            plt.axhline(y=threshold, color='r', linestyle='--')
            plt.text(len(cumulative_variance) * 0.8, threshold + 0.01, label)

        plt.tight_layout()
        plt.show()

    # Compare the rankings pairwise; only methods that actually produced a
    # ranking take part (PCA has none, unknown names have no result).
    ranking_methods = [m for m in methods if m != 'pca' and m in results]
    for idx, method1 in enumerate(ranking_methods):
        for method2 in ranking_methods[idx + 1:]:
            # Top 50 features from each method
            top_features1 = results[method1]['features'][:50]
            top_features2 = results[method2]['features'][:50]

            common_features = set(top_features1).intersection(top_features2)

            print(f"Overlap between {method1} and {method2} (top 50): {len(common_features)} features")
            # Sort before slicing so the preview is deterministic
            print(f"Common features: {sorted(common_features)[:10]}...")

    return results


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

# Method 1: selection by correlation with the target
def select_by_correlation(X, y, top_n=None, plot=True):
    """
    Rank features by the absolute value of their correlation with the target.

    Parameters:
    -----------
    X : pandas.DataFrame
        Feature matrix
    y : pandas.Series
        Target variable (coerced to numeric)
    top_n : int, optional
        Number of features to keep; every feature is kept when falsy
    plot : bool
        Whether to draw the top-20 bar chart and print the extreme correlations

    Returns:
    --------
    dict
        'selected_features' (list, best first), 'correlations' (signed) and
        'abs_correlations' (absolute values, sorted in decreasing order)
    """
    target_values = pd.to_numeric(y, errors='coerce').values
    columns = X.columns

    # Off-diagonal entry of the 2x2 matrix = correlation feature/target
    coefficients = []
    for name in columns:
        coefficients.append(np.corrcoef(X[name].values, target_values)[0, 1])
    correlations = pd.Series(coefficients, index=columns)

    # Strongest relationship first, whatever its sign
    abs_corr = correlations.abs().sort_values(ascending=False)

    kept = abs_corr.head(top_n) if top_n else abs_corr
    selected_features = kept.index.tolist()

    if plot:
        plt.figure(figsize=(12, 6))
        abs_corr.head(20).plot(kind='barh')
        plt.title('Top 20 features par corrélation absolue avec la cible')
        plt.xlabel('Corrélation absolue')
        plt.tight_layout()
        plt.show()

        # Extreme correlations in each direction
        print("Top corrélations positives:")
        print(correlations.nlargest(10))
        print("\nTop corrélations négatives:")
        print(correlations.nsmallest(10))

    return {
        'selected_features': selected_features,
        'correlations': correlations,
        'abs_correlations': abs_corr
    }

# Method 2: selection by ANOVA F-value
def select_by_f_value(X, y, top_n=None, plot=True):
    """
    Rank features by their ANOVA F-score against the target.

    Parameters:
    -----------
    X : pandas.DataFrame
        Feature matrix
    y : array-like
        Target variable
    top_n : int, optional
        Number of features to keep; every feature is kept when falsy
    plot : bool
        Whether to draw the top-20 bar chart and print the top-10 p-values

    Returns:
    --------
    dict
        'selected_features' (list, best first), 'f_scores' and 'p_values'
        (both indexed by feature name, in the column order of X)
    """
    names = X.columns

    # k='all' keeps every feature: the selector is only used for its scores
    anova = SelectKBest(score_func=f_classif, k='all')
    anova.fit(X, y)

    f_scores = pd.Series(anova.scores_, index=names)
    p_values = pd.Series(anova.pvalues_, index=names)

    # Highest F-score first
    sorted_features = f_scores.sort_values(ascending=False)

    kept = sorted_features.head(top_n) if top_n else sorted_features
    selected_features = kept.index.tolist()

    if plot:
        plt.figure(figsize=(12, 6))
        sorted_features.head(20).plot(kind='barh')
        plt.title('Top 20 features par F-score (ANOVA)')
        plt.xlabel('F-score')
        plt.tight_layout()
        plt.show()

        # p-values show the statistical significance of the best scores
        print("Top features par F-score avec leurs p-values:")
        best_ten = sorted_features.head(10).index
        for feature in best_ten:
            print(f"{feature}: F-score = {f_scores[feature]:.2f}, p-value = {p_values[feature]:.6f}")

    return {
        'selected_features': selected_features,
        'f_scores': f_scores,
        'p_values': p_values
    }

# Method 3: selection by mutual information
def select_by_mutual_info(X, y, top_n=None, plot=True, random_state=None):
    """
    Rank features by their estimated mutual information with the target.

    Arguments:
        X: DataFrame of features.
        y: Target labels.
        top_n: Number of features to keep; a falsy value keeps all of them.
        plot: When True, draw the 20 best scores and print the 10 best features.
        random_state: Forwarded to `mutual_info_classif`. The estimator is
            stochastic, so pass an int to get the same ranking on every run.
            The default (None) keeps the previous, non-seeded behaviour.

    Returns:
        dict with 'selected_features' (names by decreasing score) and
        'mi_scores' (indexed by feature name, original column order).
    """
    from functools import partial

    feature_names = X.columns

    # k='all' keeps every feature: the selector is only used to compute scores.
    # partial() pins the seed while keeping the (X, y) signature SelectKBest expects.
    score_func = partial(mutual_info_classif, random_state=random_state)
    selector = SelectKBest(score_func=score_func, k='all')
    selector.fit(X, y)

    mi_scores = pd.Series(selector.scores_, index=feature_names)

    # Highest score first
    sorted_features = mi_scores.sort_values(ascending=False)

    # Keep the requested number of features
    if top_n:
        selected_features = sorted_features.head(top_n).index.tolist()
    else:
        selected_features = sorted_features.index.tolist()

    # Optional visualisation
    if plot:
        plt.figure(figsize=(12, 6))
        sorted_features.head(20).plot(kind='barh')
        plt.title('Top 20 features par information mutuelle')
        plt.xlabel('Score d\'information mutuelle')
        plt.tight_layout()
        plt.show()

        print("Top features par information mutuelle:")
        print(sorted_features.head(10))

    return {
        'selected_features': selected_features,
        'mi_scores': mi_scores
    }

# Compare the rankings produced by the different selection methods
def compare_feature_selection_methods(results_dict, top_n=20):
    """
    Print a side-by-side comparison of the top features chosen by each method.

    Arguments:
        results_dict: Mapping {method name: result dict}; each result dict must
            expose a 'selected_features' list ordered by decreasing importance.
        top_n: Number of leading features taken from each method.

    Prints the comparison table and the features shared by all methods, then
    draws a Venn diagram when `matplotlib_venn` is installed and there are
    2 or 3 methods. Returns None.
    """
    methods = list(results_dict.keys())

    if len(methods) < 2:
        print("Besoin d'au moins deux méthodes pour faire une comparaison")
        return

    # Truncate every ranking once and reuse it for the table, the intersection and the plot
    top_features = {
        method: list(results_dict[method]['selected_features'][:top_n])
        for method in methods
    }

    # Build the table in one go: adding columns one at a time to an empty
    # DataFrame would lock the index to the first method's length and
    # silently drop the tail of any longer ranking.
    comparison = pd.DataFrame(
        {method: pd.Series(features) for method, features in top_features.items()}
    )

    print(f"Comparaison des top {top_n} features par méthode:")
    print(comparison)

    # Features present in the top_n of every method
    feature_sets = [set(top_features[method]) for method in methods]
    common_features = set.intersection(*feature_sets)

    print(f"\nFeatures communes à toutes les méthodes: {len(common_features)}")
    if common_features:
        print(sorted(common_features))

    # The Venn diagram is optional: only the import is guarded so that real
    # plotting errors are not mistaken for a missing package.
    try:
        from matplotlib_venn import venn2, venn3
    except ImportError:
        print("Package matplotlib_venn non disponible pour le diagramme de Venn")
        return

    if len(methods) > 3:
        # Nothing to draw: do not open an empty figure
        print("Diagramme de Venn limité à 2 ou 3 ensembles")
        return

    plt.figure(figsize=(10, 8))
    draw_venn = venn2 if len(methods) == 2 else venn3
    draw_venn(feature_sets, set_labels=methods)
    plt.title(f"Recouvrement des top {top_n} features par méthode")
    plt.show()

In [None]:
# Data preparation: remove the identifier / target columns when they are present
# NOTE(review): the feature frame is named "...ffbf..." while the target is
# "...ffbg..." — confirm that both names are really defined in an earlier cell.
X = x_train_ffbf_with_feature_normalized.drop(columns=['ID', 'reod'], errors='ignore')
y = y_train_ffbg_with_features

# Run each selection method, keeping the 50 best features
corr_results = select_by_correlation(X, y, top_n=50)
f_results = select_by_f_value(X, y, top_n=50)
mi_results = select_by_mutual_info(X, y, top_n=50)

# Compare the three rankings on their top 20
all_results = dict(
    correlation=corr_results,
    f_value=f_results,
    mutual_info=mi_results,
)
compare_feature_selection_methods(all_results, top_n=20)

# Three possible ways to turn the rankings into a feature set:
# 1. the features shared by the top 20 of every method
common_features = set.intersection(
    *(set(res['selected_features'][:20]) for res in (corr_results, f_results, mi_results))
)

# 2. the leading features of one preferred method (correlation, as an example)
top_features = corr_results['selected_features'][:30]

# 3. the union of the top 10 of every method
union_features = set()
for method_results in all_results.values():
    union_features |= set(method_results['selected_features'][:10])

print(f"Nombre de features uniques dans l'union des top 10 de chaque méthode: {len(union_features)}")

In [None]:
def optimize_feature_count(X, y, model_factory, feature_ranking,
                           n_features_range=range(5, 101, 5),
                           scale_method='standard',
                           cv=5):
    """
    Find how many top-ranked features give the best cross-validated accuracy.

    Arguments:
        X: DataFrame containing all the features.
        y: Target variable.
        model_factory: Zero-argument callable returning a new model instance.
        feature_ranking: Feature names ordered by decreasing importance.
        n_features_range: Feature counts to try. Counts larger than the ranking
            are capped to its length, and non-positive or duplicate counts are
            skipped, so each distinct feature set is evaluated exactly once.
        scale_method: 'standard', 'robust' or 'quantile'; any other value
            (e.g. None) disables scaling.
        cv: Passed to `cross_val_score` as its `cv` argument.

    Returns:
        dict with 'optimal_n_features', 'best_score', 'optimal_features'
        (list of names) and 'results' (DataFrame with one row per tested count).

    Raises:
        ValueError: if no usable feature count remains (empty range or empty ranking).
    """
    # Validate before the heavy imports so that bad input fails fast
    ranked_features = list(feature_ranking)
    candidate_counts = []
    for requested in n_features_range:
        # Slicing past the end would silently reuse the full ranking while
        # still reporting the requested (wrong) count, so cap it explicitly.
        effective = min(int(requested), len(ranked_features))
        if effective > 0 and effective not in candidate_counts:
            candidate_counts.append(effective)

    if not candidate_counts:
        raise ValueError(
            "Aucun nombre de features valide à tester: "
            "vérifiez n_features_range et feature_ranking"
        )

    from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import cross_val_score
    from tqdm import tqdm

    # Pick the scaler matching the requested method
    if scale_method == 'standard':
        scaler = StandardScaler()
    elif scale_method == 'robust':
        scaler = RobustScaler()
    elif scale_method == 'quantile':
        scaler = QuantileTransformer(output_distribution='normal')
    else:
        scaler = None

    # One entry per tested feature count
    results = []

    for n_features in tqdm(candidate_counts):
        # Keep the n best-ranked features
        selected_features = ranked_features[:n_features]
        X_selected = X[selected_features]

        # Wrap the model in a pipeline only when scaling is requested
        if scaler is not None:
            pipeline = Pipeline([
                ('scaler', scaler),
                ('model', model_factory())
            ])
        else:
            pipeline = model_factory()

        # Cross-validated accuracy for this feature subset
        cv_scores = cross_val_score(pipeline, X_selected, y, cv=cv, scoring='accuracy')

        results.append({
            'n_features': n_features,
            'mean_cv_score': cv_scores.mean(),
            'std_cv_score': cv_scores.std(),
            'features': selected_features
        })

    # A DataFrame makes the results easier to analyse
    results_df = pd.DataFrame(results)

    # Row with the highest mean CV score (first one in case of ties)
    best_result = results_df.loc[results_df['mean_cv_score'].idxmax()]
    optimal_n_features = int(best_result['n_features'])
    best_score = best_result['mean_cv_score']

    print(f"Nombre optimal de features: {optimal_n_features}")
    print(f"Score de validation croisée: {best_score:.4f} ± {best_result['std_cv_score']:.4f}")

    # Plot mean score (with std error bars) against the number of features
    plt.figure(figsize=(12, 6))
    plt.errorbar(results_df['n_features'], results_df['mean_cv_score'],
                 yerr=results_df['std_cv_score'], fmt='o-')
    plt.axvline(x=optimal_n_features, color='r', linestyle='--',
                label=f'Optimal: {optimal_n_features} features')
    plt.xlabel('Nombre de features')
    plt.ylabel('Score de validation croisée (accuracy)')
    plt.title('Impact du nombre de features sur la performance du modèle')
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

    return {
        'optimal_n_features': optimal_n_features,
        'best_score': best_score,
        'optimal_features': best_result['features'],
        'results': results_df
    }

In [None]:
from xgboost import XGBClassifier

# First obtain a feature ranking (here: absolute correlation with the target).
# NOTE: the ranking is computed on the same X, y that are cross-validated
# below, so the reported CV scores may be optimistic.
corr_results = select_by_correlation(X, y)
feature_ranking = corr_results['selected_features']


# Factory returning a brand-new model instance on every call
def model_factory():
    """Build an unfitted 3-class XGBoost classifier with a fixed seed."""
    hyperparams = {
        'objective': "multi:softmax",
        'num_class': 3,
        'n_estimators': 100,
        'random_state': 42,
    }
    return XGBClassifier(**hyperparams)


# Search for the best number of features: 5 to 100 in steps of 5
optimization_results = optimize_feature_count(
    X,
    y,
    model_factory=model_factory,
    feature_ranking=feature_ranking,
    n_features_range=range(5, 101, 5),
)

# Features to use for the final model
optimal_features = optimization_results['optimal_features']
print(f"Features optimales: {optimal_features}")