Ce notebook est un "brouillon" pour tenter d'améliorer le modèle. (globalement je vais faire ça comme un gros sagouin)

On ne va se concentrer que sur un seul jeu de données : on se fiche un peu de la méthode d'imputation, la différence est probablement négligeable.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import datetime

# Plot appearance: matplotlib style sheet, seaborn context and default figure size
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_context("notebook", font_scale=1.2)
plt.rcParams.update({'figure.figsize': (12, 8)})

# Importation des modules utilitaires avec la nouvelle structure
from utils.data_registry import DATASETS
from utils.benchmarks import get_models
from utils.data_loading import load_datasets
from utils.data_preprocessing import normalize_rendements_by_row
from utils.feature_engineering import add_features, add_financial_features
from utils.experiment_runner import run_experiment, display_experiment_result, add_result
from utils.data_analysis import (
    analyze_distributions, 
    compare_column_stats, 
    analyze_normalization,
    analyze_normalized_dataset,
    compare_normalization_impact,
    perform_pca_analysis,
    analyze_correlations
)
from utils.feature_selection import (
    select_by_correlation,
    select_by_f_value,
    select_by_mutual_info, 
    compare_feature_selection_methods,
    optimize_feature_count,
    find_important_features
)
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from tqdm import tqdm


In [None]:

# Empty table that accumulates the experiment results (rows are added via add_result).
results_tracker = pd.DataFrame(
    columns=[
        # which data / model was evaluated
        "dataset",
        "dataset_description",
        "model",
        "model_description",
        # feature-engineering settings
        "features_added",
        "feature_sets",
        # evaluation metrics
        "accuracy",
        "precision_weighted",
        "recall_weighted",
        "f1_weighted",
    ]
)


In [None]:

# =========================================
# Section 1: Data loading
# =========================================
print("Chargement des données...")

# 1.1 Load the raw dataset, kept as a reference.
# The paths use forward slashes: Python accepts them on Windows as well as on
# POSIX systems, whereas a backslash separator only resolves on Windows.
X_train_70 = pd.read_csv("processed_data/X_train_70.csv")
X_test_70 = pd.read_csv("processed_data/X_test_70.csv")

print("\n--- Aperçu des données d'entraînement de référence ---")
print(f"Nombre de lignes train: {X_train_70.shape[0]}, Nombre de colonnes: {X_train_70.shape[1]}")
print(f"Nombre de lignes test: {X_test_70.shape[0]}, Nombre de colonnes: {X_test_70.shape[1]}")

In [None]:
# 1.2 Load the main dataset (ffbf).
# Forward slashes keep the paths valid on both Windows and POSIX systems
# (a backslash separator only resolves on Windows).
x_train_ffbf = pd.read_csv("processed_data/X_train_ffbf.csv")
x_test_ffbf = pd.read_csv("processed_data/X_test_ffbf.csv")

# Split off the target column ("reod") before the frames are trimmed below
y_train_ffbf = x_train_ffbf["reod"].copy()
y_test_ffbf = x_test_ffbf["reod"].copy()

# Keep only the columns from "ID" through "r52" (label-based slice, both ends included)
x_train_ffbf = x_train_ffbf.loc[:, "ID":"r52"]
x_test_ffbf = x_test_ffbf.loc[:, "ID":"r52"]

print("\n--- Aperçu des données d'entraînement ffbf ---")
print(f"Nombre de lignes train: {x_train_ffbf.shape[0]}, Nombre de colonnes: {x_train_ffbf.shape[1]}")
print(f"Nombre de lignes test: {x_test_ffbf.shape[0]}, Nombre de colonnes: {x_test_ffbf.shape[1]}")


In [None]:

# 1.3 Load the dataset that already contains the precomputed features.
# Forward slashes keep the paths valid on both Windows and POSIX systems
# (a backslash separator only resolves on Windows).
x_train_ffbf_with_feature = pd.read_csv("processed_data/preprocessed/X_train_ffbf_with_features.csv")
x_test_ffbf_with_feature = pd.read_csv("processed_data/preprocessed/X_test_ffbf_with_features.csv")

# Target column, copied so that later changes to the frames do not affect it
y_train_ffbf_with_features = x_train_ffbf_with_feature["reod"].copy()
y_test_ffbf_with_features = x_test_ffbf_with_feature["reod"].copy()

# Total number of missing cells across all columns
train_na_count = x_train_ffbf_with_feature.isna().sum().sum()
test_na_count = x_test_ffbf_with_feature.isna().sum().sum()

print("\n--- Aperçu des données d'entraînement avec features ---")
print(f"Nombre de lignes train: {x_train_ffbf_with_feature.shape[0]}, Nombre de colonnes: {x_train_ffbf_with_feature.shape[1]}")
print(f"Nombre de lignes test: {x_test_ffbf_with_feature.shape[0]}, Nombre de colonnes: {x_test_ffbf_with_feature.shape[1]}")
print(f"\nNombre de NA train: {train_na_count}")
print(f"Nombre de NA test: {test_na_count}")


In [None]:

# List the registered datasets, then the available models
print("\nDatasets disponibles:")
for key, info in DATASETS.items():
    print(f"- {key}: {info['description']}")

print("\nModèles disponibles:")
models = get_models()
for key, info in models.items():
    # A model entry may come without a description: fall back to its key only
    print(f"- {key}: {info['description']}" if 'description' in info else f"- {key}")


In [None]:

# =========================================
# Section 2: Reference models
# =========================================
print("\n\n--- Comparaison des stratégies d'imputation avec XGBoost ---")

# 2.1 Baseline configuration: one imputation strategy, several column scalers
imputation_strategies = ["ffbf"]
imputation_results = []
scalers = {
    'Standard': StandardScaler(),
    'MinMax': MinMaxScaler(),
    'Robust': RobustScaler(),
    'Quantile': QuantileTransformer(output_distribution='normal')
}

# Results indexed by (strategy, scaler name). The comparisons printed in 2.3
# and 2.4 must be made against the run that used the SAME scaler; looking up
# the first matching entry of `imputation_results` would always return the
# run of the first scaler and mix scaler effects into the reported deltas.
raw_results_by_scaler = {}
feature_results_by_scaler = {}

# 2.2 Every scaler on the plain dataset (no feature engineering)
for strategy in imputation_strategies:
    for scaler_name, scaler in scalers.items():
        print(f"\n Using {scaler} - Testing normalisation on {strategy}...")
        print(f"\nTesting {strategy} imputation...")
        try:
            result_without_features = run_experiment(
                dataset_key=strategy,
                model_key="xgboost_baseline",
                add_feat=False,
                scaler=scaler
            )
            results_tracker = add_result(results_tracker, result_without_features)
            imputation_results.append(result_without_features)
            raw_results_by_scaler[(strategy, scaler_name)] = result_without_features
            print(f"Accuracy without feature engineering: {result_without_features['accuracy']:.4f}")
        except Exception as e:
            # Best effort: a failing configuration must not stop the other runs
            print(f"Error processing standard {strategy} dataset: {e}")

# 2.3 Every scaler on the dataset with precomputed features
for strategy in imputation_strategies:
    print(f"\nTesting {strategy} with precomputed features...")
    for scaler_name, scaler in scalers.items():
        print(f"\n Using {scaler} - Testing normalisation on {strategy} with precomputed features...")
        try:
            preprocessed_key = f"{strategy}_with_features"

            if preprocessed_key in DATASETS:
                result_with_features = run_experiment(
                    dataset_key=preprocessed_key,
                    model_key="xgboost_baseline",
                    add_feat=False,
                    scaler=scaler
                )
                results_tracker = add_result(results_tracker, result_with_features)
                imputation_results.append(result_with_features)
                feature_results_by_scaler[(strategy, scaler_name)] = result_with_features
                print(f"Accuracy with precomputed features: {result_with_features['accuracy']:.4f}")

                # Gain over the plain dataset, measured with the same scaler
                standard_result = raw_results_by_scaler.get((strategy, scaler_name))
                if standard_result is not None:
                    improvement = result_with_features['accuracy'] - standard_result['accuracy']
                    print(f"Improvement from feature engineering: {improvement:.4f} ({improvement*100:.2f}%)")
            else:
                print(f"Preprocessed dataset '{preprocessed_key}' not found in registry.")
        except Exception as e:
            print(f"Error processing preprocessed {strategy} dataset: {e}")

# 2.4 Same runs with row normalisation switched on
print("\n--- Testing with row normalization ---")
for strategy in imputation_strategies:
    for scaler_name, scaler in scalers.items():
        print(f"\n Using {scaler} - Testing normalisation on {strategy} with precomputed features...")
        preprocessed_key = f"{strategy}_with_features"
        if preprocessed_key in DATASETS:
            try:
                print(f"\nTesting normalisation on {strategy} with precomputed features...")
                result_normalized = run_experiment(
                    dataset_key=preprocessed_key,
                    model_key="xgboost_baseline",
                    add_feat=False,
                    normalize_by_row=True,
                    scaler=scaler
                )
                results_tracker = add_result(results_tracker, result_normalized)
                imputation_results.append(result_normalized)
                print(f"{strategy} with row normalization: Accuracy = {result_normalized['accuracy']:.4f}")

                # Effect of row normalisation, measured against the 2.3 run
                # that used the same scaler
                non_normalized = feature_results_by_scaler.get((strategy, scaler_name))
                if non_normalized is not None:
                    diff = result_normalized['accuracy'] - non_normalized['accuracy']
                    print(f"Impact of row normalization: {diff:.4f} ({diff*100:.2f}%)")
            except Exception as e:
                print(f"Error with row normalization on {strategy}: {e}")


In [None]:

# =========================================
# Section 3: Analysis of the row-normalised data
# =========================================
print("\n\n--- Analyse des données normalisées ---")

# 3.1 Apply the row-wise normalisation to the training data
x_train_ffbf_with_feature_normalized = normalize_rendements_by_row(x_train_ffbf_with_feature)

# 3.2 Distributions after normalisation
# Rendement columns are those named "r" followed by digits only (r0, r1, ...)
rendement_cols = [col for col in x_train_ffbf_with_feature_normalized.columns if col.startswith('r') and col[1:].isdigit()]
sample_cols = ['r0', 'r10', 'r25', 'r40', 'r52']

print(f"Analyse des distributions de {len(rendement_cols)} colonnes de rendement après normalisation")

# Only the sample columns are drawn: stacking one subplot per rendement column
# in a single figure leaves each histogram too small to read. The figure
# height grows with the number of subplots actually drawn.
cols_to_plot = [col for col in sample_cols if col in rendement_cols]
if cols_to_plot:
    print(f"Affichage d'un échantillon de {len(cols_to_plot)} colonnes: {cols_to_plot}")
    plt.figure(figsize=(15, 3 * len(cols_to_plot)))
    for i, col in enumerate(cols_to_plot, start=1):
        values = x_train_ffbf_with_feature_normalized[col]
        mean_value = values.mean()  # computed once, used for both the line and its label
        plt.subplot(len(cols_to_plot), 1, i)
        sns.histplot(values, kde=True)
        plt.title(f'Distribution de {col} après normalisation')
        plt.axvline(mean_value, color='r', linestyle='--',
                    label=f'Moyenne: {mean_value:.4f}')
        plt.legend()
    plt.tight_layout()
    plt.show()
else:
    print("Aucune des colonnes d'échantillon n'est présente dans les données normalisées.")


In [None]:


# 3.3 Before/after comparison of the normalisation.
# The original and the normalised training frames are handed to the project
# helper together with the sample columns defined in step 3.2.
print("\nComparaison des données avant et après normalisation:")
compare_normalization_impact(
    x_train_ffbf_with_feature, 
    x_train_ffbf_with_feature_normalized, 
    sample_cols=sample_cols
)


In [None]:

# =========================================
# Section 4: Feature selection
# =========================================
print("\n\n--- Sélection des Features Importantes ---")

# 4.1 Feature matrix and target shared by every selection method
# (identifier and target columns are removed when present)
X = x_train_ffbf_with_feature_normalized.drop(columns=['ID', 'reod'], errors='ignore')
y = y_train_ffbf_with_features

# 4.2 Rank the features with three different criteria
print("\nMéthode 1: Sélection par corrélation")
corr_results = select_by_correlation(X, y, top_n=50)

print("\nMéthode 2: Sélection par test ANOVA (F-value)")
f_results = select_by_f_value(X, y, top_n=50)

print("\nMéthode 3: Sélection par information mutuelle")
mi_results = select_by_mutual_info(X, y, top_n=50)

# 4.3 Side-by-side comparison of the three rankings
all_results = dict(
    correlation=corr_results,
    f_value=f_results,
    mutual_info=mi_results,
)
print("\nComparaison des méthodes de sélection:")
compare_feature_selection_methods(all_results, top_n=20)

# 4.4 Features present in the top 20 of every method
common_features = set.intersection(
    *(set(ranking['selected_features'][:20]) for ranking in all_results.values())
)
print(f"\nFeatures communes aux 3 méthodes (top 20): {len(common_features)}")
print(sorted(common_features))

# 4.5 Union of the top 10 of each method
union_features = set().union(
    *(ranking['selected_features'][:10] for ranking in all_results.values())
)
print(f"\nNombre de features uniques dans l'union des top 10 de chaque méthode: {len(union_features)}")
print(sorted(union_features))


In [None]:

# =========================================
# Section 5: Optimising the number of features
# =========================================
print("\n\n--- Optimisation du nombre de Features ---")

# 5.1 Factory returning a fresh XGBoost classifier on each call
def xgboost_factory():
    """Build a new, unfitted ``XGBClassifier``.

    Returns:
        XGBClassifier: configured with the ``multi:softmax`` objective,
        3 classes, 100 estimators and a fixed random seed (42).
    """
    params = {
        "objective": "multi:softmax",
        "num_class": 3,
        "n_estimators": 100,
        "random_state": 42,
    }
    return XGBClassifier(**params)

# 5.2 Search the best number of features for each selection method
for method_name, results in all_results.items():
    print(f"\nOptimisation pour la méthode: {method_name}")
    try:
        feature_ranking = results['selected_features']

        # Candidate counts: 5, 10, ... up to 100 or the size of the ranking,
        # whichever is smaller. `range` excludes its stop value, hence the +1:
        # without it a ranking of e.g. 50 features was never evaluated in full.
        max_features = min(100, len(feature_ranking))

        optimization_result = optimize_feature_count(
            X, y,
            model_factory=xgboost_factory,
            feature_ranking=feature_ranking,
            n_features_range=range(5, max_features + 1, 5)
        )

        print(f"Méthode {method_name} - Nombre optimal de features: {optimization_result['optimal_n_features']}")
        print(f"Score avec features optimales: {optimization_result['best_score']:.4f}")

        # Store the optimum next to the ranking so later sections can reuse it
        all_results[method_name]['optimal_features'] = optimization_result['optimal_features']
        all_results[method_name]['optimal_n_features'] = optimization_result['optimal_n_features']
        all_results[method_name]['best_score'] = optimization_result['best_score']

    except Exception as e:
        # Best effort: a failure for one method must not stop the other methods
        print(f"Erreur lors de l'optimisation pour {method_name}: {e}")


In [None]:

# =========================================
# Section 6: Final evaluation of the model with the optimised features
# =========================================
print("\n\n--- Évaluation finale des modèles avec features optimisées ---")

# 6.1 Pick the selection method with the highest optimisation score
# (methods whose optimisation failed have no 'best_score' entry and are skipped)
best_method = None
best_score = 0

for method, results in all_results.items():
    if 'best_score' in results and results['best_score'] > best_score:
        best_score = results['best_score']
        best_method = method

if best_method:
    print(f"\nLa meilleure méthode de sélection est: {best_method} avec un score de {best_score:.4f}")
    best_features = all_results[best_method]['optimal_features']
    print(f"Nombre de features optimales: {len(best_features)}")
    print("Top 10 des features optimales:")
    print(best_features[:10])

    # 6.2 Final test on the held-out dataset with the optimal features
    X_train_selected = X[best_features]

    # Test data goes through the same row normalisation as the training data
    X_test = x_test_ffbf_with_feature.drop(['ID', 'reod'], axis=1, errors='ignore')
    X_test_normalized = normalize_rendements_by_row(X_test)
    X_test_selected = X_test_normalized[best_features]

    # Recent XGBoost releases only accept class labels 0..n_classes-1 in
    # XGBClassifier.fit. Encode the target to consecutive integer codes here
    # and map the predictions back afterwards; this is an identity mapping
    # when the labels are already 0-based.
    class_labels, y_encoded = np.unique(y, return_inverse=True)

    # Train the final model
    print("\nEntraînement du modèle final avec les features optimales...")
    final_model = Pipeline([
        ('scaler', StandardScaler()),
        ('model', xgboost_factory())
    ])

    final_model.fit(X_train_selected, y_encoded)

    # Predict on the test set and decode the integer codes to original labels
    predicted_codes = np.asarray(final_model.predict(X_test_selected)).astype(int)
    y_pred = class_labels[predicted_codes]

    # Performance metrics
    accuracy = accuracy_score(y_test_ffbf_with_features, y_pred)
    print(f"\nPerformance finale sur le dataset de test:")
    print(f"Accuracy: {accuracy:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test_ffbf_with_features, y_pred))

    # Confusion matrix: rows/columns are pinned to the training classes so
    # that the tick labels always line up with the matrix entries
    tick_labels = class_labels.tolist()
    conf_matrix = confusion_matrix(y_test_ffbf_with_features, y_pred, labels=class_labels)
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels, yticklabels=tick_labels)
    plt.title('Matrice de Confusion')
    plt.xlabel('Valeur Prédite')
    plt.ylabel('Valeur Réelle')
    plt.tight_layout()
    plt.show()
else:
    print("Aucune méthode de sélection n'a généré de résultat d'optimisation valide.")


In [None]:
# =========================================
# Section 7: Adding advanced financial features
# =========================================
print("\n\n--- Ajout de features financières avancées ---")


In [None]:

# 7.2 Enrich the normalised training data with the financial features
print("Ajout de features financières avancées aux données d'entraînement...")
X_train_enriched = add_financial_features(x_train_ffbf_with_feature_normalized)
# New features = columns of the enriched frame that the input frame did not have
new_features = [
    col
    for col in X_train_enriched.columns
    if col not in x_train_ffbf_with_feature_normalized.columns
]
print(f"Nouvelles features ajoutées: {len(new_features)}")
print(f"Liste des nouvelles features: {new_features}")

# 7.3 Inspect the new features
if new_features:
    # Correlation of each new feature with the target, highest first
    financial_corr = (
        X_train_enriched[new_features]
        .corrwith(y_train_ffbf_with_features)
        .sort_values(ascending=False)
    )

    print("\nTop correlations des nouvelles features financières:")
    print(financial_corr)

    # Bar chart of the absolute correlations, largest first
    plt.figure(figsize=(12, 6))
    financial_corr.abs().sort_values(ascending=False).plot.bar()
    plt.title('Importance des nouvelles features financières (corrélation absolue)')
    plt.tight_layout()
    plt.show()

# 7.4 Evaluate the model once the financial features are added
print("\nTest du modèle avec l'ajout des features financières...")
X_train_complete = X_train_enriched.drop(['ID', 'reod'], axis=1, errors='ignore')

# Base feature set: the optimal features of the best selection method when
# one was found. Otherwise fall back to every pre-existing column instead of
# failing on an undefined `best_features`.
if best_method:
    base_features = list(all_results[best_method]['optimal_features'])
else:
    print("Aucune feature optimisée disponible: utilisation de toutes les features existantes.")
    added = set(new_features)
    base_features = [col for col in X_train_complete.columns if col not in added]

# Append the new financial features without duplicating a base feature
already_selected = set(base_features)
combined_features = base_features + [f for f in new_features if f not in already_selected]
X_train_final = X_train_complete[combined_features]

# Test data: same normalisation and same feature enrichment as the training data
X_test_normalized = normalize_rendements_by_row(x_test_ffbf_with_feature)
X_test_enriched = add_financial_features(X_test_normalized)
X_test_final = X_test_enriched[combined_features]

# Recent XGBoost releases only accept class labels 0..n_classes-1 in
# XGBClassifier.fit. Encode the target to consecutive integer codes and map
# the predictions back afterwards (identity mapping for 0-based labels).
class_labels, y_encoded = np.unique(y, return_inverse=True)

# Train and evaluate the enriched model
final_model_enriched = Pipeline([
    ('scaler', StandardScaler()),
    ('model', xgboost_factory())
])

final_model_enriched.fit(X_train_final, y_encoded)
predicted_codes = np.asarray(final_model_enriched.predict(X_test_final)).astype(int)
y_pred_enriched = class_labels[predicted_codes]

# Performance metrics
accuracy_enriched = accuracy_score(y_test_ffbf_with_features, y_pred_enriched)
print(f"\nPerformance finale avec features financières avancées:")
print(f"Accuracy: {accuracy_enriched:.4f}")
# `accuracy` only exists when the Section 6 evaluation ran successfully
if 'accuracy' in locals():
    improvement = accuracy_enriched - accuracy
    print(f"Amélioration: {improvement:.4f} ({improvement*100:.2f}%)")

print("\nClassification Report:")
print(classification_report(y_test_ffbf_with_features, y_pred_enriched))

# =========================================
# Section 8: Conclusion and summary
# =========================================
print("\n\n--- Résumé des résultats ---")

# 8.1 One line per experiment run in Section 2
print("\nRécapitulatif des performances des modèles:")
for rank, result in enumerate(imputation_results, start=1):
    print(f"{rank}. {result['dataset']} - {result['model']} - Accuracy: {result['accuracy']:.4f}")

# 8.2 Best feature-selection method, when one was found
if best_method:
    best_summary = all_results[best_method]
    print(f"\nMeilleure méthode de sélection: {best_method}")
    print(f"Nombre optimal de features: {best_summary['optimal_n_features']}")
    print(f"Score optimal: {best_summary['best_score']:.4f}")

# 8.3 Final comparison, shown only when both evaluations produced an accuracy
if 'accuracy' in locals() and 'accuracy_enriched' in locals():
    print("\nRésultats finaux:")
    print(f"Modèle avec features optimisées: {accuracy:.4f}")
    print(f"Modèle avec features financières avancées: {accuracy_enriched:.4f}")
    print(f"Amélioration finale: {(accuracy_enriched - accuracy)*100:.2f}%")

print("\nAnalyse et optimisation complètes !")