XGBoost

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import pickle

# Charger les données
file = open('dataset/dataset.pkl', 'rb')
DATA = pickle.load(file)
pheno = DATA['pheno']
X_gpa = DATA['X_gpa']
X_snps = DATA['X_snps']
X_genexp = DATA['X_genexp']

In [5]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

def refine_and_evaluate_model_with_xgb(pheno, X_gpa, X_snps, X_genexp, model, param_grid, model_name, importance_threshold=0):
    performances = []
    hyperparams_records = []

    # Pour chaque antibiotique
    for antibiotique in pheno.columns[1:]:
        print(f"Traitement de l'antibiotique : {antibiotique}")

        # Extraire la cible (y) et les indices valides
        y = pheno[antibiotique].to_numpy()
        valid_indices = ~np.isnan(y)  
        y = y[valid_indices]

        # Filtrer les matrices explicatives
        X_gpa_filtered = X_gpa[valid_indices]
        X_snps_filtered = X_snps[valid_indices]
        X_genexp_filtered = X_genexp[valid_indices]

        X_gpa_train, X_gpa_test, X_snps_train, X_snps_test, X_genexp_train, X_genexp_test, y_train, y_test = train_test_split(
            X_gpa_filtered, X_snps_filtered, X_genexp_filtered, y, 
            test_size=0.2, random_state=42, stratify=y
        )

        scaler_genexp = StandardScaler()
        X_genexp_train = scaler_genexp.fit_transform(X_genexp_train)  # Fit + transform sur l'entraînement
        X_genexp_test = scaler_genexp.transform(X_genexp_test)        # Transform uniquement sur le test

        X_train = np.hstack([X_gpa_train, X_snps_train, X_genexp_train])
        X_test = np.hstack([X_gpa_test, X_snps_test, X_genexp_test])

        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='recall', verbose=1)
        grid_search.fit(X_train, y_train)

        # Meilleur modèle trouvé
        meilleur_modele = grid_search.best_estimator_

        # Calculer l'importance des caractéristiques
        feature_importances = meilleur_modele.feature_importances_

        # Filtrer les colonnes par importance
        important_features = feature_importances > importance_threshold

        # Identifier les colonnes filtrées pour GPA, SNPs et GENEXP
        colonnes_support = np.arange(X_train.shape[1])  
        colonnes_gpa = colonnes_support[:X_gpa_filtered.shape[1]][important_features[:X_gpa_filtered.shape[1]]]
        colonnes_snps = colonnes_support[X_gpa_filtered.shape[1]:X_gpa_filtered.shape[1] + X_snps_filtered.shape[1]][
            important_features[X_gpa_filtered.shape[1]:X_gpa_filtered.shape[1] + X_snps_filtered.shape[1]]
        ]
        colonnes_genexp = colonnes_support[X_gpa_filtered.shape[1] + X_snps_filtered.shape[1]:][
            important_features[X_gpa_filtered.shape[1] + X_snps_filtered.shape[1]:]
        ]

        # Prédictions et calcul du rappel macro
        y_pred = meilleur_modele.predict(X_test)
        rappel_macro = recall_score(y_test, y_pred, average='macro')
        print(f"Rappel macro pour {antibiotique} avec {model_name}: {rappel_macro:.4f}")

        # Stocker les performances
        performances.append({
            'Antibiotique': antibiotique,
            'Recall': rappel_macro,
            'Colonnes_GPA': colonnes_gpa.tolist(),
            'Colonnes_SNPs': colonnes_snps.tolist(),
            'Colonnes_Expression_Genetique': colonnes_genexp.tolist()
        })

        # Stocker les hyperparamètres
        hyperparams_records.append({
            'Antibiotique': antibiotique,
            'Meilleur_paramètre': grid_search.best_params_,
            'Valeurs_des_hyperparamètres': param_grid
        })

    performances_df = pd.DataFrame(performances)

    # fichier CSV 
    csv_filename = f'result_{model_name}.csv'
    performances_df.to_csv(csv_filename, index=False, sep=';')
    print(f"Les performances ont été enregistrées dans {csv_filename}")

    hyperparams_df = pd.DataFrame(hyperparams_records)

    hyperparams_csv_filename = f'param_{model_name}.csv'
    hyperparams_df.to_csv(hyperparams_csv_filename, index=False, sep=';')

    return performances_df

In [6]:
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

param_grid = {
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0],
}

performances_df = refine_and_evaluate_model_with_xgb(pheno, X_gpa, X_snps, X_genexp, model, param_grid, "XGBoost")

print("\nPerformances du modèle après sélection des caractéristiques:")
print(performances_df)


Traitement de l'antibiotique : Tobramycin
Fitting 5 folds for each of 4 candidates, totalling 20 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Rappel macro pour Tobramycin avec XGBoost: 0.9437
Traitement de l'antibiotique : Ceftazidim
Fitting 5 folds for each of 4 candidates, totalling 20 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Rappel macro pour Ceftazidim avec XGBoost: 0.7763
Traitement de l'antibiotique : Ciprofloxacin
Fitting 5 folds for each of 4 candidates, totalling 20 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Rappel macro pour Ciprofloxacin avec XGBoost: 0.8719
Traitement de l'antibiotique : Meropenem
Fitting 5 folds for each of 4 candidates, totalling 20 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Rappel macro pour Meropenem avec XGBoost: 0.8330
Traitement de l'antibiotique : Colistin
Fitting 5 folds for each of 4 candidates, totalling 20 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Rappel macro pour Colistin avec XGBoost: 0.6462
Les performances ont été enregistrées dans result_XGBoost.csv

Performances du modèle après sélection des caractéristiques:
    Antibiotique    Recall                                       Colonnes_GPA  \
0     Tobramycin  0.943681  [959, 1129, 1262, 2950, 4202, 5086, 6942, 7159...   
1     Ceftazidim  0.776292   [674, 1129, 3427, 4007, 4584, 4824, 8301, 15756]   
2  Ciprofloxacin  0.871875                         [1350, 5885, 12388, 15525]   
3      Meropenem  0.833024  [665, 3517, 4449, 8302, 8490, 13855, 15631, 15...   
4       Colistin  0.646168                   [2157, 3939, 8264, 12580, 14111]   

                                       Colonnes_SNPs  \
0  [20136, 29220, 48884, 49154, 50841, 52493, 561...   
1  [38072, 42633, 43194, 49243, 54860, 56052, 560...   
2  [46840, 47206, 53629, 54144, 57651, 57872, 588...   
3  [39121, 39154, 40348, 40392, 40409, 46440, 522...   
4  [37849, 45217, 50844, 57505, 58832, 60133, 607...   

    