In [1]:
import pandas as pd
import os
import ast

def _column_type_percentage(counts, col_type):
    """Return the share (0-100) of ``col_type`` among all counted columns, or 0 if none were counted."""
    total = sum(counts.values())
    return counts[col_type] / total * 100 if total > 0 else 0


def calculate_mean_recall_and_column_percentages(directory):
    """Summarise recall and selected-column shares across per-model result files.

    Every file in ``directory`` whose name matches ``result_<model>.csv`` is
    read as a semicolon-separated CSV. Each row must provide the columns
    ``Antibiotique``, ``Recall``, ``Colonnes_GPA``, ``Colonnes_SNPs`` and
    ``Colonnes_Expression_Genetique``; the three ``Colonnes_*`` cells are
    parsed with ``ast.literal_eval`` and only their length is used
    (presumably they hold list literals of selected column names).

    Args:
        directory: Path of the folder containing the ``result_*.csv`` files.

    Returns:
        A 3-tuple ``(mean_recall_by_model, mean_recall_by_antibiotic,
        column_percentage_dfs)``:

        * ``mean_recall_by_model``: one-column DataFrame
          (``Recall_Mean_By_Model``) indexed by model name.
        * ``mean_recall_by_antibiotic``: one-column DataFrame
          (``Recall_Mean_By_Antibiotic``) indexed by antibiotic.
        * ``column_percentage_dfs``: dict keyed by ``'GPA'``, ``'SNPs'`` and
          ``'GenExp'``; each value is a DataFrame indexed by antibiotic with
          the columns ``Min_Percentage``, ``Max_Percentage`` and
          ``Mean_Percentage`` taken across models.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be read.
        KeyError: If a result file lacks one of the expected columns.
        ValueError, SyntaxError: If a ``Colonnes_*`` cell is not a valid
            Python literal.
    """
    # antibiotic -> model -> recall
    recall_data = {}
    # antibiotic -> model -> {'GPA': n, 'SNPs': n, 'GenExp': n}
    column_counts = {}

    # Walk the directory in alphabetical order so the model order is stable.
    for file_name in sorted(os.listdir(directory)):
        if not (file_name.startswith("result_") and file_name.endswith(".csv")):
            continue

        # The model name is whatever sits between "result_" and ".csv".
        model_name = file_name[len("result_"):-len(".csv")]
        df = pd.read_csv(os.path.join(directory, file_name), sep=';')

        for _, row in df.iterrows():
            antibiotic = row['Antibiotique']
            counts = {
                'GPA': len(ast.literal_eval(row['Colonnes_GPA'])),
                'SNPs': len(ast.literal_eval(row['Colonnes_SNPs'])),
                'GenExp': len(ast.literal_eval(row['Colonnes_Expression_Genetique'])),
            }

            recall_data.setdefault(antibiotic, {})[model_name] = row['Recall']
            column_counts.setdefault(antibiotic, {})[model_name] = counts

    # Rows are antibiotics, columns are models.
    recall_df = pd.DataFrame(recall_data).T

    mean_recall_by_model = recall_df.mean(axis=0).to_frame(name='Recall_Mean_By_Model')
    mean_recall_by_antibiotic = recall_df.mean(axis=1).to_frame(name='Recall_Mean_By_Antibiotic')

    column_percentage_dfs = {}
    for col_type in ['GPA', 'SNPs', 'GenExp']:
        # Rows are antibiotics, columns are models, values are percentages.
        percentages = pd.DataFrame({
            antibiotic: {
                model: _column_type_percentage(counts, col_type)
                for model, counts in models.items()
            }
            for antibiotic, models in column_counts.items()
        }).T

        # All three statistics are derived from the untouched percentage
        # table. Appending Min/Max as columns first would make them count as
        # extra "models" in the mean and skew it.
        column_percentage_dfs[col_type] = pd.DataFrame({
            'Min_Percentage': percentages.min(axis=1),
            'Max_Percentage': percentages.max(axis=1),
            'Mean_Percentage': percentages.mean(axis=1),
        })

    return mean_recall_by_model, mean_recall_by_antibiotic, column_percentage_dfs

In [2]:
# Folder (relative to the working directory) that is scanned for result files.
directory = "result"

# Unpack the three summaries returned by the aggregation function.
(
    mean_recall_by_model,
    mean_recall_by_antibiotic,
    column_percentage_dfs,
) = calculate_mean_recall_and_column_percentages(directory)

# Print both recall tables, one after the other.
print(mean_recall_by_model, mean_recall_by_antibiotic, sep="\n")

                  Recall_Mean_By_Model
1) LGR FDR                    0.787872
2) LGR L1                     0.853616
3) SVM                        0.623030
4) Random Forest              0.704612
5) XGBoost                    0.814208
6) MLP                        0.654601
               Recall_Mean_By_Antibiotic
Tobramycin                      0.916708
Ceftazidim                      0.696866
Ciprofloxacin                   0.771413
Meropenem                       0.686466
Colistin                        0.626830
