In [32]:
# Importation des librairies
import json
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from xgboost import XGBClassifier

In [26]:
pd.set_option('future.no_silent_downcasting', True)

In [27]:
def _read_csv_or_none(file_path):
    """Load ``file_path`` with ``pd.read_csv``; report and return ``None`` if the file is missing."""
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None


def read_csv_files_to_dict(base_path, dimension, file_prefix, missing_percentages, strategies=None):
    """Load the polluted CSV files of one data-quality dimension into a dict.

    Parameters
    ----------
    base_path : str
        Root folder that contains one sub-folder per dimension.
    dimension : str
        Name of the dimension sub-folder. ``'Completeness'`` and ``'Unicity'``
        use a dedicated file layout (see below); every other value uses the
        generic layout.
    file_prefix : str
        Prefix of the CSV file names (e.g. ``'heart'``).
    missing_percentages : iterable of float
        Pollution levels expressed as fractions (``0.1`` -> ``"10%"``).
    strategies : iterable of (name, spec, subfolder) tuples, optional
        Only used for ``'Completeness'``. ``name`` becomes the inner dict key
        and ``subfolder`` the folder the file is read from; ``spec`` is not
        used by this function.

    Returns
    -------
    dict
        Keys are ``"<n>%"`` strings. Values are:

        * ``'Completeness'``: a dict ``{strategy_name: DataFrame or None}``
          read from ``<base>/<dimension>/<subfolder>/<prefix>_<n>%.csv``
          (empty when ``strategies`` is ``None``);
        * ``'Unicity'``: the DataFrame read from
          ``<base>/<dimension>/<prefix>_<n>%_2x.csv``;
        * otherwise: the DataFrame read from
          ``<base>/<dimension>/<prefix>_<n>%.csv``.

        A missing file is reported on stdout and stored as ``None``.
    """
    dataframes_dict = {}

    for percentage in missing_percentages:
        # Round away float noise before truncating: 0.29 * 100 evaluates to
        # 28.999999999999996, which a bare int() would turn into "28%".
        percentage_key = f"{int(round(percentage * 100, 6))}%"

        if dimension == 'Completeness':
            per_strategy = {}
            dataframes_dict[percentage_key] = per_strategy

            if strategies is None:
                print("No strategies provided for Completeness dimension.")
                continue

            # One file per imputation strategy, each in its own sub-folder.
            for strategy_name, _strategy_spec, subfolder in strategies:
                file_path = f"{base_path}/{dimension}/{subfolder}/{file_prefix}_{percentage_key}.csv"
                per_strategy[strategy_name] = _read_csv_or_none(file_path)
        elif dimension == 'Unicity':
            # Unicity files carry an extra "_2x" suffix in their name.
            file_path = f"{base_path}/{dimension}/{file_prefix}_{percentage_key}_2x.csv"
            dataframes_dict[percentage_key] = _read_csv_or_none(file_path)
        else:
            file_path = f"{base_path}/{dimension}/{file_prefix}_{percentage_key}.csv"
            dataframes_dict[percentage_key] = _read_csv_or_none(file_path)

    return dataframes_dict


def train_and_evaluate(X, y):
    """Cross-validate a default ``XGBClassifier`` on ``(X, y)``.

    Uses a shuffled 5-fold ``StratifiedKFold`` (``random_state=42``) and
    returns a dict with the mean test ``accuracy`` and the mean macro-averaged
    ``precision``, ``recall`` and ``f1_score`` over the folds.
    """
    # Reported metric name -> scorer name passed to cross_validate.
    metric_scorers = {
        "accuracy": "accuracy",
        "precision": "precision_macro",
        "recall": "recall_macro",
        "f1_score": "f1_macro",
    }

    classifier = XGBClassifier()
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = cross_validate(
        classifier,
        X,
        y,
        cv=splitter,
        scoring=list(metric_scorers.values()),
    )

    # cross_validate exposes each scorer's per-fold scores under "test_<scorer>".
    return {
        metric: fold_scores[f"test_{scorer}"].mean()
        for metric, scorer in metric_scorers.items()
    }

## Completeness

In [30]:
# Pollution levels as fractions; each one selects the "<prefix>_<n>%.csv"
# file of every strategy when the data is loaded.
missing_percentages = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

# Imputation strategies: method applied to numerical / categorical columns.
strategy_delete = dict(numerical='delete', categorical='delete')
strategy_mean_mode = dict(numerical='mean', categorical='mode')
strategy_median_new = dict(numerical='median', categorical='new')
strategy_decision_tree = dict(numerical='decision_tree', categorical='decision_tree')
strategy_mean_new = dict(numerical='mean', categorical='new')
strategy_knn_mode = dict(numerical='knn', categorical='mode')

# Strategies evaluated in this run, as (display name, strategy spec, subfolder);
# the subfolder is named after the display name.
# "Delete lines" (strategy_delete) and "KNN and Mode" (strategy_knn_mode) are
# defined above but currently left out of the evaluation.
strategies = [
    (label, spec, label)
    for label, spec in (
        ("Mean and Mode", strategy_mean_mode),
        ("Median and New", strategy_median_new),
        ("Decision Tree", strategy_decision_tree),
        ("Mean and New", strategy_mean_new),
    )
]

heart_df_dict = read_csv_files_to_dict('../../Data/Classification/Indicators of Heart Disease', 'Completeness', 'heart', missing_percentages, strategies)

# Results container: one entry per imputation strategy, each holding the
# metrics obtained at every pollution level.
results_dict = {
    "models": [
        {
            "model": "XGBoost",
            "imputation_strategies": []
        }
    ]
}
imputation_entries = results_dict["models"][0]["imputation_strategies"]

# Iterate over the pollution levels, then over the dataset of each strategy.
for key, heart_df_strategies in heart_df_dict.items():
    print(key)

    # The pollution percentage is encoded in the key, e.g. "10%" -> 10.
    pollution_percentage = int(key.replace('%', ''))

    for strategy, heart_df in heart_df_strategies.items():
        print(strategy)

        if heart_df is None:
            # read_csv_files_to_dict stores None for a CSV it could not find;
            # skip it instead of crashing on None['HadHeartAttack'].
            print(f"Skipping {strategy} at {key}: dataset not loaded")
            continue

        # Encode the Yes/No target as booleans on a derived frame so the
        # DataFrame kept in heart_df_dict is left untouched.
        target = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
        encoded_df = pd.get_dummies(heart_df.assign(HadHeartAttack=target), drop_first=True)
        X = encoded_df.drop('HadHeartAttack', axis=1)
        y = encoded_df['HadHeartAttack']
        print(X.shape)
        print(y.shape)
        results = train_and_evaluate(X, y)

        # Find the entry of the current imputation strategy, creating it on
        # first encounter.
        strategy_entry = next((item for item in imputation_entries if item["strategy"] == strategy), None)

        if strategy_entry is None:
            strategy_entry = {
                "strategy": strategy,
                "pollution_metrics": []
            }
            imputation_entries.append(strategy_entry)

        # Record the metrics under the matching strategy.
        strategy_entry["pollution_metrics"].append({
            "pollution_percentage": pollution_percentage,
            "metrics": results
        })

        print(strategy_entry)

0%
Mean and Mode
(246022, 122)
(246022,)
{'strategy': 'Mean and Mode', 'pollution_metrics': [{'pollution_percentage': 0, 'metrics': {'accuracy': 0.9478867709283898, 'precision': 0.7546709544012085, 'recall': 0.6166185651988428, 'f1_score': 0.6559323267381035}}]}
Median and New
(246022, 122)
(246022,)
{'strategy': 'Median and New', 'pollution_metrics': [{'pollution_percentage': 0, 'metrics': {'accuracy': 0.9478867709283898, 'precision': 0.7546709544012085, 'recall': 0.6166185651988428, 'f1_score': 0.6559323267381035}}]}
Decision Tree
(246022, 122)
(246022,)
{'strategy': 'Decision Tree', 'pollution_metrics': [{'pollution_percentage': 0, 'metrics': {'accuracy': 0.9478867709283898, 'precision': 0.7546709544012085, 'recall': 0.6166185651988428, 'f1_score': 0.6559323267381035}}]}
Mean and New
(246022, 122)
(246022,)
{'strategy': 'Mean and New', 'pollution_metrics': [{'pollution_percentage': 0, 'metrics': {'accuracy': 0.9478867709283898, 'precision': 0.7546709544012085, 'recall': 0.6166185651

In [31]:
# Path of the JSON file where the results are stored
output_path = "../../Results/Heart Disease/Completeness.json"

# Make sure the results directory exists: open(..., 'w') does not create
# missing parent directories and would raise FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write the results to the JSON file
with open(output_path, 'w') as json_file:
    json.dump(results_dict, json_file, indent=4)

print(f"Results saved to {output_path}")

Results saved to ../../Results/Heart Disease/Completeness.json


## Consistent Representation

In [10]:
missing_percentages = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
heart_df_dict = read_csv_files_to_dict('../../Data/Classification/Indicators of Heart Disease', 'Consistent Representation', 'heart', missing_percentages)

# Dictionary that accumulates the metrics of every pollution level
results_dict = {
    "models": [
        {
            "model": "XGBoost",
            "pollution_metrics": []
        }
    ]
}

# Evaluate the model on the dataset of each pollution level
for key, heart_df in heart_df_dict.items():
    if heart_df is None:
        # read_csv_files_to_dict stores None for a CSV it could not find;
        # skip that level instead of crashing on None['HadHeartAttack'].
        print(f"Skipping {key}: dataset not loaded")
        continue

    # Encode the Yes/No target as booleans on a derived frame so the
    # DataFrame kept in heart_df_dict is left untouched.
    target = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
    heart_df = pd.get_dummies(heart_df.assign(HadHeartAttack=target), drop_first=True)
    X = heart_df.drop('HadHeartAttack', axis=1)
    y = heart_df['HadHeartAttack']

    results = train_and_evaluate(X, y)

    # The pollution percentage is encoded in the key, e.g. "10%" -> 10.
    pollution_percentage = int(key.replace('%', ''))

    # Record the metrics of this pollution level
    results_dict["models"][0]["pollution_metrics"].append({
        "pollution_percentage": pollution_percentage,
        "metrics": results
    })

  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)


In [11]:
# Path of the JSON file where the results are stored
output_path = "../../Results/Heart Disease/Consistent Representation.json"

# Make sure the results directory exists: open(..., 'w') does not create
# missing parent directories and would raise FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write the results to the JSON file
with open(output_path, 'w') as json_file:
    json.dump(results_dict, json_file, indent=4)

print(f"Results saved to {output_path}")

Results saved to ../../Results/Heart Disease/Consistent Representation.json


## Feature Accuracy

In [8]:
missing_percentages = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
heart_df_dict = read_csv_files_to_dict('../../Data/Classification/Indicators of Heart Disease', 'Feature Accuracy', 'heart', missing_percentages)

# Dictionary that accumulates the metrics of every pollution level
results_dict = {
    "models": [
        {
            "model": "XGBoost",
            "pollution_metrics": []
        }
    ]
}

# Evaluate the model on the dataset of each pollution level
for key, heart_df in heart_df_dict.items():
    if heart_df is None:
        # read_csv_files_to_dict stores None for a CSV it could not find;
        # skip that level instead of crashing on None['HadHeartAttack'].
        print(f"Skipping {key}: dataset not loaded")
        continue

    # Encode the Yes/No target as booleans on a derived frame so the
    # DataFrame kept in heart_df_dict is left untouched.
    target = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
    heart_df = pd.get_dummies(heart_df.assign(HadHeartAttack=target), drop_first=True)
    X = heart_df.drop('HadHeartAttack', axis=1)
    y = heart_df['HadHeartAttack']

    results = train_and_evaluate(X, y)

    # The pollution percentage is encoded in the key, e.g. "10%" -> 10.
    pollution_percentage = int(key.replace('%', ''))

    # Record the metrics of this pollution level
    results_dict["models"][0]["pollution_metrics"].append({
        "pollution_percentage": pollution_percentage,
        "metrics": results
    })

  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)


In [9]:
# Path of the JSON file where the results are stored
output_path = "../../Results/Heart Disease/Feature Accuracy.json"

# Make sure the results directory exists: open(..., 'w') does not create
# missing parent directories and would raise FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write the results to the JSON file
with open(output_path, 'w') as json_file:
    json.dump(results_dict, json_file, indent=4)

print(f"Results saved to {output_path}")

Results saved to ../../Results/Heart Disease/Feature Accuracy.json


## Target Accuracy

In [12]:
missing_percentages = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
heart_df_dict = read_csv_files_to_dict('../../Data/Classification/Indicators of Heart Disease', 'Target Accuracy', 'heart', missing_percentages)

# Dictionary that accumulates the metrics of every pollution level
results_dict = {
    "models": [
        {
            "model": "XGBoost",
            "pollution_metrics": []
        }
    ]
}

# Evaluate the model on the dataset of each pollution level
for key, heart_df in heart_df_dict.items():
    if heart_df is None:
        # read_csv_files_to_dict stores None for a CSV it could not find;
        # skip that level instead of crashing on None['HadHeartAttack'].
        print(f"Skipping {key}: dataset not loaded")
        continue

    # Encode the Yes/No target as booleans on a derived frame so the
    # DataFrame kept in heart_df_dict is left untouched.
    target = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
    heart_df = pd.get_dummies(heart_df.assign(HadHeartAttack=target), drop_first=True)
    X = heart_df.drop('HadHeartAttack', axis=1)
    y = heart_df['HadHeartAttack']

    results = train_and_evaluate(X, y)

    # The pollution percentage is encoded in the key, e.g. "10%" -> 10.
    pollution_percentage = int(key.replace('%', ''))

    # Record the metrics of this pollution level
    results_dict["models"][0]["pollution_metrics"].append({
        "pollution_percentage": pollution_percentage,
        "metrics": results
    })

  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)


In [13]:
# Path of the JSON file where the results are stored
output_path = "../../Results/Heart Disease/Target Accuracy.json"

# Make sure the results directory exists: open(..., 'w') does not create
# missing parent directories and would raise FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write the results to the JSON file
with open(output_path, 'w') as json_file:
    json.dump(results_dict, json_file, indent=4)

print(f"Results saved to {output_path}")

Results saved to ../../Results/Heart Disease/Target Accuracy.json


## Target Class Balance

In [15]:
balance = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
heart_df_dict = read_csv_files_to_dict('../../Data/Classification/Indicators of Heart Disease', 'Target Class Balance', 'heart', balance)

# Dictionary that accumulates the metrics of every balance level
results_dict = {
    "models": [
        {
            "model": "XGBoost",
            "pollution_metrics": []
        }
    ]
}

# Evaluate the model on the dataset of each balance level
for key, heart_df in heart_df_dict.items():
    if heart_df is None:
        # read_csv_files_to_dict stores None for a CSV it could not find;
        # skip that level instead of crashing on None['HadHeartAttack'].
        print(f"Skipping {key}: dataset not loaded")
        continue

    # Encode the Yes/No target as booleans on a derived frame so the
    # DataFrame kept in heart_df_dict is left untouched.
    target = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
    heart_df = pd.get_dummies(heart_df.assign(HadHeartAttack=target), drop_first=True)
    X = heart_df.drop('HadHeartAttack', axis=1)
    y = heart_df['HadHeartAttack']

    results = train_and_evaluate(X, y)

    # The level is encoded in the key, e.g. "10%" -> 10; it is stored under
    # the same "pollution_percentage" field as the other dimensions.
    pollution_percentage = int(key.replace('%', ''))

    # Record the metrics of this level
    results_dict["models"][0]["pollution_metrics"].append({
        "pollution_percentage": pollution_percentage,
        "metrics": results
    })

  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)


In [16]:
# Path of the JSON file where the results are stored
output_path = "../../Results/Heart Disease/Target Class Balance.json"

# Make sure the results directory exists: open(..., 'w') does not create
# missing parent directories and would raise FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write the results to the JSON file
with open(output_path, 'w') as json_file:
    json.dump(results_dict, json_file, indent=4)

print(f"Results saved to {output_path}")

Results saved to ../../Results/Heart Disease/Target Class Balance.json


## Unicity

In [18]:
# Pollution levels to load. Kept under a plural name so the list is no longer
# overwritten by the per-level integer `pollution_percentage` set in the loop.
pollution_percentages = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
heart_df_dict = read_csv_files_to_dict('../../Data/Classification/Indicators of Heart Disease', 'Unicity', 'heart', pollution_percentages)

# Dictionary that accumulates the metrics of every pollution level
results_dict = {
    "models": [
        {
            "model": "XGBoost",
            "pollution_metrics": []
        }
    ]
}

# Evaluate the model on the dataset of each pollution level
for key, heart_df in heart_df_dict.items():
    if heart_df is None:
        # read_csv_files_to_dict stores None for a CSV it could not find;
        # skip that level instead of crashing on None['HadHeartAttack'].
        print(f"Skipping {key}: dataset not loaded")
        continue

    # Encode the Yes/No target as booleans on a derived frame so the
    # DataFrame kept in heart_df_dict is left untouched.
    target = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
    heart_df = pd.get_dummies(heart_df.assign(HadHeartAttack=target), drop_first=True)
    X = heart_df.drop('HadHeartAttack', axis=1)
    y = heart_df['HadHeartAttack']

    results = train_and_evaluate(X, y)

    # The pollution percentage is encoded in the key, e.g. "10%" -> 10.
    pollution_percentage = int(key.replace('%', ''))

    # Record the metrics of this pollution level
    results_dict["models"][0]["pollution_metrics"].append({
        "pollution_percentage": pollution_percentage,
        "metrics": results
    })

  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)
  heart_df['HadHeartAttack'] = heart_df['HadHeartAttack'].replace({'Yes': True, 'No': False}).astype(bool)


In [19]:
# Path of the JSON file where the results are stored
output_path = "../../Results/Heart Disease/Unicity.json"

# Make sure the results directory exists: open(..., 'w') does not create
# missing parent directories and would raise FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write the results to the JSON file
with open(output_path, 'w') as json_file:
    json.dump(results_dict, json_file, indent=4)

print(f"Results saved to {output_path}")

Results saved to ../../Results/Heart Disease/Unicity.json
