In [6]:
import pandas as pd
import numpy as np

In [12]:
def _resolve_rng(random_state):
    """Return an object exposing ``rand``, ``randint`` and ``normal``.

    ``None`` maps to the ``np.random`` module itself, i.e. NumPy's global
    generator, so callers that seed with ``np.random.seed`` keep working.
    An int (or anything ``np.random.RandomState`` accepts) yields a private,
    reproducible generator; an existing ``RandomState`` is used as is.
    """
    if random_state is None:
        return np.random
    if isinstance(random_state, np.random.RandomState):
        return random_state
    return np.random.RandomState(random_state)


def _swap_categories(series, mask, rng):
    """Draw, for every selected row, a value of ``series`` different from the current one.

    Returns an object array with one replacement per ``True`` entry of
    ``mask``, or ``None`` when the column holds fewer than two distinct
    values (there is nothing different to substitute).
    """
    codes, labels = pd.factorize(series)
    labels = np.asarray(labels, dtype=object)

    # factorize marks missing cells with code -1. Treat "missing" as one more
    # category, as Series.unique() did in the row-by-row implementation.
    missing = codes < 0
    if missing.any():
        labels = np.append(labels, np.nan)
        codes = np.where(missing, len(labels) - 1, codes)

    n_labels = len(labels)
    if n_labels < 2:
        return None

    current = codes[mask]
    # Shifting a code by 1..n_labels-1 modulo n_labels lands uniformly on any
    # *other* label and never on the current one.
    shift = rng.randint(1, n_labels, size=current.size)
    return labels[(current + shift) % n_labels]


def feature_accuracy(df, target_column, pollution_percentage=0.1, random_state=None):
    """
    Pollute a fraction of the categorical and numeric values of a DataFrame,
    leaving the given target column untouched.

    For every categorical column (``object`` / ``category`` dtype) each row is
    selected independently with probability ``pollution_percentage`` and its
    value is replaced by a different value observed in that column. For every
    numeric column, selected rows receive additive Gaussian noise whose
    standard deviation is the column's own standard deviation.

    Args:
    - df (pd.DataFrame): The DataFrame to pollute. It is not modified.
    - target_column (str): Name of the target column that must not be polluted.
    - pollution_percentage (float): Per-cell probability of pollution, between 0 and 1.
    - random_state (None | int | np.random.RandomState): Seed or generator for
      reproducible pollution. ``None`` (default) uses NumPy's global random
      state, so ``np.random.seed`` still controls the result.

    Returns:
    - pd.DataFrame: A polluted copy of ``df``. Polluted integer/float32 columns
      come back as float64 because the noise is real-valued.

    Raises:
    - ValueError: If ``pollution_percentage`` is not within [0, 1].
    """
    if not 0 <= pollution_percentage <= 1:
        raise ValueError(
            "pollution_percentage must be between 0 and 1, got %r" % (pollution_percentage,)
        )

    rng = _resolve_rng(random_state)
    df_polluted = df.copy()
    n_rows = len(df_polluted)

    # Categorical features, excluding the target column.
    categorical_columns = [col for col in df_polluted.select_dtypes(include=['object', 'category']).columns
                           if col != target_column]
    for col in categorical_columns:
        # Each row is selected independently with the requested probability.
        mask = rng.rand(n_rows) < pollution_percentage
        if not mask.any():
            continue
        replacements = _swap_categories(df_polluted[col], mask, rng)
        if replacements is not None:
            df_polluted.loc[mask, col] = replacements

    # Numeric features, excluding the target column.
    numeric_columns = [col for col in df_polluted.select_dtypes(include=[np.number]).columns
                       if col != target_column]
    for col in numeric_columns:
        mask = rng.rand(n_rows) < pollution_percentage
        n_selected = int(mask.sum())
        if n_selected == 0:
            continue

        sigma = df_polluted[col].std()
        # A NaN std (fewer than two usable values) would turn the selected
        # cells into NaN, and a zero std adds no noise at all: skip both.
        if pd.isna(sigma) or sigma == 0:
            continue

        # Real-valued noise cannot be stored losslessly in an integer or
        # narrow-float column. Widen explicitly instead of relying on the
        # implicit upcast that recent pandas versions deprecate.
        col_dtype = df_polluted[col].dtype
        if pd.api.types.is_integer_dtype(col_dtype) or col_dtype in (np.float16, np.float32):
            df_polluted[col] = df_polluted[col].astype(np.float64)

        noise = rng.normal(0, sigma, size=n_selected)
        df_polluted.loc[mask, col] += noise

    return df_polluted

In [13]:
# Load the heart-disease indicators CSV (the file name suggests the 2022,
# NaN-free, cleaned variant). The bare name on the last line renders the
# frame as the cell's output.
df = pd.read_csv(
    filepath_or_buffer='../../../Data/Classification/Indicators of Heart Disease/heart_2022_no_nans_clean.csv'
)
df

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.60,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.70,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246017,Virgin Islands,Male,Very good,0.0,0.0,Within past 2 years (1 year but less than 2 ye...,Yes,6.0,None of them,No,...,1.78,102.06,32.28,Yes,No,No,No,"Yes, received tetanus shot but not sure what type",No,No
246018,Virgin Islands,Female,Fair,0.0,7.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.93,90.72,24.34,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,Yes
246019,Virgin Islands,Male,Good,0.0,15.0,Within past year (anytime less than 12 months ...,Yes,7.0,1 to 5,No,...,1.68,83.91,29.86,Yes,Yes,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,Yes
246020,Virgin Islands,Female,Excellent,2.0,2.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.70,83.01,28.66,No,Yes,Yes,No,"Yes, received tetanus shot but not sure what type",No,No


In [14]:
# feature_accuracy draws from NumPy's global random state; fixing the seed here
# makes the polluted frame identical on every Restart & Run All.
np.random.seed(42)

# Pollute roughly 10% of the cells of every feature column while leaving the
# 'HadHeartAttack' target column intact, then display the result.
df_polluted = feature_accuracy(df, 'HadHeartAttack', pollution_percentage=0.1)
df_polluted

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.000000,0.000000,Within past year (anytime less than 12 months ...,Yes,9.000000,None of them,No,...,1.60,71.670000,27.99,No,No,Yes,Yes,"Yes, received Tdap",Yes,No
1,Alabama,Male,Very good,0.000000,0.000000,Within past year (anytime less than 12 months ...,Yes,6.000000,None of them,No,...,1.78,95.250000,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Female,Very good,0.000000,0.000000,Within past year (anytime less than 12 months ...,Yes,6.916609,"6 or more, but not all",No,...,1.85,108.860000,31.66,No,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No
3,Alabama,Female,Fair,5.000000,0.000000,Within past year (anytime less than 12 months ...,Yes,9.000000,None of them,No,...,1.70,90.720000,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Male,Good,3.000000,15.634625,Within past year (anytime less than 12 months ...,Yes,5.000000,1 to 5,No,...,1.55,79.380000,33.07,No,No,Yes,No,"No, did not receive any tetanus shot in the pa...",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246017,Virgin Islands,Male,Very good,0.000000,0.000000,Within past year (anytime less than 12 months ...,Yes,6.000000,None of them,No,...,1.78,102.060000,32.28,Yes,No,No,No,"Yes, received tetanus shot but not sure what type",No,No
246018,New Jersey,Female,Fair,-4.658267,7.000000,Within past year (anytime less than 12 months ...,Yes,7.000000,None of them,No,...,1.93,90.720000,24.34,No,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
246019,Virgin Islands,Male,Good,0.000000,15.000000,Within past year (anytime less than 12 months ...,Yes,7.000000,1 to 5,No,...,1.68,89.088263,29.86,Yes,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,Yes
246020,Virgin Islands,Female,Excellent,2.000000,2.000000,Within past year (anytime less than 12 months ...,Yes,4.597948,None of them,No,...,1.70,83.010000,28.66,No,Yes,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
