In [2]:
import pandas as pd
import numpy as np

In [1]:
def target_class_balance(df, target_column, imbalance_factor=0.5):
    """
    Pollute the class balance of a target column by resampling each class.

    The most frequent class is down-sampled to ``int(count * imbalance_factor)``
    rows (random sample, fixed seed). Every other class is grown to
    ``count + int((total - count) * (1 - imbalance_factor))`` rows by repeating
    its rows in their original order, where ``total`` is ``len(df)``.

    Args:
    - df (pd.DataFrame): DataFrame holding the target column. It is not modified.
    - target_column (str): Name of the column whose class proportions are altered.
    - imbalance_factor (float): Value in [0, 1]. With 1 every class keeps its
      original row count; with 0 the most frequent class is removed entirely and
      each other class is grown to ``total`` rows.

    Returns:
    - pd.DataFrame: A new frame with a fresh 0..n-1 index. Classes are laid out
      one after another, most frequent original class first. Rows whose target
      is missing are not carried over. If no class has any row (empty input or
      all-missing target), an empty frame with the same columns is returned.

    Raises:
    - ValueError: If ``imbalance_factor`` is outside [0, 1].
    """
    # Validate first so a bad factor fails before any work is done.
    if not 0 <= imbalance_factor <= 1:
        raise ValueError("Le facteur de déséquilibre doit être compris entre 0 et 1.")

    class_counts = df[target_column].value_counts()
    # A categorical target lists unused categories with a count of 0; they have
    # no rows to repeat, so resampling them would divide by zero below.
    class_counts = class_counts[class_counts > 0]

    if class_counts.empty:
        # Nothing to resample: keep the schema, return no rows.
        return df.iloc[:0].reset_index(drop=True)

    max_class = class_counts.idxmax()  # majority class (first one on ties)
    total_samples = len(df)

    # Collect one resampled slice per class and concatenate once at the end.
    # This avoids seeding the result with an empty DataFrame and re-copying
    # the accumulated rows on every iteration.
    pieces = []
    for class_label, count in class_counts.items():
        if class_label == max_class:
            # Shrink the majority class.
            new_count = int(count * imbalance_factor)
        else:
            # Grow every other class.
            new_count = count + int((total_samples - count) * (1 - imbalance_factor))

        if new_count == 0:
            # Class is removed entirely; do not feed an empty slice to concat.
            continue

        class_df = df[df[target_column] == class_label]
        if new_count > len(class_df):
            # Over-sample: tile the class rows, then cut to the exact size.
            repeats = int(np.ceil(new_count / len(class_df)))
            pieces.append(pd.concat([class_df] * repeats).iloc[:new_count])
        else:
            # Under-sample (or keep the size): seeded draw without replacement.
            pieces.append(class_df.sample(new_count, random_state=42))

    if not pieces:
        # Only reachable when the single remaining class was reduced to 0 rows.
        return df.iloc[:0].reset_index(drop=True)

    return pd.concat(pieces, ignore_index=True)

In [3]:
# Read the "Indicators of Heart Disease" CSV; the path is relative to the
# notebook's working directory.
df = pd.read_csv(
    '../../../Data/Classification/Indicators of Heart Disease/heart_2022_no_nans_clean.csv'
)
# A bare name as the cell's last expression makes the notebook render the frame.
df

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.60,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.70,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246017,Virgin Islands,Male,Very good,0.0,0.0,Within past 2 years (1 year but less than 2 ye...,Yes,6.0,None of them,No,...,1.78,102.06,32.28,Yes,No,No,No,"Yes, received tetanus shot but not sure what type",No,No
246018,Virgin Islands,Female,Fair,0.0,7.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.93,90.72,24.34,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,Yes
246019,Virgin Islands,Male,Good,0.0,15.0,Within past year (anytime less than 12 months ...,Yes,7.0,1 to 5,No,...,1.68,83.91,29.86,Yes,Yes,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,Yes
246020,Virgin Islands,Female,Excellent,2.0,2.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.70,83.01,28.66,No,Yes,Yes,No,"Yes, received tetanus shot but not sure what type",No,No


In [4]:
# Resample the 'HadHeartAttack' classes of the loaded frame; arguments are
# passed by keyword so the call documents itself.
df_polluted = target_class_balance(
    df,
    target_column='HadHeartAttack',
    imbalance_factor=0.5,
)
# Render the resampled frame as the cell output.
df_polluted

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Georgia,Female,Very good,10.0,30.0,Within past 2 years (1 year but less than 2 ye...,Yes,4.0,None of them,No,...,1.55,102.06,42.51,No,Yes,No,Yes,"Yes, received tetanus shot, but not Tdap",No,No
1,Connecticut,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,7.0,None of them,No,...,1.65,47.63,17.47,Yes,No,No,No,"Yes, received Tdap",No,No
2,Alaska,Female,Good,15.0,2.0,Within past year (anytime less than 12 months ...,No,4.0,"6 or more, but not all",No,...,1.63,95.25,36.05,No,Yes,No,No,"Yes, received tetanus shot but not sure what type",No,No
3,New York,Female,Good,0.0,10.0,Within past year (anytime less than 12 months ...,Yes,8.0,None of them,No,...,1.57,44.00,17.74,Yes,Yes,Yes,No,"Yes, received Tdap",No,Yes
4,District of Columbia,Male,Very good,0.0,3.0,Within past year (anytime less than 12 months ...,Yes,8.0,"6 or more, but not all",No,...,1.80,77.56,23.85,No,No,Yes,No,"No, did not receive any tetanus shot in the pa...",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246016,Ohio,Male,Fair,24.0,4.0,Within past 2 years (1 year but less than 2 ye...,No,5.0,None of them,Yes,...,1.93,108.86,29.21,Yes,No,No,Yes,"Yes, received tetanus shot but not sure what type",No,Yes
246017,Ohio,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,1 to 5,Yes,...,1.73,86.18,28.89,Yes,No,No,No,"Yes, received tetanus shot but not sure what type",No,Yes
246018,Ohio,Female,Good,0.0,30.0,Within past year (anytime less than 12 months ...,Yes,6.0,1 to 5,Yes,...,1.70,78.47,27.10,No,Yes,Yes,Yes,"Yes, received Tdap",No,No
246019,Ohio,Female,Fair,15.0,12.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,Yes,...,1.63,108.86,41.20,No,No,Yes,No,"Yes, received tetanus shot, but not Tdap",No,Yes
