In [1]:
import pandas as pd
import numpy as np

In [2]:
def unicity(df, duplication_factor=2, percentage=0.1, random_state=None):
    """
    Pollute the uniqueness of a DataFrame by appending exact duplicates of a random subset of its rows.

    Each row is selected independently with probability ``percentage``, so the
    number of selected rows is only approximately ``percentage * len(df)``.
    The original rows come first, in their original order; every selected row is
    then appended ``duplication_factor - 1`` more times, and the result is given
    a fresh 0..n-1 index.

    Args:
    - df (pd.DataFrame): The DataFrame to pollute. It is not modified.
    - duplication_factor (int): Total number of occurrences of each selected row in the result. For example, 2 means each selected row is duplicated once; 1 adds no duplicates.
    - percentage (float): Probability, between 0 and 1, that a given row is selected for duplication.
    - random_state (None, int, np.random.Generator, ...): Seed or generator used to select the rows; any value accepted by ``np.random.default_rng``. With None (the default) NumPy's global random state is used, so ``np.random.seed`` still controls the outcome.

    Returns:
    - pd.DataFrame: A new DataFrame containing the original rows followed by the duplicates.

    Raises:
    - ValueError: If ``duplication_factor`` is below 1 or ``percentage`` is outside [0, 1].
    """
    # Validate first so that bad arguments fail before any work is done on the data.
    if duplication_factor < 1:
        raise ValueError("Le facteur de duplication doit être supérieur ou égal à 1.")

    if not 0 <= percentage <= 1:
        raise ValueError("Le pourcentage doit être compris entre 0 et 1.")

    # One uniform draw in [0, 1) per row; a row is selected when its draw is below `percentage`.
    n_rows = len(df)
    if random_state is None:
        draws = np.random.rand(n_rows)
    else:
        draws = np.random.default_rng(random_state).random(n_rows)
    rows_to_duplicate = df[draws < percentage]

    # A comprehension rather than `[x] * k`: with a NumPy integer `k`, list
    # multiplication would be dispatched to NumPy instead of repeating the list.
    duplicates = [rows_to_duplicate for _ in range(duplication_factor - 1)]

    # pd.concat builds a new frame, so no defensive copy of `df` is needed beforehand.
    return pd.concat([df] + duplicates, ignore_index=True)

In [3]:
# Load the source CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(
    '../../../Data/Classification/Indicators of Heart Disease/heart_2022_no_nans_clean.csv'
)
# Bare last expression: rendered by the notebook as a truncated preview of the frame.
df

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.60,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.70,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246017,Virgin Islands,Male,Very good,0.0,0.0,Within past 2 years (1 year but less than 2 ye...,Yes,6.0,None of them,No,...,1.78,102.06,32.28,Yes,No,No,No,"Yes, received tetanus shot but not sure what type",No,No
246018,Virgin Islands,Female,Fair,0.0,7.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.93,90.72,24.34,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,Yes
246019,Virgin Islands,Male,Good,0.0,15.0,Within past year (anytime less than 12 months ...,Yes,7.0,1 to 5,No,...,1.68,83.91,29.86,Yes,Yes,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,Yes
246020,Virgin Islands,Female,Excellent,2.0,2.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.70,83.01,28.66,No,Yes,Yes,No,"Yes, received tetanus shot but not sure what type",No,No


In [4]:
# The rows to duplicate are drawn from NumPy's global random state, so seed it
# here: the polluted frame is then identical on every Restart & Run All.
np.random.seed(42)
df_polluted = unicity(df, duplication_factor=2, percentage=0.2)
# Bare last expression: rendered by the notebook as a truncated preview of the frame.
df_polluted

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.60,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.70,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295072,Virgin Islands,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,5.0,"6 or more, but not all",No,...,1.55,64.41,26.83,No,No,No,No,"Yes, received tetanus shot but not sure what type",No,No
295073,Virgin Islands,Female,Very good,0.0,10.0,Within past year (anytime less than 12 months ...,Yes,8.0,None of them,No,...,1.60,74.84,29.23,Yes,Yes,No,Yes,"Yes, received tetanus shot, but not Tdap",No,Yes
295074,Virgin Islands,Female,Good,4.0,3.0,Within past 2 years (1 year but less than 2 ye...,Yes,5.0,None of them,No,...,1.68,108.86,38.74,Yes,Yes,No,Yes,"Yes, received tetanus shot but not sure what type",No,Yes
295075,Virgin Islands,Male,Good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,1 to 5,No,...,1.88,74.84,21.18,No,No,No,No,"Yes, received tetanus shot but not sure what type",No,No
