In [None]:
import pandas as pd 
import matplotlib.pyplot  as plt
import seaborn as sns 

In [None]:
def get_Comb(df):
    """Attach the per-combination diagnosis rate ("risk") to every row.

    Rows are grouped by the values of every column except ``'diagnosis'``.
    The mean of ``'diagnosis'`` within each group is broadcast back to the
    rows of that group as a new ``'risk'`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'diagnosis'`` column. Every other column is treated
        as part of the feature combination. The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with a fresh ``RangeIndex``.
        Columns are the feature columns, then ``'risk'``, then
        ``'diagnosis'``. A column named ``'hash'`` is never returned: that
        name has historically been a scratch column of this helper, so an
        incoming ``'hash'`` column takes part in the grouping but is dropped
        from the result.

    Raises
    ------
    KeyError
        If ``df`` has no ``'diagnosis'`` column.
    """
    feature_cols = [name for name in df.columns if name != 'diagnosis']

    # Group on the feature values themselves instead of a row-wise
    # ``hash(tuple(row))``: this is vectorised, cannot collide, and treats
    # NaN consistently (Python >= 3.10 hashes NaN by object identity, so
    # equal rows containing NaN are not guaranteed to hash equal).
    if feature_cols:
        risk = (
            df.groupby(feature_cols, dropna=False)['diagnosis']
            .transform('mean')
        )
    else:
        # No feature columns: all rows share the single (empty) combination.
        risk = pd.Series(df['diagnosis'].mean(), index=df.index)

    # Build the result on a new frame so the caller's DataFrame is untouched.
    kept_cols = [name for name in feature_cols if name != 'hash']
    risk_df = df[kept_cols].reset_index(drop=True)
    # Positional assignment: ``risk`` and ``diagnosis`` are row-aligned with
    # ``df``, whose index may not be a RangeIndex.
    risk_df['risk'] = risk.to_numpy()
    risk_df['diagnosis'] = df['diagnosis'].to_numpy()

    return risk_df

In [None]:
def get_dist_plot(df, frac=0.5, random_state=42):
    """Return the risk error between the full data and a random sample of it.

    Despite its name this function draws nothing; it returns the values
    whose distribution the caller may plot. The risk of every feature
    combination is computed once on ``df`` and once on
    ``df.sample(frac=frac, random_state=random_state)``. The result is the
    difference between the two for every combination present in both.

    Parameters
    ----------
    df : pandas.DataFrame
        Input accepted by ``get_Comb`` (features plus ``'diagnosis'``).
    frac : float, default 0.5
        Fraction of rows passed to ``DataFrame.sample``.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample``.

    Returns
    -------
    pandas.Series
        ``risk(full) - risk(sample)`` per shared feature combination, in the
        order the combinations first appear in ``df``, with a default
        ``RangeIndex``.
    """
    def _risk_per_combination(frame):
        # One row per distinct feature combination together with its risk.
        # A copy is passed because get_Comb may write a scratch column into
        # its argument; the caller's frame must stay untouched.
        return (
            get_Comb(frame.copy())
            .drop(columns=['diagnosis'])
            .drop_duplicates()
        )

    risk_full = _risk_per_combination(df)
    risk_sample = _risk_per_combination(
        df.sample(frac=frac, random_state=random_state)
    )

    # Join on the feature values directly rather than on a recomputed
    # row-wise ``hash(tuple(row))``: no Python-level loop, no collisions, and
    # NaN keys match each other in ``merge`` (tuple hashes containing NaN are
    # identity-based on Python >= 3.10 and so are not reliable join keys).
    feature_cols = [name for name in risk_full.columns if name != 'risk']
    merged = pd.merge(
        risk_full,
        risk_sample,
        on=feature_cols,
        how='inner',
        suffixes=('_full', '_sample'),
    )

    error = merged['risk_full'] - merged['risk_sample']
    return error

In [None]:
# Load the cleaned table and drop the columns that are not used as features.
df = pd.read_csv('Data/sparse_med_cleaned.csv')
df = df.drop(columns=['ID', 'fin_grossesse'])

# Fingerprint every feature combination (all columns except 'diagnosis').
# hash_pandas_object is vectorised and deterministic, unlike a row-wise
# ``hash(tuple(row))``, which runs a Python call per row and hashes NaN by
# object identity on Python >= 3.10.
df['hash'] = pd.util.hash_pandas_object(
    df.drop(columns=['diagnosis']), index=False
)
hash_counts = df['hash'].value_counts()

# Keep only combinations observed more than twice (at least 3 rows).
df = df[df['hash'].isin(hash_counts[hash_counts > 2].index)]
df = df.reset_index(drop=True)

# Risk error for increasing sample fractions.
error_25 = get_dist_plot(df, frac=0.25)
error_50 = get_dist_plot(df, frac=0.5)
error_75 = get_dist_plot(df, frac=0.75)
error_100 = get_dist_plot(df, frac=1)

# The Series are aligned on their positional index; if their lengths differ,
# the shorter columns are padded with NaN.
df_errors = pd.DataFrame({
    '25%': error_25,
    '50%': error_50,
    '75%': error_75,
    '100%': error_100
})

In [None]:
# Overlay one kernel-density curve per sampling fraction on the current axes.
for fraction_label, fraction_errors in (
    ("25%", error_25),
    ("50%", error_50),
    ("75%", error_75),
    ("100%", error_100),
):
    sns.kdeplot(fraction_errors, label=fraction_label, linewidth=2)

# Axis labels, title and legend so the figure stands on its own.
plt.xlabel("Error Values")
plt.ylabel("Density")
plt.title("Error Distributions")
plt.legend(title="Fraction of Data Used")

# Render the figure.
plt.show()

In [None]:
# One box per column of df_errors, i.e. one box per sampling fraction.
plt.figure(figsize=(8, 6))
boxplot_ax = sns.boxplot(data=df_errors)

# Formatting, applied to the axes seaborn drew on.
boxplot_ax.set_xlabel("Fraction of Data Used")
boxplot_ax.set_ylabel("Error Distribution")
boxplot_ax.set_title("Boxplot of Error Distributions")
boxplot_ax.grid(True, linestyle='--', alpha=0.5)

plt.show()