In [2]:
import pandas as pd
import sys
import os
import numpy as np
from collections import defaultdict

# Load the dataset from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='df5.csv')

# Preview the first five rows as a quick sanity check of the loaded columns
df.head(n=5)


Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-20),0,Unknown,Referral,1,41,0,1,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[0-20),1,Home,Other,3,59,0,18,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-40),1,Home,Other,2,11,5,13,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[20-40),1,Home,Other,2,44,1,16,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),1,Home,Other,1,51,0,8,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
# Compute the P_29 privacy score of the unfiltered dataset


def calculate_k_anonymity(group):
    """Return the k-anonymity of one group, i.e. how many records it holds.

    Args:
        group: Any sized collection of records; callers in this notebook
            pass the DataFrame of one quasi-identifier group.

    Returns:
        The number of records in ``group``.
    """
    group_size = len(group)
    return group_size

def calculate_normalized_entropy(series):
    """Return the Shannon entropy of ``series`` scaled into the range [0, 1].

    The base-2 entropy of the observed value frequencies is divided by
    ``log2`` of the number of distinct values, so 1 means the values are
    evenly spread and 0 means there is no diversity at all.  Missing values
    are ignored, because ``nunique`` and ``value_counts`` drop them by
    default.

    Args:
        series: pandas Series holding the values of one attribute.

    Returns:
        0 when the series is empty, holds only missing values, or holds a
        single distinct value; otherwise the normalized entropy as a float.
    """
    if series.empty:
        return 0

    unique_values = series.nunique()
    # ``<= 1`` rather than ``== 1``: an all-missing series has zero distinct
    # values, and dividing by log2(0) would yield -0.0 plus a RuntimeWarning.
    if unique_values <= 1:
        return 0

    value_counts = series.value_counts(normalize=True)
    total_entropy = 0
    for count in value_counts:
        # Skip zero frequencies, for which log2 is undefined.
        if count > 0:
            total_entropy -= count * np.log2(count)

    return total_entropy / np.log2(unique_values)

def calculate_global_distribution(series):
    """Map each distinct value of ``series`` to its relative frequency.

    Args:
        series: pandas Series of attribute values.

    Returns:
        dict of ``value -> share of rows``, as produced by
        ``value_counts(normalize=True)``.
    """
    return series.value_counts(normalize=True).to_dict()

def compute_t_closeness(series, global_distribution):
    """Return the total variation distance between a group and the global distribution.

    The distance is ``0.5 * sum(|p - q|)`` where ``p`` is the relative
    frequency of each value in ``series`` and ``q`` the frequency recorded in
    ``global_distribution``.

    Args:
        series: pandas Series with the attribute values of one group.
        global_distribution: dict mapping each value to its overall relative
            frequency.

    Returns:
        The distance as a float; 0 means identical distributions.
    """
    class_distribution = series.value_counts(normalize=True)

    # Compare over the union of both supports.  Global keys come first so the
    # result is unchanged whenever the group only holds globally known
    # values; values seen only in the group are appended so their mass is
    # counted instead of being silently dropped by ``reindex``.
    combined_index = list(global_distribution.keys())
    combined_index += [
        value for value in class_distribution.index if value not in global_distribution
    ]
    class_distribution = class_distribution.reindex(combined_index, fill_value=0)

    p_values = class_distribution.values
    q_values = np.array([global_distribution.get(k, 0) for k in combined_index])

    t_closeness = 0.5 * np.sum(np.abs(p_values - q_values))
    return t_closeness

def calculate_t_closeness(df, quasi_identifiers, sensitive_attributes):
    """Compute, per quasi-identifier group, the t-closeness of every sensitive attribute.

    Args:
        df: DataFrame containing the quasi-identifier and sensitive columns.
        quasi_identifiers: Column name or list of column names to group by.
        sensitive_attributes: Columns whose per-group distribution is
            compared with their distribution over the whole of ``df``.

    Returns:
        DataFrame with one row per group: a ``'Quasi-identifiers'`` label
        (``"col: value, ..."``) followed by one ``'t-closeness_<attribute>'``
        column per sensitive attribute.
    """
    # Accept a single column name as well as a list of names.
    if isinstance(quasi_identifiers, str):
        qi_names = [quasi_identifiers]
    else:
        qi_names = list(quasi_identifiers)

    global_distributions = {
        attribute: calculate_global_distribution(df[attribute])
        for attribute in sensitive_attributes
    }

    results = []
    for group_name, group_df in df.groupby(quasi_identifiers):
        # Grouping by a single key can yield a scalar group name; zipping a
        # scalar string with the column names would pair them with its
        # characters, so normalise the key to a tuple first.
        key_values = group_name if isinstance(group_name, tuple) else (group_name,)

        group_result = {
            'Quasi-identifiers': ', '.join(
                f"{qi}: {value}" for qi, value in zip(qi_names, key_values)
            )
        }
        for attribute in sensitive_attributes:
            group_result[f't-closeness_{attribute}'] = compute_t_closeness(
                group_df[attribute], global_distributions[attribute]
            )
        results.append(group_result)

    return pd.DataFrame(results)

def calculate_P_29_score(k_value, l_value, t_value, w_k=0.5, w_l=0.25, w_t=0.25):
    """Combine k-anonymity, l-diversity and t-closeness results into one score.

    The score is 0.0 when any group has k == 1, any normalized entropy is 0,
    or any t-value exceeds 0.5.  Otherwise it is
    ``w_k * (1 - 1/k_min) + w_l * mean(l) + w_t * (1 - mean(normalized t))``,
    where each t column is min-max normalised across groups.

    Args:
        k_value: DataFrame with ``'Quasi-identifiers'`` and ``'k-anonymity'``
            columns, one row per group.
        l_value: DataFrame whose first column is ``'Quasi-identifiers'`` and
            whose remaining columns hold normalized entropy values.
        t_value: DataFrame whose first column is ``'Quasi-identifiers'`` and
            whose remaining columns hold t-closeness values.
        w_k, w_l, w_t: Weights of the three components.

    Returns:
        Tuple ``(score, problematic_info, reasons, k_min, min_l, max_t)``
        where ``problematic_info`` is a list of ``(group label, reason)``
        pairs and ``reasons`` a list of summary strings.
    """
    reasons = []
    problematic_info = []

    l_scores = l_value.iloc[:, 1:]
    t_scores = t_value.iloc[:, 1:]

    k_min = k_value['k-anonymity'].min()
    min_l_value = l_scores.min().min()
    max_t_value = t_scores.max().max()

    # Evaluate each disqualifying condition once and reuse the result.
    k_is_one = k_min == 1
    l_has_zero = l_scores.eq(0).any().any()
    t_exceeds = (t_scores.astype(float) > 0.5).any().any()

    if k_is_one:
        reasons.append("k-anonymity is 1")
        problematic_rows = k_value[k_value['k-anonymity'] == 1]['Quasi-identifiers'].tolist()
        problematic_info.extend([(row, "k-anonymity is 1") for row in problematic_rows])

    if l_has_zero:
        reasons.append("normalized entropy l-value is 0 for some attribute")
        for col in l_value.columns[1:]:
            problematic_rows = l_value[l_value[col] == 0]['Quasi-identifiers'].tolist()
            problematic_info.extend([(row, f"normalized entropy l-value is 0 for {col}") for row in problematic_rows])

    if t_exceeds:
        reasons.append("t-value exceeds 0.5 for some attribute")
        for col in t_value.columns[1:]:
            # Object-typed columns are not listed, matching the summary above
            # only for columns already stored as numbers.
            if t_value[col].dtype != 'object':
                problematic_rows = t_value[t_value[col].astype(float) > 0.5]['Quasi-identifiers'].tolist()
                problematic_info.extend([(row, f"t-value exceeds 0.5 for {col}") for row in problematic_rows])

    if k_is_one or l_has_zero or t_exceeds:
        return 0.0, problematic_info, reasons, k_min, min_l_value, max_t_value

    normalized_l_value = l_scores.mean().mean()

    t_value_normalized = t_value.copy()
    for column in t_value.columns[1:]:
        min_val = t_value[column].min()
        max_val = t_value[column].max()
        # A column that is constant across groups has zero range and becomes
        # NaN here; the means below skip NaN, so it is simply left out.
        t_value_normalized[column] = (t_value[column] - min_val) / (max_val - min_val)

    normalized_t_value = t_value_normalized.iloc[:, 1:].mean().mean()
    if pd.isna(normalized_t_value):
        # Every t column was constant (e.g. a single group): there is no
        # spread to penalise, so use 0 instead of letting NaN reach the score.
        normalized_t_value = 0.0

    P_29_score = w_k * (1 - (1 / k_min)) + w_l * normalized_l_value + w_t * (1 - normalized_t_value)

    return P_29_score, problematic_info, reasons, k_min, min_l_value, max_t_value

def analyze_privacy(df, quasi_identifiers, sensitive_attributes):
    """Measure k-anonymity, l-diversity and t-closeness per group and score the result.

    Args:
        df: DataFrame holding the quasi-identifier and sensitive columns.
        quasi_identifiers: List of column names that define the groups.
        sensitive_attributes: List of column names to evaluate inside each group.

    Returns:
        dict with the P_29 score, the problem list and reasons reported by
        ``calculate_P_29_score``, the extreme k/l/t values, and the per-group
        ``k_value``, ``l_value`` and ``t_value`` tables.
    """
    entropy_prefix = 'Normalized Entropy l-diversity'
    columns = defaultdict(list)

    for _, members in df.groupby(quasi_identifiers):
        group_size = calculate_k_anonymity(members)

        for attribute in sensitive_attributes:
            columns[f'{entropy_prefix}_{attribute}'].append(
                calculate_normalized_entropy(members[attribute])
            )

        # Every row of a group shares the same key values, so the first row
        # is enough to build the label.
        label_parts = [f"{qi}: {members[qi].iloc[0]}" for qi in quasi_identifiers]
        columns['Quasi-identifiers'].append(', '.join(label_parts))
        columns['k-anonymity'].append(group_size)

    results_df = pd.DataFrame(columns)

    t_value = calculate_t_closeness(df, quasi_identifiers, sensitive_attributes)

    k_value = results_df[['Quasi-identifiers', 'k-anonymity']].copy()
    entropy_columns = [col for col in results_df.columns if col.startswith(entropy_prefix)]
    l_value = results_df[['Quasi-identifiers'] + entropy_columns].copy()

    score, problems, reasons, k_min, min_l_value, max_t_value = calculate_P_29_score(
        k_value, l_value, t_value
    )

    summary = {}
    summary["P_29_score"] = score
    summary["problematic_info"] = problems
    summary["reasons"] = reasons
    summary["k_min"] = k_min
    summary["min_l_value"] = min_l_value
    summary["max_t_value"] = max_t_value
    summary["k_value"] = k_value
    summary["l_value"] = l_value
    summary["t_value"] = t_value
    return summary

# Columns used to form groups, and the attributes evaluated inside each group
quasi_identifiers = ['race', 'gender', 'age']
sensitive_attributes = ['diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'readmitted']

# Score the dataset loaded into `df`
privacy_results = analyze_privacy(df, quasi_identifiers, sensitive_attributes)

# Report the score, the reasons for a zero score, and the offending groups
for heading, result_key in (("P_29 Score:", "P_29_score"), ("Reasons:", "reasons")):
    print(heading, privacy_results[result_key])

print("Problematic Information:")
for info in privacy_results["problematic_info"]:
    print(f"Problem in {info[0]} due to {info[1]}")

# Report the extreme value of each metric
for heading, result_key in (
    ("Minimum k-anonymity:", "k_min"),
    ("Minimum normalized l-value:", "min_l_value"),
    ("Maximum t-value:", "max_t_value"),
):
    print(heading, privacy_results[result_key])


P_29 Score: 0.0
Reasons: ['normalized entropy l-value is 0 for some attribute', 't-value exceeds 0.5 for some attribute']
Problematic Information:
Problem in race: AfricanAmerican, gender: Female, age: [0-20) due to normalized entropy l-value is 0 for Normalized Entropy l-diversity_max_glu_serum
Problem in race: other, gender: Female, age: [0-20) due to normalized entropy l-value is 0 for Normalized Entropy l-diversity_max_glu_serum
Problem in race: other, gender: Male, age: [0-20) due to normalized entropy l-value is 0 for Normalized Entropy l-diversity_max_glu_serum
Problem in race: AfricanAmerican, gender: Female, age: [0-20) due to t-value exceeds 0.5 for t-closeness_diag_1
Problem in race: AfricanAmerican, gender: Male, age: [0-20) due to t-value exceeds 0.5 for t-closeness_diag_1
Problem in race: Caucasian, gender: Female, age: [0-20) due to t-value exceeds 0.5 for t-closeness_diag_1
Problem in race: Caucasian, gender: Male, age: [0-20) due to t-value exceeds 0.5 for t-closeness_

In [4]:
# Delete quasi-identifier groups until every remaining group satisfies the k, l and t (<= 0.5) thresholds

def calculate_global_distribution(series):
    """Map each distinct value of ``series`` to its relative frequency.

    Args:
        series: pandas Series of attribute values.

    Returns:
        dict of ``value -> share of rows``, as produced by
        ``value_counts(normalize=True)``.
    """
    return series.value_counts(normalize=True).to_dict()

def compute_t_closeness(series, global_distribution):
    """Return the total variation distance between a group and the global distribution.

    The distance is ``0.5 * sum(|p - q|)`` where ``p`` is the relative
    frequency of each value in ``series`` and ``q`` the frequency recorded in
    ``global_distribution``.

    Args:
        series: pandas Series with the attribute values of one group.
        global_distribution: dict mapping each value to its overall relative
            frequency.

    Returns:
        The distance as a float; 0 means identical distributions.
    """
    class_distribution = series.value_counts(normalize=True)

    # Compare over the union of both supports.  Global keys come first so the
    # result is unchanged whenever the group only holds globally known
    # values; values seen only in the group are appended so their mass is
    # counted instead of being silently dropped by ``reindex``.
    combined_index = list(global_distribution.keys())
    combined_index += [
        value for value in class_distribution.index if value not in global_distribution
    ]
    class_distribution = class_distribution.reindex(combined_index, fill_value=0)

    p_values = class_distribution.values
    q_values = np.array([global_distribution.get(k, 0) for k in combined_index])

    t_closeness = 0.5 * np.sum(np.abs(p_values - q_values))
    return t_closeness

def calculate_t_closeness(df, quasi_identifiers, sensitive_attributes):
    """Compute, per quasi-identifier group, the t-closeness of every sensitive attribute.

    Args:
        df: DataFrame containing the quasi-identifier and sensitive columns.
        quasi_identifiers: Column name or list of column names to group by.
        sensitive_attributes: Columns whose per-group distribution is
            compared with their distribution over the whole of ``df``.

    Returns:
        DataFrame with one row per group: a ``'Quasi-identifiers'`` label
        (``"col: value, ..."``) followed by one ``'t-closeness_<attribute>'``
        column per sensitive attribute.
    """
    # Accept a single column name as well as a list of names.
    if isinstance(quasi_identifiers, str):
        qi_names = [quasi_identifiers]
    else:
        qi_names = list(quasi_identifiers)

    global_distributions = {
        attribute: calculate_global_distribution(df[attribute])
        for attribute in sensitive_attributes
    }

    results = []
    for group_name, group_df in df.groupby(quasi_identifiers):
        # Grouping by a single key can yield a scalar group name; zipping a
        # scalar string with the column names would pair them with its
        # characters, so normalise the key to a tuple first.
        key_values = group_name if isinstance(group_name, tuple) else (group_name,)

        group_result = {
            'Quasi-identifiers': ', '.join(
                f"{qi}: {value}" for qi, value in zip(qi_names, key_values)
            )
        }
        for attribute in sensitive_attributes:
            group_result[f't-closeness_{attribute}'] = compute_t_closeness(
                group_df[attribute], global_distributions[attribute]
            )
        results.append(group_result)

    return pd.DataFrame(results)

def calculate_k_anonymity(group):
    """Return the k-anonymity of one group, i.e. how many records it holds.

    Args:
        group: Any sized collection of records; callers in this notebook
            pass the DataFrame of one quasi-identifier group.

    Returns:
        The number of records in ``group``.
    """
    group_size = len(group)
    return group_size

def calculate_normalized_entropy(series):
    """Return the Shannon entropy of ``series`` scaled into the range [0, 1].

    The base-2 entropy of the observed value frequencies is divided by
    ``log2`` of the number of distinct values, so 1 means the values are
    evenly spread and 0 means there is no diversity at all.  Missing values
    are ignored, because ``nunique`` and ``value_counts`` drop them by
    default.

    Args:
        series: pandas Series holding the values of one attribute.

    Returns:
        0 when the series is empty, holds only missing values, or holds a
        single distinct value; otherwise the normalized entropy as a float.
    """
    if series.empty:
        return 0

    unique_values = series.nunique()
    # ``<= 1`` rather than ``== 1``: an all-missing series has zero distinct
    # values, and dividing by log2(0) would yield -0.0 plus a RuntimeWarning.
    if unique_values <= 1:
        return 0

    value_counts = series.value_counts(normalize=True)
    total_entropy = 0
    for count in value_counts:
        # Skip zero frequencies, for which log2 is undefined.
        if count > 0:
            total_entropy -= count * np.log2(count)

    return total_entropy / np.log2(unique_values)

def calculate_k_l_values(df, quasi_identifiers, sensitive_attributes):
    """Compute k-anonymity and normalized-entropy l-diversity for every group.

    Args:
        df: DataFrame containing the quasi-identifier and sensitive columns.
        quasi_identifiers: Column name or list of column names to group by.
        sensitive_attributes: Columns whose normalized entropy is measured
            inside each group.

    Returns:
        DataFrame with one row per group and the columns
        ``'Quasi-identifiers'``, ``'k-anonymity'`` and one
        ``'Normalized Entropy l-diversity_<attribute>'`` per attribute, in
        that order.  The frame has no columns when ``df`` yields no groups.
    """
    # Accept a single column name as well as a list of names.
    if isinstance(quasi_identifiers, str):
        qi_names = [quasi_identifiers]
    else:
        qi_names = list(quasi_identifiers)

    results = defaultdict(list)

    for name, group in df.groupby(quasi_identifiers):
        # Grouping by a single key can yield a scalar group name; zipping a
        # scalar string with the column names would pair them with its
        # characters, so normalise the key to a tuple first.
        key_values = name if isinstance(name, tuple) else (name,)

        results['Quasi-identifiers'].append(
            ', '.join(f"{qi}: {value}" for qi, value in zip(qi_names, key_values))
        )
        results['k-anonymity'].append(calculate_k_anonymity(group))

        for attribute in sensitive_attributes:
            results[f'Normalized Entropy l-diversity_{attribute}'].append(
                calculate_normalized_entropy(group[attribute])
            )

    return pd.DataFrame(results)

def ensure_privacy(df, quasi_identifiers, sensitive_attributes, t_threshold=0.5, k_threshold=1, l_threshold=0):
    """Drop quasi-identifier groups until every remaining group meets the thresholds.

    A group is kept only when its k-anonymity is strictly greater than
    ``k_threshold``, every normalized entropy is strictly greater than
    ``l_threshold`` and every t-closeness value is at most ``t_threshold``.
    Removing groups changes the distribution over the remaining rows, so the
    metrics are recomputed after each round of deletions.

    Args:
        df: DataFrame containing the quasi-identifier and sensitive columns.
        quasi_identifiers: Column name or list of column names that define
            the groups.
        sensitive_attributes: Columns evaluated for l-diversity and t-closeness.
        t_threshold: Largest t-closeness value a group may have.
        k_threshold: Group size that must be exceeded.
        l_threshold: Normalized entropy that must be exceeded.

    Returns:
        The rows of ``df`` that belong to compliant groups.  Rows that fall
        into no group (the k/l table has no row for them) are never
        evaluated and are returned unchanged.

    Raises:
        RuntimeError: If groups violate the thresholds but no row could be
            removed, which would otherwise loop forever.
    """
    if isinstance(quasi_identifiers, str):
        qi_columns = [quasi_identifiers]
    else:
        qi_columns = list(quasi_identifiers)

    while True:
        k_l_values = calculate_k_l_values(df, quasi_identifiers, sensitive_attributes)
        if k_l_values.empty:
            # No group is left to check (all were deleted, or none could be
            # formed); the metric columns would not even exist below.
            break

        t_value = calculate_t_closeness(df, quasi_identifiers, sensitive_attributes)

        k_condition = k_l_values['k-anonymity'] > k_threshold
        l_condition = (k_l_values.iloc[:, 2:].astype(float) > l_threshold).all(axis=1)
        t_condition = (t_value.iloc[:, 1:].astype(float) <= t_threshold).all(axis=1)

        group_ok = (k_condition & l_condition & t_condition).to_numpy()
        if group_ok.all():
            break

        # Select the rows of violating groups by position instead of by
        # re-building and comparing label strings for every row.  Grouping
        # the key columns with a fresh 0..n-1 index yields the groups in the
        # same order as the metric tables, and each group's index is then the
        # positional location of its rows in ``df``.
        keep_row = np.ones(len(df), dtype=bool)
        positional_keys = df[qi_columns].reset_index(drop=True)
        for satisfied, (_, members) in zip(group_ok, positional_keys.groupby(qi_columns)):
            if not satisfied:
                keep_row[members.index.to_numpy()] = False

        if keep_row.all():
            raise RuntimeError(
                "ensure_privacy could not remove any row of the violating groups"
            )

        df = df[keep_row]

    return df

# Columns used to form groups, and the attributes evaluated inside each group
quasi_identifiers = ['race', 'gender', 'age']
sensitive_attributes = ['diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'readmitted']

# Remove every group that misses the k, l or t threshold
filtered_df = ensure_privacy(
    df,
    quasi_identifiers,
    sensitive_attributes,
    t_threshold=0.5,
    k_threshold=1,
    l_threshold=0,
)

# Show the heading and the remaining rows on consecutive lines
print("Filtered DataFrame:", filtered_df, sep="\n")


Filtered DataFrame:
                   race  gender       age  admission_type_id  \
2       AfricanAmerican  Female   [20-40)                  1   
3             Caucasian    Male   [20-40)                  1   
4             Caucasian    Male   [40-50)                  1   
5             Caucasian    Male   [50-60)                  1   
6             Caucasian    Male   [60-70)                  1   
...                 ...     ...       ...                ...   
101758  AfricanAmerican    Male   [70-80)                  1   
101759  AfricanAmerican  Female  [80-100)                  1   
101760        Caucasian    Male   [70-80)                  1   
101761        Caucasian  Female  [80-100)                  1   
101762        Caucasian    Male   [70-80)                  1   

       discharge_disposition_id admission_source_id  time_in_hospital  \
2                          Home               Other                 2   
3                          Home               Other              

In [5]:
import pandas as pd
import numpy as np
from collections import defaultdict

def calculate_k_anonymity(group):
    """Return the k-anonymity of one group, i.e. how many records it holds.

    Args:
        group: Any sized collection of records; callers in this notebook
            pass the DataFrame of one quasi-identifier group.

    Returns:
        The number of records in ``group``.
    """
    group_size = len(group)
    return group_size

def calculate_normalized_entropy(series):
    """Return the Shannon entropy of ``series`` scaled into the range [0, 1].

    The base-2 entropy of the observed value frequencies is divided by
    ``log2`` of the number of distinct values, so 1 means the values are
    evenly spread and 0 means there is no diversity at all.  Missing values
    are ignored, because ``nunique`` and ``value_counts`` drop them by
    default.

    Args:
        series: pandas Series holding the values of one attribute.

    Returns:
        0 when the series is empty, holds only missing values, or holds a
        single distinct value; otherwise the normalized entropy as a float.
    """
    if series.empty:
        return 0

    unique_values = series.nunique()
    # ``<= 1`` rather than ``== 1``: an all-missing series has zero distinct
    # values, and dividing by log2(0) would yield -0.0 plus a RuntimeWarning.
    if unique_values <= 1:
        return 0

    value_counts = series.value_counts(normalize=True)
    total_entropy = 0
    for count in value_counts:
        # Skip zero frequencies, for which log2 is undefined.
        if count > 0:
            total_entropy -= count * np.log2(count)

    return total_entropy / np.log2(unique_values)

def calculate_global_distribution(series):
    """Map each distinct value of ``series`` to its relative frequency.

    Args:
        series: pandas Series of attribute values.

    Returns:
        dict of ``value -> share of rows``, as produced by
        ``value_counts(normalize=True)``.
    """
    return series.value_counts(normalize=True).to_dict()

def compute_t_closeness(series, global_distribution):
    """Return the total variation distance between a group and the global distribution.

    The distance is ``0.5 * sum(|p - q|)`` where ``p`` is the relative
    frequency of each value in ``series`` and ``q`` the frequency recorded in
    ``global_distribution``.

    Args:
        series: pandas Series with the attribute values of one group.
        global_distribution: dict mapping each value to its overall relative
            frequency.

    Returns:
        The distance as a float; 0 means identical distributions.
    """
    class_distribution = series.value_counts(normalize=True)

    # Compare over the union of both supports.  Global keys come first so the
    # result is unchanged whenever the group only holds globally known
    # values; values seen only in the group are appended so their mass is
    # counted instead of being silently dropped by ``reindex``.
    combined_index = list(global_distribution.keys())
    combined_index += [
        value for value in class_distribution.index if value not in global_distribution
    ]
    class_distribution = class_distribution.reindex(combined_index, fill_value=0)

    p_values = class_distribution.values
    q_values = np.array([global_distribution.get(k, 0) for k in combined_index])

    t_closeness = 0.5 * np.sum(np.abs(p_values - q_values))
    return t_closeness

def calculate_t_closeness(df, quasi_identifiers, sensitive_attributes):
    """Compute, per quasi-identifier group, the t-closeness of every sensitive attribute.

    Args:
        df: DataFrame containing the quasi-identifier and sensitive columns.
        quasi_identifiers: Column name or list of column names to group by.
        sensitive_attributes: Columns whose per-group distribution is
            compared with their distribution over the whole of ``df``.

    Returns:
        DataFrame with one row per group: a ``'Quasi-identifiers'`` label
        (``"col: value, ..."``) followed by one ``'t-closeness_<attribute>'``
        column per sensitive attribute.
    """
    # Accept a single column name as well as a list of names.
    if isinstance(quasi_identifiers, str):
        qi_names = [quasi_identifiers]
    else:
        qi_names = list(quasi_identifiers)

    global_distributions = {
        attribute: calculate_global_distribution(df[attribute])
        for attribute in sensitive_attributes
    }

    results = []
    for group_name, group_df in df.groupby(quasi_identifiers):
        # Grouping by a single key can yield a scalar group name; zipping a
        # scalar string with the column names would pair them with its
        # characters, so normalise the key to a tuple first.
        key_values = group_name if isinstance(group_name, tuple) else (group_name,)

        group_result = {
            'Quasi-identifiers': ', '.join(
                f"{qi}: {value}" for qi, value in zip(qi_names, key_values)
            )
        }
        for attribute in sensitive_attributes:
            group_result[f't-closeness_{attribute}'] = compute_t_closeness(
                group_df[attribute], global_distributions[attribute]
            )
        results.append(group_result)

    return pd.DataFrame(results)

def calculate_P_29_score(k_value, l_value, t_value, w_k=0.5, w_l=0.25, w_t=0.25):
    """Combine k-anonymity, l-diversity and t-closeness results into one score.

    The score is 0.0 when any group has k == 1, any normalized entropy is 0,
    or any t-value exceeds 0.5.  Otherwise it is
    ``w_k * (1 - 1/k_min) + w_l * mean(l) + w_t * (1 - mean(normalized t))``,
    where each t column is min-max normalised across groups.

    Args:
        k_value: DataFrame with ``'Quasi-identifiers'`` and ``'k-anonymity'``
            columns, one row per group.
        l_value: DataFrame whose first column is ``'Quasi-identifiers'`` and
            whose remaining columns hold normalized entropy values.
        t_value: DataFrame whose first column is ``'Quasi-identifiers'`` and
            whose remaining columns hold t-closeness values.
        w_k, w_l, w_t: Weights of the three components.

    Returns:
        Tuple ``(score, problematic_info, reasons, k_min, min_l, max_t)``
        where ``problematic_info`` is a list of ``(group label, reason)``
        pairs and ``reasons`` a list of summary strings.
    """
    reasons = []
    problematic_info = []

    l_scores = l_value.iloc[:, 1:]
    t_scores = t_value.iloc[:, 1:]

    k_min = k_value['k-anonymity'].min()
    min_l_value = l_scores.min().min()
    max_t_value = t_scores.max().max()

    # Evaluate each disqualifying condition once and reuse the result.
    k_is_one = k_min == 1
    l_has_zero = l_scores.eq(0).any().any()
    t_exceeds = (t_scores.astype(float) > 0.5).any().any()

    if k_is_one:
        reasons.append("k-anonymity is 1")
        problematic_rows = k_value[k_value['k-anonymity'] == 1]['Quasi-identifiers'].tolist()
        problematic_info.extend([(row, "k-anonymity is 1") for row in problematic_rows])

    if l_has_zero:
        reasons.append("normalized entropy l-value is 0 for some attribute")
        for col in l_value.columns[1:]:
            problematic_rows = l_value[l_value[col] == 0]['Quasi-identifiers'].tolist()
            problematic_info.extend([(row, f"normalized entropy l-value is 0 for {col}") for row in problematic_rows])

    if t_exceeds:
        reasons.append("t-value exceeds 0.5 for some attribute")
        for col in t_value.columns[1:]:
            # Object-typed columns are not listed, matching the summary above
            # only for columns already stored as numbers.
            if t_value[col].dtype != 'object':
                problematic_rows = t_value[t_value[col].astype(float) > 0.5]['Quasi-identifiers'].tolist()
                problematic_info.extend([(row, f"t-value exceeds 0.5 for {col}") for row in problematic_rows])

    if k_is_one or l_has_zero or t_exceeds:
        return 0.0, problematic_info, reasons, k_min, min_l_value, max_t_value

    normalized_l_value = l_scores.mean().mean()

    t_value_normalized = t_value.copy()
    for column in t_value.columns[1:]:
        min_val = t_value[column].min()
        max_val = t_value[column].max()
        # A column that is constant across groups has zero range and becomes
        # NaN here; the means below skip NaN, so it is simply left out.
        t_value_normalized[column] = (t_value[column] - min_val) / (max_val - min_val)

    normalized_t_value = t_value_normalized.iloc[:, 1:].mean().mean()
    if pd.isna(normalized_t_value):
        # Every t column was constant (e.g. a single group): there is no
        # spread to penalise, so use 0 instead of letting NaN reach the score.
        normalized_t_value = 0.0

    P_29_score = w_k * (1 - (1 / k_min)) + w_l * normalized_l_value + w_t * (1 - normalized_t_value)

    return P_29_score, problematic_info, reasons, k_min, min_l_value, max_t_value

def analyze_privacy(df, quasi_identifiers, sensitive_attributes):
    """Measure k-anonymity, l-diversity and t-closeness per group and score the result.

    Args:
        df: DataFrame holding the quasi-identifier and sensitive columns.
        quasi_identifiers: List of column names that define the groups.
        sensitive_attributes: List of column names to evaluate inside each group.

    Returns:
        dict with the P_29 score, the problem list and reasons reported by
        ``calculate_P_29_score``, the extreme k/l/t values, and the per-group
        ``k_value``, ``l_value`` and ``t_value`` tables.
    """
    entropy_prefix = 'Normalized Entropy l-diversity'
    columns = defaultdict(list)

    for _, members in df.groupby(quasi_identifiers):
        group_size = calculate_k_anonymity(members)

        for attribute in sensitive_attributes:
            columns[f'{entropy_prefix}_{attribute}'].append(
                calculate_normalized_entropy(members[attribute])
            )

        # Every row of a group shares the same key values, so the first row
        # is enough to build the label.
        label_parts = [f"{qi}: {members[qi].iloc[0]}" for qi in quasi_identifiers]
        columns['Quasi-identifiers'].append(', '.join(label_parts))
        columns['k-anonymity'].append(group_size)

    results_df = pd.DataFrame(columns)

    t_value = calculate_t_closeness(df, quasi_identifiers, sensitive_attributes)

    k_value = results_df[['Quasi-identifiers', 'k-anonymity']].copy()
    entropy_columns = [col for col in results_df.columns if col.startswith(entropy_prefix)]
    l_value = results_df[['Quasi-identifiers'] + entropy_columns].copy()

    score, problems, reasons, k_min, min_l_value, max_t_value = calculate_P_29_score(
        k_value, l_value, t_value
    )

    summary = {}
    summary["P_29_score"] = score
    summary["problematic_info"] = problems
    summary["reasons"] = reasons
    summary["k_min"] = k_min
    summary["min_l_value"] = min_l_value
    summary["max_t_value"] = max_t_value
    summary["k_value"] = k_value
    summary["l_value"] = l_value
    summary["t_value"] = t_value
    return summary

# Columns used to form groups, and the attributes evaluated inside each group
quasi_identifiers = ['race', 'gender', 'age']
sensitive_attributes = ['diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'readmitted']

# Score the filtered dataset held in `filtered_df`
privacy_results = analyze_privacy(filtered_df, quasi_identifiers, sensitive_attributes)

# Report the score, the reasons for a zero score, and the offending groups
for heading, result_key in (("P_29 Score:", "P_29_score"), ("Reasons:", "reasons")):
    print(heading, privacy_results[result_key])

print("Problematic Information:")
for info in privacy_results["problematic_info"]:
    print(f"Problem in {info[0]} due to {info[1]}")

# Report the extreme value of each metric
for heading, result_key in (
    ("Minimum k-anonymity:", "k_min"),
    ("Minimum normalized l-value:", "min_l_value"),
    ("Maximum t-value:", "max_t_value"),
):
    print(heading, privacy_results[result_key])


P_29 Score: 0.8375317503843992
Reasons: []
Problematic Information:
Minimum k-anonymity: 195
Minimum normalized l-value: 0.07358616908386309
Maximum t-value: 0.4574499284491258


In [6]:
# Persist the filtered rows; index=False keeps the row index out of the CSV
output_file = 'df5-filter.csv'
filtered_df.to_csv(path_or_buf=output_file, index=False)