<a href="https://colab.research.google.com/github/Jigyass/Data-Privacy-and-Data-Security-Models/blob/main/Exponential_Mechanism.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [99]:
!unzip /content/drive/MyDrive/Data_Privacy_and_Data_Security/adult.zip

Archive:  /content/drive/MyDrive/Data_Privacy_and_Data_Security/adult.zip
replace Index? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: Index                   
  inflating: adult.data              
  inflating: adult.names             
  inflating: adult.test              
  inflating: old.adult.names         


In [117]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp
from scipy.stats import entropy
from collections import Counter
from numpy.linalg import norm

In [118]:
df = pd.read_csv('adult.data')

In [119]:
df.columns = ['Age', 'Workclass', 'Fnlwgt', 'Education', 'Education-Num', 'Marital-Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Capital-Gain', 'Capital-Loss', 'Hours-Per-Week', 'Native-Country', 'Income']

In [120]:
# 1. Original Dataset
original_most_frequent = df['Education'].value_counts().idxmax()
print(f"Most frequent 'Education' in original dataset: {original_most_frequent}")

# 2. Removing a record with the most frequent "Education"
df1 = df[df['Education'] != original_most_frequent].copy()
df1_most_frequent = df1['Education'].value_counts().idxmax()
print(f"Most frequent 'Education' in df1: {df1_most_frequent}")

# 3. Removing a record with the second most frequent "Education"
second_most_frequent = df['Education'].value_counts().index[1]
df2 = df[df['Education'] != second_most_frequent].copy()
df2_most_frequent = df2['Education'].value_counts().idxmax()
print(f"Most frequent 'Education' in df2: {df2_most_frequent}")

# 4. Removing a record with the least frequent "Education"
least_frequent = df['Education'].value_counts().idxmin()
df3 = df[df['Education'] != least_frequent].copy()
df3_most_frequent = df3['Education'].value_counts().idxmax()
print(f"Most frequent 'Education' in df3: {df3_most_frequent}")


Most frequent 'Education' in original dataset:  HS-grad
Most frequent 'Education' in df1:  Some-college
Most frequent 'Education' in df2:  HS-grad
Most frequent 'Education' in df3:  HS-grad


In [121]:
# Label encoding the 'Education' column
df['Education'] = df['Education'].astype('category')
df['Education_Cat'] = df['Education'].cat.codes

# Doing the same for the modified DataFrames (df1, df2, df3)
df1['Education'] = df1['Education'].astype('category')
df1['Education_Cat'] = df1['Education'].cat.codes

df2['Education'] = df2['Education'].astype('category')
df2['Education_Cat'] = df2['Education'].cat.codes

df3['Education'] = df3['Education'].astype('category')
df3['Education_Cat'] = df3['Education'].cat.codes


In [122]:
def exponential_mechanism(df, epsilon):
    counts = df['Education'].value_counts()
    keys = counts.keys().tolist()
    utilities = counts.values
    sensitivity = 1.0 / len(keys)  # 1/n sensitivity

    # Handle large numbers to avoid overflow
    max_utility = np.max(utilities)
    scaled_utilities = utilities - max_utility

    # Calculate probabilities
    try:
        probabilities = np.exp((epsilon * scaled_utilities) / (2 * sensitivity))
    except OverflowError:
        probabilities = np.zeros_like(utilities)

    # Handle NaN and Inf
    probabilities[np.isnan(probabilities)] = 1e-10
    probabilities[np.isinf(probabilities)] = 1e+10

    # Normalize probabilities
    sum_probabilities = np.sum(probabilities)
    if sum_probabilities == 0:
        sum_probabilities = 1e-10
    probabilities /= sum_probabilities

    return np.random.choice(keys, p=probabilities)

# For ε = 0.5
epsilon = 0.5

# Generate 1,000 random results for each dataset
n = 1000
original_results = [exponential_mechanism(df, epsilon) for _ in range(n)]
df1_results = [exponential_mechanism(df1, epsilon) for _ in range(n)]
df2_results = [exponential_mechanism(df2, epsilon) for _ in range(n)]
df3_results = [exponential_mechanism(df3, epsilon) for _ in range(n)]

In [123]:
epsilon = 0.5  # epsilon value for differential privacy
n = 1000  # Number of random results to generate

# Generate 1,000 random results for the original dataset
original_results = [exponential_mechanism(df, epsilon) for _ in range(n)]

# Remove a record with the most frequent 'Education'
most_frequent_education = df['Education'].value_counts().idxmax()
df1 = df[df['Education'] != most_frequent_education].copy()
df1_results = [exponential_mechanism(df1, epsilon) for _ in range(n)]

# Remove a record with the second most frequent 'Education'
second_most_frequent_education = df['Education'].value_counts().index[1]
df2 = df[df['Education'] != second_most_frequent_education].copy()
df2_results = [exponential_mechanism(df2, epsilon) for _ in range(n)]

# Remove a record with the least frequent 'Education'
least_frequent_education = df['Education'].value_counts().idxmin()
df3 = df[df['Education'] != least_frequent_education].copy()
df3_results = [exponential_mechanism(df3, epsilon) for _ in range(n)]


In [124]:
def check_epsilon_indistinguishability(results1, results2, epsilon):
    # Count frequency of each unique value in both result sets
    unique1, counts1 = np.unique(results1, return_counts=True)
    unique2, counts2 = np.unique(results2, return_counts=True)

    # Create a dictionary for easier lookup
    freq_dict1 = dict(zip(unique1, counts1))
    freq_dict2 = dict(zip(unique2, counts2))

    # Calculate probabilities
    total1 = np.sum(counts1)
    total2 = np.sum(counts2)

    for key in set(unique1).union(unique2):
        prob1 = freq_dict1.get(key, 0) / total1
        prob2 = freq_dict2.get(key, 0) / total2

        # Check if the ratios are bounded by e^epsilon and e^-epsilon
        if prob1 == 0 or prob2 == 0:
            continue  # Ignore zero probabilities

        ratio = prob1 / prob2
        if ratio > np.exp(epsilon) or ratio < np.exp(-epsilon):
            print(f"Failed indistinguishability check for value {key}")
            return False

    print("Passed indistinguishability check")
    return True


In [125]:
epsilon = 0.5  # Set epsilon to 0.5 for part (b)

check_epsilon_indistinguishability(original_results, df1_results, epsilon)
check_epsilon_indistinguishability(original_results, df2_results, epsilon)
check_epsilon_indistinguishability(original_results, df3_results, epsilon)

Passed indistinguishability check
Passed indistinguishability check
Passed indistinguishability check


True

In [126]:
epsilon = 1.0  # New epsilon value
n = 1000  # Number of samples

original_results_1 = [exponential_mechanism(df, epsilon) for _ in range(n)]
df1_results_1 = [exponential_mechanism(df1, epsilon) for _ in range(n)]
df2_results_1 = [exponential_mechanism(df2, epsilon) for _ in range(n)]
df3_results_1 = [exponential_mechanism(df3, epsilon) for _ in range(n)]


In [127]:
epsilon = 1.0

check_epsilon_indistinguishability(original_results_1, df1_results_1, epsilon)
check_epsilon_indistinguishability(original_results_1, df2_results_1, epsilon)
check_epsilon_indistinguishability(original_results_1, df3_results_1, epsilon)

Passed indistinguishability check
Passed indistinguishability check
Passed indistinguishability check


True

In [128]:
# Compute Jensen-Shannon divergence
def jensen_shannon_divergence(p, q):
    p = np.array(p)
    q = np.array(q)
    m = 0.5 * (p + q)
    return 0.5 * (entropy(p, m) + entropy(q, m))

# Calculate empirical probabilities
def empirical_probabilities(results, num_unique_values):
    counts = np.bincount(results)
    return counts / len(results)

In [129]:
combined_results = pd.DataFrame({
    'Original_ε=0.5': original_results,
    'Modified_most_ε=0.5': df1_results,
    'Modified_second_ε=0.5': df2_results,
    'Modified_least_ε=0.5': df3_results,
    'Original_ε=1': original_results_1,
    'Modified_most_ε=1': df1_results_1,
    'Modified_second_ε=1': df2_results_1,
    'Modified_least_ε=1': df3_results_1
})
# Export the combined_results DataFrame to a CSV file
combined_results.to_csv('combined_results.csv', index=False)
