# Privacy-Preserving Record Linkage (PPRL)

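This notebook links two datasets using Bloom filter-based privacy-preserving record linkage (PPRL) with differential privacy, compares the result against non-private and non-DP baselines, and measures how the main parameters affect precision, recall and F1 score.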

# CHANGE 1: Import Dependencies

In [None]:
import time
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from BF import BF
from PPRL import Link

# CHANGE 2: Configure Parameters for Linkage

**Modify these values to tune the linkage:**

In [None]:
# TUNABLE SETTINGS: edit these values to tune linkage accuracy.

# Bloom filter length (larger = more memory, fewer collisions)
BF_length = 1000
# Number of hash functions (affects false positive rate)
BF_num_hash = 10
# Q-gram size for tokenization
BF_q_gram = 2
# Similarity threshold for matching (0-1)
min_sim_val = 0.8
# Attributes to use for linkage
link_attrs = [1, 2, 3, 4]
# Attributes to use for blocking
block_attrs = [2, 4]
# Entity ID column index
ent_index = 0
# Privacy budget (lower = more privacy, less accuracy)
epsilon = 7

# Link takes its settings positionally, so the argument order below matters.
link = Link(
    BF_length,
    BF_num_hash,
    BF_q_gram,
    min_sim_val,
    link_attrs,
    block_attrs,
    ent_index,
    epsilon,
)

# CHANGE 3: Load Datasets

**Modify dataset paths to test different data:**

In [None]:
# Input CSV files for the two parties; change these paths to use other datasets.
dataset1_path = '../csv_files/Alice_numrec_100_corr_50.csv'
dataset2_path = '../csv_files/Bob_numrec_100_corr_50.csv'

# Read both files through the configured Link instance (dataset 1 first).
db1, db2 = [
    link.read_database(csv_path)
    for csv_path in (dataset1_path, dataset2_path)
]

# CHANGE 4: PPRL Workflow

In [None]:
# Time the whole private pipeline with a monotonic clock: unlike time.time(),
# perf_counter() cannot jump when the system clock is adjusted.
start_time = time.perf_counter()

# Step 1: build a blocking index for each database.
blk_ind1 = link.build_BI(db1)
blk_ind2 = link.build_BI(db2)

# Step 2: encode each database. data_encode returns the encodings together
# with a collection of values that is only used for the estimate below.
bf_dict1, all_val_set1 = link.data_encode(db1)
bf_dict2, all_val_set2 = link.data_encode(db2)

# Step 3: estimate the Bloom filter false positive rate as
#     fpr = (1 - exp(-k * n / m)) ** k
# with k = BF_num_hash, m = BF_length and n = number of distinct values
# collected from both databases.
all_val_set = all_val_set1 + all_val_set2
total_all_val_set = set(all_val_set)
num_total_all_val_set = len(total_all_val_set)
fpr = (1 - math.exp(-BF_num_hash * num_total_all_val_set / BF_length)) ** BF_num_hash

# Step 4: apply the differential-privacy step to both sets of encodings.
bf_dict1_dp = link.dp_bloom_filters(bf_dict1)
bf_dict2_dp = link.dp_bloom_filters(bf_dict2)

# Step 5: match the two databases using the DP encodings.
matches = link.match(blk_ind1, blk_ind2, bf_dict1_dp, bf_dict2_dp)

# NOTE: despite its name, end_time holds the elapsed time in seconds.
end_time = time.perf_counter() - start_time

# CHANGE 5: Evaluate PPRL Performance

In [None]:
# Print the section header before evaluating, so anything evaluate() reports
# appears underneath it.
print('=== PPRL with Differential Privacy ===')
prec, rec, f1 = link.evaluate(matches, db1, db2)

# Remaining summary lines, one per output line.
print(
    f'False Positive Rate: {fpr:.4f}',
    f'Privacy Budget (ε): {epsilon}',
    f'Runtime: {end_time:.2f} seconds',
    sep='\n',
)

# CHANGE 6: Baseline Comparisons

In [None]:
# Baseline 1: match on the raw databases with match_npp (no Bloom filter
# encodings are passed in), reusing the same blocking indexes.
matches_npp = link.match_npp(blk_ind1, blk_ind2, db1, db2)
# The header is printed before evaluate() so its output is grouped under it.
print('\n=== Non-Privacy-Preserving Baseline ===')
prec_b1, rec_b1, f1_b1 = link.evaluate(matches_npp, db1, db2)

# Baseline 2: same matcher as the main workflow, but fed the encodings that
# were not passed through dp_bloom_filters.
# NOTE(review): this assumes dp_bloom_filters returned new dictionaries and
# left bf_dict1 / bf_dict2 unmodified; if it mutates its argument in place,
# this baseline is not actually noise-free. Confirm against PPRL.Link.
matches_nodp = link.match(blk_ind1, blk_ind2, bf_dict1, bf_dict2)
print('\n=== PPRL without Differential Privacy ===')
prec_b2, rec_b2, f1_b2 = link.evaluate(matches_nodp, db1, db2)

# CHANGE 7: Parameter Comparison Experiments

In [None]:
def test_parameter_variations(datasets, param_name, param_values, base_params):
    """Run the DP linkage pipeline once per (dataset, parameter value) pair.

    For every dataset and every value in ``param_values`` a fresh ``Link`` is
    built from ``base_params`` with ``param_name`` overridden. Both databases
    are then read, indexed for blocking, encoded, passed through
    ``dp_bloom_filters``, matched and evaluated.

    Args:
        datasets: Mapping of dataset name to a ``(path1, path2)`` pair.
        param_name: Name of the ``Link`` setting to vary; must be one of the
            eight keys listed in ``link_keys`` below.
        param_values: Iterable of values to try for ``param_name``.
        base_params: Mapping supplying the ``Link`` settings that are not
            being varied. It is not modified.

    Returns:
        A ``pandas.DataFrame`` with one row per run and the columns
        ``dataset``, ``param_value``, ``precision``, ``recall`` and
        ``f1_score``. The columns are present even when no run was executed.

    Raises:
        ValueError: If ``param_name`` is not a setting that ``Link`` receives.
    """
    # Settings in the positional order expected by the Link constructor.
    link_keys = (
        'BF_length', 'BF_num_hash', 'BF_q_gram', 'min_sim_val',
        'link_attrs', 'block_attrs', 'ent_index', 'epsilon',
    )
    result_columns = ['dataset', 'param_value', 'precision', 'recall', 'f1_score']

    # An unknown name would be stored in the params dict but never reach Link,
    # so every run would silently use the baseline value.
    if param_name not in link_keys:
        raise ValueError(
            f'Unknown parameter {param_name!r}; expected one of {list(link_keys)}'
        )

    # Materialise once: a one-shot iterator would otherwise be exhausted after
    # the first dataset and later datasets would produce no rows.
    param_values = list(param_values)

    results = []
    for dataset_name, (path1, path2) in datasets.items():
        for param_val in param_values:
            # Shallow copy of the baseline with the single override applied.
            params = {**base_params, param_name: param_val}
            link_test = Link(*(params[key] for key in link_keys))

            db1_test = link_test.read_database(path1)
            db2_test = link_test.read_database(path2)

            blk_ind1_test = link_test.build_BI(db1_test)
            blk_ind2_test = link_test.build_BI(db2_test)

            # The second return value of data_encode is not needed here.
            bf_dict1_test, _ = link_test.data_encode(db1_test)
            bf_dict2_test, _ = link_test.data_encode(db2_test)

            bf_dict1_dp_test = link_test.dp_bloom_filters(bf_dict1_test)
            bf_dict2_dp_test = link_test.dp_bloom_filters(bf_dict2_test)

            matches_test = link_test.match(
                blk_ind1_test, blk_ind2_test, bf_dict1_dp_test, bf_dict2_dp_test
            )
            prec_test, rec_test, f1_test = link_test.evaluate(
                matches_test, db1_test, db2_test
            )

            results.append({
                'dataset': dataset_name,
                'param_value': param_val,
                'precision': prec_test,
                'recall': rec_test,
                'f1_score': f1_test,
            })

    return pd.DataFrame(results, columns=result_columns)

# Baseline Link settings; each experiment overrides exactly one of them.
base_params = dict(
    BF_length=1000,
    BF_num_hash=10,
    BF_q_gram=2,
    min_sim_val=0.8,
    link_attrs=[1, 2, 3, 4],
    block_attrs=[2, 4],
    ent_index=0,
    epsilon=7,
)

# Alice/Bob CSV pairs, keyed by '<number of records>_corr_<level>'.
datasets = {
    f'{num_rec}_corr_{corr}': (
        f'../csv_files/Alice_numrec_{num_rec}_corr_{corr}.csv',
        f'../csv_files/Bob_numrec_{num_rec}_corr_{corr}.csv',
    )
    for num_rec in (100, 500)
    for corr in (25, 50)
}

# CHANGE 8: Privacy Budget (Epsilon) Analysis

In [None]:
# Sweep the privacy budget while every other setting stays at its baseline.
epsilon_values = [1, 3, 5, 7, 10, 15]
epsilon_results = test_parameter_variations(datasets, 'epsilon', epsilon_values, base_params)

# One panel per metric: (results column, label used for the y-axis and title).
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
metric_panels = [('precision', 'Precision'), ('recall', 'Recall'), ('f1_score', 'F1 Score')]
for ax, (column, metric_label) in zip(axes, metric_panels):
    # One line per dataset, drawn in the order the datasets are declared.
    for dataset_name in datasets:
        subset = epsilon_results[epsilon_results['dataset'] == dataset_name]
        ax.plot(subset['param_value'], subset[column], marker='o', label=dataset_name)
    ax.set_xlabel('Privacy Budget (ε)')
    ax.set_ylabel(metric_label)
    ax.set_title(f'{metric_label} vs Privacy Budget')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('pprl_epsilon_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print('\nEpsilon Results Summary:')
print(epsilon_results.to_string())

# CHANGE 9: Similarity Threshold Analysis

In [None]:
# Sweep the similarity threshold while every other setting stays at its baseline.
threshold_values = [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]
threshold_results = test_parameter_variations(datasets, 'min_sim_val', threshold_values, base_params)

# One panel per metric: (results column, label used for the y-axis and title).
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
metric_panels = [('precision', 'Precision'), ('recall', 'Recall'), ('f1_score', 'F1 Score')]
for ax, (column, metric_label) in zip(axes, metric_panels):
    # One line per dataset, drawn in the order the datasets are declared.
    for dataset_name in datasets:
        subset = threshold_results[threshold_results['dataset'] == dataset_name]
        ax.plot(subset['param_value'], subset[column], marker='s', label=dataset_name)
    ax.set_xlabel('Similarity Threshold')
    ax.set_ylabel(metric_label)
    ax.set_title(f'{metric_label} vs Similarity Threshold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('pprl_threshold_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print('\nThreshold Results Summary:')
print(threshold_results.to_string())

# CHANGE 10: Bloom Filter Length Analysis

In [None]:
# Sweep the Bloom filter length while every other setting stays at its baseline.
bf_length_values = [500, 1000, 1500, 2000]
bf_length_results = test_parameter_variations(datasets, 'BF_length', bf_length_values, base_params)

# One panel per metric: (results column, label used for the y-axis and title).
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
metric_panels = [('precision', 'Precision'), ('recall', 'Recall'), ('f1_score', 'F1 Score')]
for ax, (column, metric_label) in zip(axes, metric_panels):
    # One line per dataset, drawn in the order the datasets are declared.
    for dataset_name in datasets:
        subset = bf_length_results[bf_length_results['dataset'] == dataset_name]
        ax.plot(subset['param_value'], subset[column], marker='^', label=dataset_name)
    ax.set_xlabel('Bloom Filter Length')
    ax.set_ylabel(metric_label)
    ax.set_title(f'{metric_label} vs BF Length')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('pprl_bf_length_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print('\nBF Length Results Summary:')
print(bf_length_results.to_string())

# CHANGE 11: Number of Hash Functions Analysis

In [None]:
# Sweep the number of hash functions while every other setting stays at its baseline.
num_hash_values = [5, 10, 15, 20]
num_hash_results = test_parameter_variations(datasets, 'BF_num_hash', num_hash_values, base_params)

# One panel per metric: (results column, label used for the y-axis and title).
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
metric_panels = [('precision', 'Precision'), ('recall', 'Recall'), ('f1_score', 'F1 Score')]
for ax, (column, metric_label) in zip(axes, metric_panels):
    # One line per dataset, drawn in the order the datasets are declared.
    for dataset_name in datasets:
        subset = num_hash_results[num_hash_results['dataset'] == dataset_name]
        ax.plot(subset['param_value'], subset[column], marker='d', label=dataset_name)
    ax.set_xlabel('Number of Hash Functions')
    ax.set_ylabel(metric_label)
    ax.set_title(f'{metric_label} vs Number of Hashes')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('pprl_num_hash_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print('\nNumber of Hash Functions Results Summary:')
print(num_hash_results.to_string())

# CHANGE 12: Combined Performance Comparison

In [None]:
# 2x2 overview of F1 score against each of the four swept parameters.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Each entry: (axis, sweep results, line marker, panel title, x-axis label).
f1_panels = [
    (axes[0, 0], epsilon_results, 'o', 'F1 Score vs Privacy Budget (ε)', 'Privacy Budget (ε)'),
    (axes[0, 1], threshold_results, 's', 'F1 Score vs Similarity Threshold', 'Similarity Threshold'),
    (axes[1, 0], bf_length_results, '^', 'F1 Score vs BF Length', 'Bloom Filter Length'),
    (axes[1, 1], num_hash_results, 'd', 'F1 Score vs Number of Hashes', 'Number of Hash Functions'),
]

for ax, sweep_results, marker_style, panel_title, x_label in f1_panels:
    # One line per dataset, drawn in the order the datasets are declared.
    for dataset_name in datasets:
        subset = sweep_results[sweep_results['dataset'] == dataset_name]
        ax.plot(subset['param_value'], subset['f1_score'], marker=marker_style, label=dataset_name)
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('F1 Score')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('pprl_combined_comparison.png', dpi=150, bbox_inches='tight')
plt.show()