# Privacy-Preserving Record Linkage Using Bloom Filters and Similarity Threshold-Based Classification

In [1]:
# Imports
#
import time
import math
from BF import BF #import the BF module
from PPRL import Link #import the PPRL module

In [106]:
# Record the wall-clock start; the runtime cell further down subtracts this
# value from a later time.time() reading to report the elapsed seconds.
start_time = time.time()

In [107]:
# Hyper-parameters of the linkage run -- edit the values here to fine-tune it.
# Unless stated otherwise, the meanings below are inferred from the names;
# confirm them against PPRL.Link.
BF_length = 1000            # Bloom filter length (also used in the false-positive-rate formula)
BF_num_hash = 10            # number of hash functions (also used in the false-positive-rate formula)
BF_q_gram = 2               # presumably the q-gram size used when encoding values
min_sim_val = 0.8           # presumably the minimum similarity for a pair to be classified as a match
link_attrs = [1, 2, 3, 4]   # presumably the column indices compared during linkage
block_attrs = [2, 4]        # presumably the column indices used to build blocks
ent_index = 0               # presumably the column holding the entity identifier
epsilon = 7                 # printed further down as the privacy budget

# Link receives the settings positionally, in exactly this order.
link = Link(
    BF_length,
    BF_num_hash,
    BF_q_gram,
    min_sim_val,
    link_attrs,
    block_attrs,
    ent_index,
    epsilon,
)

In [108]:
# Load the first dataset; change the path below to link a different file.
db1 = link.read_database(
    'Datasets/Alice_numrec_100_corr_50.csv'
)
#print(db1)

Load data file: Datasets/Alice_numrec_100_corr_50.csv
Read 100 records


In [109]:
# Load the second dataset; change the path below to link a different file.
db2 = link.read_database(
    'Datasets/Bob_numrec_100_corr_50.csv'
)
#print(db2)

Load data file: Datasets/Bob_numrec_100_corr_50.csv
Read 100 records


In [110]:
# Apply blocking: build one block index per dataset. Both indexes are later
# passed to the matching calls together with the records / Bloom filters.
# NOTE(review): the saved output of this cell reports blocking on attributes
# [2], while block_attrs is set to [2,4] in the configuration cell -- confirm
# whether Link uses only part of block_attrs or the saved output is stale.
blk_ind1 = link.build_BI(db1)
blk_ind2 = link.build_BI(db2)

Build Block Index for attributes: [2]
Generate 97 blocks
Build Block Index for attributes: [2]
Generate 90 blocks


In [111]:
# Encode the records of each dataset into Bloom filters. data_encode returns a
# pair: the encoded filters and a collection of values (presumably the q-grams
# that were hashed into them -- confirm against PPRL.Link.data_encode).
bf_dict1, all_val_set1 = link.data_encode(db1)
bf_dict2, all_val_set2 = link.data_encode(db2)

# Estimate the false positive rate of the Bloom filter encoding with the
# standard approximation
#     fpr = (1 - exp(-k * n / m)) ** k
# where k = BF_num_hash, m = BF_length and n = number of distinct values.
# NOTE(review): n is pooled over BOTH datasets; if every record is encoded
# into its own filter, a per-record value count may be the intended n -- confirm.
all_val_set = all_val_set1 + all_val_set2
total_all_val_set = set(all_val_set)
num_total_all_val_set = len(total_all_val_set)

# math.exp(x) replaces math.e ** x: the Python documentation notes that it is
# usually more accurate.
fpr = (1 - math.exp(-BF_num_hash * num_total_all_val_set / BF_length)) ** BF_num_hash
print(fpr)

0.9346272479320712


In [112]:
# Add bit-level differential privacy noise to the Bloom filters of each dataset.
# bf_dict1 / bf_dict2 are reused further down by the baseline without DP, so
# this relies on add_DP_noise returning new filters instead of modifying its
# argument -- NOTE(review): confirm against PPRL.Link.add_DP_noise.
# NOTE(review): if the noise is drawn from an unseeded random generator, the
# match counts and quality figures will vary between runs -- confirm.
pbf_dict1 = link.add_DP_noise(bf_dict1)
pbf_dict2 = link.add_DP_noise(bf_dict2)

In [113]:
# Match and link the noise-protected Bloom filters of the two datasets,
# using the block indexes built earlier.
matches = link.match(
    blk_ind1,
    blk_ind2,
    pbf_dict1,
    pbf_dict2,
)

number of common blocks: 52
Number of matching pairs: 32


In [114]:
# Evaluate runtime: wall-clock seconds elapsed since start_time was recorded.
# Despite its name, end_time holds a duration, not a timestamp. Everything
# executed between the two cells is included, so the figure is only meaningful
# when the notebook is run top to bottom without pauses.
end_time = time.time() - start_time
print('Total time in seconds:', end_time)

Total time in seconds: 5.520123720169067


In [115]:
# Evaluate the linkage quality of the PPRL run (Bloom filters with DP noise),
# then report the two privacy indicators: the Bloom filter false positive rate
# and the privacy budget epsilon.
print('Linkage quality of PPRL')
prec, rec, f1 = link.evaluate(matches, db1, db2)
print(
    'Probable Privacy guarantees:',
    'false positive rate of Bloom filters (larger better) - ',
    fpr,
)
print(
    'Provable Privacy guarantees:',
    'Privacy budget (smaller better) - ',
    epsilon,
)

Linkage quality of PPRL
Precision:  1.0
Recall:  0.64
F1 score:  0.7804878048780487
Probable Privacy guarantees: false positive rate of Bloom filters (larger better) -  0.9346272479320712
Provable Privacy guarantees: Privacy budget (smaller better) -  7


In [116]:
# Baseline 1: match and link the two datasets with non-privacy-preserving
# record linkage, i.e. on the records themselves (db1, db2) rather than on
# Bloom filters.
matches_npp = link.match_npp(
    blk_ind1,
    blk_ind2,
    db1,
    db2,
)

number of common blocks: 52
Number of matching pairs: 48


In [117]:
# Baseline 2: match and link the Bloom filters of the two datasets without DP
# guarantees, i.e. using bf_dict1 / bf_dict2 as produced by data_encode.
matches_nodp = link.match(
    blk_ind1,
    blk_ind2,
    bf_dict1,
    bf_dict2,
)

number of common blocks: 52
Number of matching pairs: 47


In [118]:
# Evaluate the linkage quality of baseline 1 (non-privacy-preserving record
# linkage); this method offers no privacy guarantees.
print('Linkage quality of non-PPRL')
prec_b1, rec_b1, f1_b1 = link.evaluate(matches_npp, db1, db2)
print('Privacy guarantees:', 'None')

Linkage quality of non-PPRL
Precision:  1.0
Recall:  0.96
F1 score:  0.9795918367346939
Privacy guarantees: None


In [119]:
# Evaluate the linkage quality of baseline 2 (Bloom filter PPRL without
# differential privacy): only the false positive rate is reported as a privacy
# indicator, since no privacy budget applies.
print('Linkage quality of PPRL without DP')
prec_b2, rec_b2, f1_b2 = link.evaluate(matches_nodp, db1, db2)
print(
    'Probable Privacy guarantees:',
    'false positive rate of Bloom filters (larger better) - ',
    fpr,
)
print('Provable Privacy guarantees:', 'None')

Linkage quality of PPRL without DP
Precision:  1.0
Recall:  0.94
F1 score:  0.9690721649484536
Probable Privacy guarantees: false positive rate of Bloom filters (larger better) -  0.9346272479320712
Provable Privacy guarantees: None
