# Privacy-Preserving Record Linkage using Bloom filters and Similarity threshold-based classification

In [None]:
import time
import math

# lowercase, because pip installed 'bf', not 'BF'
from bf import BF    

# this is your local file 'PPRL.py'
from PPRL import Link


ModuleNotFoundError: No module named 'bitarray'

In [None]:
# Record the wall-clock start of the run; the "Evaluate runtime" cell further
# down subtracts this from time.time() to report the total elapsed seconds.
start_time = time.time()

In [None]:
# Hyper-parameters of the linkage run -- edit the values in this cell to tune it.
# NOTE(review): the meanings below are inferred from the variable names and
# from how the values are used elsewhere in this notebook; confirm them
# against the Link class in PPRL.py.
BF_length = 1000          # m in the false-positive-rate formula (presumably filter length in bits)
BF_num_hash = 10          # k in the false-positive-rate formula (presumably hash functions per value)
BF_q_gram = 2             # presumably the q-gram length used to tokenise attribute values
min_sim_val = 0.8         # presumably the minimum similarity for a pair to be classified as a match
link_attrs = [1, 2, 3, 4]  # presumably column indices of the attributes that get encoded/compared
block_attrs = [2, 4]      # presumably column indices used to build the blocking index
ent_index = 0             # presumably the column index of the record/entity identifier
epsilon = 7               # reported later as the privacy budget (smaller is better)

# Single Link instance shared by every following cell. Arguments are passed
# positionally, so their order must match Link's constructor.
link = Link(
    BF_length,
    BF_num_hash,
    BF_q_gram,
    min_sim_val,
    link_attrs,
    block_attrs,
    ent_index,
    epsilon,
)

In [None]:
# Load the first dataset (Alice's CSV) through the Link instance.
# The path is relative to the notebook's working directory; point it at your
# own file if the CSVs live elsewhere.
db1 = link.read_database('../csv_files/Alice_numrec_100_corr_50.csv')
# print(db1)  # uncomment to inspect what was loaded

In [None]:
# Load the second dataset (Bob's CSV) through the Link instance.
# The path is relative to the notebook's working directory; point it at your
# own file if the CSVs live elsewhere.
db2 = link.read_database('../csv_files/Bob_numrec_100_corr_50.csv')
# print(db2)  # uncomment to inspect what was loaded

In [None]:
# Apply blocking: build one blocking index per dataset with link.build_BI.
# Both indexes are passed to every matching call below (the PPRL run and the
# two baselines), so they only need to be built once.
blk_ind1 = link.build_BI(db1)
blk_ind2 = link.build_BI(db2)

In [None]:
# Encode the records of each dataset into Bloom filters. data_encode returns
# a pair: the per-dataset Bloom filter dictionary and a second collection
# (presumably the values that were hashed into the filters -- confirm in
# PPRL.py).
bf_dict1, all_val_set1 = link.data_encode(db1)
bf_dict2, all_val_set2 = link.data_encode(db2)

# Count the distinct encoded values across both datasets. The `+` below
# concatenates, so both collections must be sequences of the same type
# (e.g. lists); duplicates are removed by the set() conversion.
all_val_set = all_val_set1 + all_val_set2
total_all_val_set = set(all_val_set)
num_total_all_val_set = len(total_all_val_set)

# Theoretical Bloom filter false positive rate:
#     fpr = (1 - e^(-k * n / m)) ** k
# with k = BF_num_hash, n = number of distinct values, m = BF_length.
# math.exp(x) is used rather than math.e ** x because it is the dedicated
# (and usually more accurate) way to evaluate e**x.
fpr = (1 - math.exp(-BF_num_hash * num_total_all_val_set / BF_length)) ** BF_num_hash
print(fpr)

In [None]:
# Add bit-level differential privacy noise to the Bloom filters of each dataset.
# NOTE(review): the "without DP" baseline further down matches on bf_dict1 and
# bf_dict2 again, which assumes add_DP_noise returns new filters and does not
# modify its argument in place -- confirm in PPRL.py.
pbf_dict1 = link.add_DP_noise(bf_dict1)
pbf_dict2 = link.add_DP_noise(bf_dict2)

In [None]:
# Match and link the noise-added Bloom filters of the two datasets, using the
# blocking indexes built earlier. The result is scored in the
# "Evaluate linkage quality" cell below.
matches = link.match(blk_ind1,blk_ind2,pbf_dict1,pbf_dict2)

In [None]:
# Evaluate runtime of the PPRL run (everything executed since start_time was set).
# NOTE: despite its name, end_time holds the elapsed duration in seconds, not
# a timestamp. Because time.time() is wall-clock time, any pause between
# running the cells interactively is included in the figure; the baseline
# cells below run after this point and are not.
end_time = time.time() - start_time
print('Total time in seconds:', end_time)

In [None]:
# Evaluate linkage quality of the PPRL run (noise-added Bloom filters).
print('Linkage quality of PPRL')
# evaluate returns three values (presumably precision, recall and F1 score,
# going by the names they are unpacked into -- confirm in PPRL.py). They are
# only stored here, not printed by this cell.
prec, rec, f1 = link.evaluate(matches,db1,db2)
# Report the two privacy indicators of this run: the Bloom filter false
# positive rate computed in the encoding cell and the privacy budget epsilon.
print('Probable Privacy guarantees:', 'false positive rate of Bloom filters (larger better) - ', fpr)
print('Provable Privacy guarantees:', 'Privacy budget (smaller better) - ', epsilon)

In [None]:
# Baseline 1: match and link the two datasets with non-privacy-preserving
# record linkage. match_npp is given the raw databases (db1, db2) instead of
# Bloom filters, together with the same blocking indexes as the PPRL run.
matches_npp = link.match_npp(blk_ind1,blk_ind2,db1,db2)

In [None]:
# Baseline 2: match and link the Bloom filters of the two datasets without
# differential privacy guarantees, i.e. the same link.match call as the PPRL
# run but on bf_dict1/bf_dict2 rather than the noise-added pbf_dict1/pbf_dict2.
matches_nodp = link.match(blk_ind1,blk_ind2,bf_dict1,bf_dict2)

In [None]:
# Evaluate linkage quality of the non-privacy-preserving baseline (Baseline 1).
print('Linkage quality of non-PPRL')
# Same evaluate call as for the PPRL run, stored under *_b1 names so the PPRL
# results in prec/rec/f1 are not overwritten.
prec_b1, rec_b1, f1_b1 = link.evaluate(matches_npp,db1,db2)
# This baseline compares the raw records, so there is no privacy guarantee to report.
print('Privacy guarantees:', 'None')

In [None]:
# Evaluate linkage quality of privacy-preserving record linkage without
# differential privacy guarantees (Baseline 2).
print('Linkage quality of PPRL without DP')
# Same evaluate call as for the PPRL run, stored under *_b2 names so earlier
# results are not overwritten.
prec_b2, rec_b2, f1_b2 = link.evaluate(matches_nodp,db1,db2)
# The Bloom filter encoding is unchanged, so the same false positive rate is
# reported; no DP noise was added, hence no privacy budget.
print('Probable Privacy guarantees:', 'false positive rate of Bloom filters (larger better) - ', fpr)
print('Provable Privacy guarantees:', 'None')