# About
## v1.2 - Prevent set size reveal with padding!
This version improves on `v1.1 - Prevent man-in-the-middle attack with symmetric key encryption!` by having each client exchange a set of a pre-determined size that is larger than either client's original set.  
This is achieved by each client padding its set with invalid phone numbers, drawn from `range(10000000, 80000000)`, until the set reaches the pre-determined size.  
Detailed explanations about this version can be found in the README and the video.

In [1]:
# import libraries (non-cryptographic)
import random # to generate phone numbers
import pandas as pd

# import libraries (cryptographic)
import cryptography.hazmat.primitives.asymmetric.dh as dh
from cryptography.fernet import Fernet
import hashlib
import sympy
import secrets

# Generate phone numbers and store them in "phone_numbers_padded.csv"

In [2]:
# Seed the module-level `random` generator: the phone-number sampling and the
# shuffles below all draw from it, so a fixed seed makes every run identical.
random.seed(10) # to ensure same phone numbers generated every time

In [3]:
class PhoneNumberGenerator:
    """Callable that draws genuine phone numbers for grab and gojek.

    Genuine numbers are the integers in ``range(80000000, 100000000)``.
    """

    def __call__(self, count):
        """Return a list of ``count`` distinct genuine phone numbers.

        Sampling uses the module-level ``random`` state, so seeding ``random``
        beforehand makes the result reproducible.
        """
        valid_numbers = range(80000000, 100000000)
        return random.sample(valid_numbers, count)

class FakePhoneNumberGenerator:
    """Callable that draws invalid phone numbers used purely as padding.

    Padding values come from ``range(10000000, 80000000)``, which does not
    overlap the range used for genuine numbers.
    """

    def __call__(self, count):
        """Return a list of ``count`` distinct invalid (padding) numbers."""
        padding_range = range(10000000, 80000000)
        return random.sample(padding_range, count)
            
    
class PhoneNumberStorageManager:
    """Callable that saves both parties' phone numbers to a single CSV file."""

    def __init__(self):
        # Output path, resolved against the current working directory.
        self.filename = "phone_numbers_padded.csv"

    def __call__(self, gojek_phone_numbers, grab_phone_numbers):
        """Write the two lists side by side as columns "gojek" and "grab".

        Each list is wrapped in a ``pd.Series`` so that lists of different
        lengths can share one table; the shorter column is left empty where
        it has no value.
        """
        columns = {
            "gojek": pd.Series(gojek_phone_numbers),
            "grab": pd.Series(grab_phone_numbers),
        }
        table = pd.DataFrame(columns)
        table.to_csv(self.filename, index=False)

### Edit cell below to change phone number count and set size

In [4]:
# Size of the padded set each party exchanges. Both parties pad up to this
# value, so it must be at least as large as each party's phone number count
# (the padding count passed to random.sample is set_size minus that count).
set_size = 100 # both parties are to exchange sets of a pre-determined size

# Number of genuine phone numbers per party. The common count is included in
# both per-party counts, so it must not exceed either of them.
gojek_phone_number_count = 61 # inclusive of phone numbers in common with grab
grab_phone_number_count = 91 # inclusive of phone numbers in common with gojek
common_phone_number_count = 10



In [5]:
# instantiate required classes
phone_number_generator = PhoneNumberGenerator()
fake_phone_number_generator = FakePhoneNumberGenerator()
phone_number_storage_manager = PhoneNumberStorageManager()

# generate phone numbers
# One sample of distinct numbers is drawn and then partitioned:
#   [0:common]        -> numbers both parties hold
#   [0:gojek_count]   -> gojek's numbers (the common ones plus gojek-only ones)
#   [gojek_count:]    -> grab-only numbers, to which the common ones are prepended
phone_numbers = phone_number_generator(gojek_phone_number_count+grab_phone_number_count-common_phone_number_count)
common_phone_numbers = phone_numbers[0:common_phone_number_count]
gojek_phone_numbers = phone_numbers[0:gojek_phone_number_count]
grab_phone_numbers = common_phone_numbers + phone_numbers[gojek_phone_number_count:] 

# generate fake phone numbers
# Each party is padded up to set_size. The two samples are drawn independently
# from the same invalid range, so a fake number may appear in both sets;
# Client.get_intersection drops such matches by value.
gojek_fake_phone_numbers = fake_phone_number_generator(set_size-gojek_phone_number_count)
grab_fake_phone_numbers = fake_phone_number_generator(set_size-grab_phone_number_count)

# add fake phone numbers to phone numbers
gojek_padded_set = gojek_phone_numbers + gojek_fake_phone_numbers
grab_padded_set = grab_phone_numbers + grab_fake_phone_numbers

# shuffle phone number lists
# (in place) so the padding is not simply the tail of each list
random.shuffle(gojek_padded_set)
random.shuffle(grab_padded_set)

# write phone numbers (with fake ones) to csv file
phone_number_storage_manager(gojek_padded_set, grab_padded_set)

# Define the classes necessary for the algorithm

In [6]:
class NumberGenerator:
    """Source of the numbers the PSI protocol needs: safe prime and secrets."""

    def generate_public_parameters(self, size):
        """Create the shared prime modulus and the proper divisors of p - 1.

        Prints primality/length diagnostics for p and q along the way.

        Args:
            size: requested bit length of the prime modulus.

        Returns:
            A tuple ``(p, factors_pminus1)`` where ``factors_pminus1`` is
            ``[1, 2, q]`` with ``q = (p - 1) // 2``.
        """
        p = self.generate_safe_prime(size)
        print(f"p is prime: {sympy.ntheory.isprime(p)}")
        length_of_p = p.bit_length()
        print(f"Length of prime modulus, p: {length_of_p}.\nNote: Should be {size}.")
        q = (p - 1) // 2
        print(f"q is prime: {sympy.ntheory.isprime(q)}")
        # p - 1 == 2q with q prime, so its divisors other than itself are 1, 2
        # and q. They are the candidate orders tested (Lagrange's theorem) when
        # checking whether an element is a primitive generator.
        return p, [1, 2, q]

    def generate_safe_prime(self, size):
        """Return a prime p of ``size`` bits for which (p - 1) / 2 is prime.

        The Diffie-Hellman parameter generator is expected to yield a safe
        prime already; the loop only double-checks that and draws again
        (printing the new candidate) if the check ever fails.
        """
        candidate = dh.generate_parameters(2, size).parameter_numbers().p
        while not sympy.ntheory.isprime((candidate - 1) // 2):
            candidate = dh.generate_parameters(2, size).parameter_numbers().p
            print(candidate)
        return candidate

    def generate_random_number(self, size):
        """Return a cryptographically random integer of at most ``size`` bits.

        Used as a client's secret exponent.
        """
        secret = secrets.randbits(size)
        return secret
            
class NumberInspector:
    """Number-theoretic checks used by the PSI protocol."""

    def check_is_primitive_generator(self, candidate, factors_divisorminus1, divisor):
        """Tell whether ``candidate`` is a primitive generator modulo ``divisor``.

        By Lagrange's theorem the order of an element divides ``divisor - 1``
        (for a prime ``divisor``). If ``candidate`` raised to every proper
        divisor of ``divisor - 1`` is different from 1, its order must be
        ``divisor - 1`` itself, i.e. it generates the whole group.

        Args:
            candidate: integer to test.
            factors_divisorminus1: the divisors of ``divisor - 1`` excluding
                ``divisor - 1`` itself (e.g. ``[1, 2, q]`` when
                ``divisor - 1 == 2q`` with ``q`` prime).
            divisor: the (prime) modulus.

        Returns:
            True if ``candidate`` is a primitive generator, False otherwise.
        """
        # A multiple of the modulus is congruent to 0 and has no multiplicative
        # order at all. Its powers are 0 (never 1), so without this guard the
        # test below would wrongly accept it.
        if candidate % divisor == 0:
            return False

        # pow(x, y, z) is fast modular exponentiation on arbitrary-precision
        # integers, so no overflow can occur.
        return all(
            pow(candidate, possible_order, divisor) != 1
            for possible_order in factors_divisorminus1
        )

class StorageManager:
    """Writes the data a party wants to share into its designated CSV file."""

    def store_data(self, filename, data):
        """Save ``data`` (a mapping of column name -> values) to ``filename``.

        Every column is converted to a string-typed ``pd.Series`` so columns
        of different lengths (or with no values yet) can share one table.
        """
        columns = {
            column_name: pd.Series(values, dtype="str")
            for column_name, values in data.items()
        }
        pd.DataFrame(columns).to_csv(filename, index=False)


# Create psi client class

In [7]:
class Client:
    # client class (both grab and gojek are clients communicating directly with each other)

    def __init__(self, name, other_party_name, private_key_size, phone_numbers, p, factors_pminus1, fernet_key):
        """Set up one PSI party and create its (initially empty) data file.

        Args:
            name: this party's name; also the prefix of its data file.
            other_party_name: the peer's name; prefix of the file read from.
            private_key_size: bit length of the secret exponent to generate.
            phone_numbers: this party's (padded) set of phone numbers.
            p: shared prime modulus.
            factors_pminus1: divisors of p - 1, used to test whether a hashed
                phone number is a primitive generator.
            fernet_key: symmetric key shared by both parties.
        """
        # helpers
        self.number_inspector = NumberInspector()
        self.number_generator = NumberGenerator()

        # exponentiation-based encryption: secret exponent, own set, modulus
        self.private_key = self.number_generator.generate_random_number(private_key_size)
        self.my_set = phone_numbers
        self.p = p
        self.factors_pminus1 = factors_pminus1

        # symmetric encryption of everything written to the shared files
        self.fernet_key = fernet_key
        self.f = Fernet(fernet_key)

        # protocol state, filled in step by step
        self.my_hashed_set = None              # h(x)
        self.my_self_encrypted_set = None      # (h(x)^(my_secret))modp
        self.my_encrypted_set = None           # (h(x)^(my_secret)(other_party_secret))modp
        self.other_party_encrypted_set = None  # (h(y)^(my_secret)(other_party_secret))modp
        self.common_values = None              # common phone numbers

        # Columns of the file used to talk to the other party. Only the two
        # encrypted sets need sharing; common_values lets both sides confirm
        # they computed the same intersection.
        self.my_dict = dict.fromkeys(
            ('my_self_encrypted_set', 'other_party_encrypted_set', 'common_values')
        )

        # file names: "<party>_data_v1.2.csv" for each side
        file_suffix = "_data_v1.2.csv"
        self.name = name
        self.filename = name + file_suffix
        self.other_party_name = other_party_name
        self.other_party_filename = other_party_name + file_suffix

        # create this party's file with the (still empty) columns
        self.storage_manager = StorageManager()
        self.storage_manager.store_data(self.filename, self.my_dict)

    def hash_to_primitive_root_modulo_p(self, element): 
        # method to hash phone numbers to primitive root modulo p i.e. primitive generator

        endian = "big"
        element = element.to_bytes(4, endian)
        hash_hex = hashlib.sha256(element).hexdigest() # sha3_256
        hash_int = int(hash_hex, 16)
        while True:
            # repeatedly hash until primitive root modulo p is obtained
            is_primitive_generator = self.number_inspector.check_is_primitive_generator(
                hash_int, self.factors_pminus1, self.p
            )
            if (is_primitive_generator):
                break
            else:
                hash_int = hash_int.to_bytes(32, endian)
                hash_hex = hashlib.sha256(hash_int).hexdigest()
                hash_int = int(hash_hex, 16)
                
        return hash_int
    
    def modular_exponentation(self, element):
        # compute (element^(private_key))modp
    
        return pow(element, self.private_key, self.p)
    
    def hash_set(self):
        # hash all phone numbers in my set to primitive root modulo p, one by one
        
        self.my_hashed_set = []
        
        for element in self.my_set:
            hashed_value = self.hash_to_primitive_root_modulo_p(element)
            self.my_hashed_set.append(hashed_value)
            
    def encrypt_set(self, is_other_party):
        # encrypt all elements in a given set using private_key, one by one
        
        # two scenarios to consider
        # one: encrypt set sent by the other party
        if (is_other_party):
            decrypted_other_party_set = self.receive_data("my_self_encrypted_set")
            other_party_set_int = []
            for element_string in decrypted_other_party_set:
                other_party_set_int.append(int(element_string))
            set_to_encrypt = other_party_set_int
        # two: encrypt my own set
        else:
            set_to_encrypt = self.my_hashed_set
        
        # encrypt values in given set, one by one
        encrypted_values = []
        for element in set_to_encrypt:
            encrypted_value = self.modular_exponentation(element)
            encrypted_values.append(encrypted_value)
            
        # assign the encrypted set to the correct variable
        # update csv file used for communication
        if (is_other_party):
            self.other_party_encrypted_set = encrypted_values
            self.send_data(encrypted_values, "other_party_encrypted_set")
        else:
            self.my_self_encrypted_set = encrypted_values
            self.send_data(encrypted_values, "my_self_encrypted_set")
                    
        
    def get_intersection(self):
        
        # get intersection
        
        my_encrypted_set = self.receive_data("other_party_encrypted_set") # read my encrypted set from the other party's file
        my_encrypted_set_int = []
        
        # convert read values to integer
        for element in my_encrypted_set:
            my_encrypted_set_int.append(int(element))
            
        # assign to correct variable
        self.my_encrypted_set = my_encrypted_set_int
        
        # get intersection
        encrypted_common_values = set(self.my_encrypted_set).intersection(self.other_party_encrypted_set)
        index_of_common_values = []
        
        # find the index of the elements in the intersection in my_encrypted_set
        for element in encrypted_common_values:
            index_of_common_values.append(self.my_encrypted_set.index(element))
            
        self.common_values = []
        
        # find the values in my own set corresponding to the index of the elements in the intersection
        for index in index_of_common_values:
            potential_phone_number = self.my_set[index]
            
            if ((potential_phone_number-79999999)>0): # remove any potential fake numbers that intersect
                self.common_values.append(potential_phone_number)
        
        # update csv file for communication
        self.send_data(self.common_values, "common_values")
    
    def encrypt_data(self, plaintext):
        # encrypt data with Fernet
        
        endian = "big"
        element = plaintext.to_bytes(128, endian) # 1024 bits == 128 bytes
        cipher_text = self.f.encrypt(element)
        return cipher_text
    
    def decrypt_data(self, ciphertext):
        # decrypt data encrypted by Fernet
        
        ciphertext_bytes = ciphertext.encode('utf-8')[2:-1] # convert from string back to bytes
        endian = "big"
        element_in_bytes = self.f.decrypt(ciphertext_bytes)
        plaintext = int.from_bytes(element_in_bytes, endian)
        return plaintext

    def send_data(self, data_to_send, column_name):
        # send data means writing to file. encrypt data with Fernet
        
        # encrypt data
        encrypted_data_to_send = []
        for element in data_to_send:
            encrypted_element = self.encrypt_data(element)
            encrypted_data_to_send.append(encrypted_element)
            
        # send data
        self.my_dict[column_name] = encrypted_data_to_send
        self.storage_manager.store_data(self.filename, self.my_dict)
        
        
    def receive_data(self, column_name):
        # receive data means reading from file (my file). decrypt data encrypted by Fernet
        
        # receive data
        encrypted_data = self.get_other_party_data()[column_name].to_list()
        
        # decrypt data
        decrypted_data = []
        for element in encrypted_data:
            if type(element) is float: # remove NaN
                continue
            decrypted_element = self.decrypt_data(element)
            decrypted_data.append(decrypted_element)
        return decrypted_data
    
    def get_my_data(self):
        # read my file as dataframe (other party's file)
        
        return pd.read_csv(self.filename)
            
    def get_other_party_data(self):
        # read other party's file as dataframe
        
        return pd.read_csv(self.other_party_filename)
    
    


# Initialize context

### Edit cell below to change key size

In [8]:
# assign pre-determined variables for psi
# key_size (in bits) is passed both to generate_public_parameters, as the bit
# length of the prime modulus p, and to each Client, as the bit length of its
# randomly generated private exponent.
key_size = 1024 # both private keys and large prime

In [9]:
# create key for symmetric key cryptography
# A single Fernet key is generated here and handed to both clients, i.e. it
# plays the role of a pre-shared secret.
fernet_key = Fernet.generate_key()

# create public parameters required for psi
# p and the divisors of p - 1 are shared by both clients.
number_generator = NumberGenerator()
p, factors_pminus1 = number_generator.generate_public_parameters(key_size)

# create clients
# Each constructor call also draws that client's private key and creates its
# (initially empty) "<name>_data_v1.2.csv" file.
grab = Client("grab", "gojek", key_size, grab_padded_set, p, factors_pminus1, fernet_key)
gojek = Client("gojek", "grab", key_size, gojek_padded_set, p, factors_pminus1, fernet_key)



p is prime: True
Length of prime modulus, p: 1024.
Note: Should be 1024.
q is prime: True


# Get intersection

## Step 1: Hash phone numbers

In [10]:
# clients hash their own set
# The result is kept in each client's my_hashed_set attribute only; hash_set
# does not write anything to the shared files.
grab.hash_set()
gojek.hash_set()

### Clients' status after step 1 (value of variables in client)
Note: the hashed set is not stored in the file as it is not meant to be shared with the other party; hence, the files' status after step 1 is not shown.

In [11]:
# Show both clients' hashed sets side by side. Wrapping each list in a
# pd.Series lets columns of different lengths share one DataFrame.
d = {"gojek hashed set": gojek.my_hashed_set,
    "grab hashed set": grab.my_hashed_set}
df = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in d.items()]))
df

Unnamed: 0,gojek hashed set,grab hashed set
0,2768208393965946516720796860751640208381574662...,6083752070806636466132091408609415889437525399...
1,2971259318188054936238701069597951289131550069...,6555446118383117843414628061923602991736146568...
2,8972686745761283250343305795976962558216403891...,3062635494707074836246422469968877214990302591...
3,3931935781355731990509341551661525053343436962...,2835741223782226842638506406959946569416451500...
4,5239384073106902936597081070089405716452234004...,1959933681547377774404303441869569823311439691...
...,...,...
95,8697318885625964372170362596957150767783631535...,4848231873104288702523644346183711147463602694...
96,7960204644377876874342383407634690809702051347...,1087372183257775118984247408250577970017171312...
97,8828111265965186841295119336388642758526460672...,3948222474783499474687611776797431125471701320...
98,9196222794520213004035745078489116002064573260...,7018556158959834041117086565402060628499966280...


## Step 2: encrypt hashed set with own private key

In [12]:
# clients self encrypt hashed set
# Each client raises its hashed values to its own private key (mod p) and
# publishes the result in the "my_self_encrypted_set" column of its file.
grab.encrypt_set(False) # set is_other_party to false to encrypt own hashed set
gojek.encrypt_set(False)

### Clients' status after step 2 (value of variables in client)

In [13]:
# Show the hashed sets next to the self-encrypted sets produced in step 2.
# pd.Series wrapping lets columns of different lengths share one DataFrame.
d = {"gojek hashed set": gojek.my_hashed_set,
    "grab hashed set": grab.my_hashed_set,
    "gojek self-encrypted set": gojek.my_self_encrypted_set,
    "grab self-encrypted set": grab.my_self_encrypted_set}
df = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in d.items()]))
df

Unnamed: 0,gojek hashed set,grab hashed set,gojek self-encrypted set,grab self-encrypted set
0,2768208393965946516720796860751640208381574662...,6083752070806636466132091408609415889437525399...,9557256798444998184220275153824514335332385363...,1392352159919810814017120681618309248468361322...
1,2971259318188054936238701069597951289131550069...,6555446118383117843414628061923602991736146568...,1460451436827093985196414316894890537894937362...,4622428122136317708189314036339472078217807601...
2,8972686745761283250343305795976962558216403891...,3062635494707074836246422469968877214990302591...,3571188632407501134124104523088980239946688176...,5942908640927549251917941758078932952822195230...
3,3931935781355731990509341551661525053343436962...,2835741223782226842638506406959946569416451500...,1538229197935480455180260588412252506182615469...,7198280742294667504621057866290344241933761279...
4,5239384073106902936597081070089405716452234004...,1959933681547377774404303441869569823311439691...,8258474891767245356657215479687971229284999379...,1131009300071130670438802381425857068233711464...
...,...,...,...,...
95,8697318885625964372170362596957150767783631535...,4848231873104288702523644346183711147463602694...,6986696944471524792448963798207600583551551379...,3486909962302736219700328330295447882837700248...
96,7960204644377876874342383407634690809702051347...,1087372183257775118984247408250577970017171312...,8890085851683032426554197986412694846922630676...,3305471807821912777104979274305894203182716863...
97,8828111265965186841295119336388642758526460672...,3948222474783499474687611776797431125471701320...,6454280183340631564446477051026150589875456465...,2496872812736358130577346816495859375019331280...
98,9196222794520213004035745078489116002064573260...,7018556158959834041117086565402060628499966280...,1134129216613459646024895431301762076502499298...,1108072423141012822398588241842464975475759935...


### Files' status after step 2 (value of variables in file - clients' variables encrypted with Fernet)

In [14]:
# Re-read both clients' CSV files; the cells hold Fernet tokens, not the
# underlying integers. gojek's file is displayed here, grab's in the next cell.
df_gojek = gojek.get_my_data()
df_grab = grab.get_my_data()
print("gojek's file:")
df_gojek


gojek's file:


Unnamed: 0,my_self_encrypted_set,other_party_encrypted_set,common_values
0,b'gAAAAABhoySjZBvx0K1XG9vicqBEX9youI8dYa3aXSHw...,,
1,b'gAAAAABhoySjUx74pvRlDdTNrqMPvZjkU7salFkRflUV...,,
2,b'gAAAAABhoySjDSenxt2FZIm-GnAh3rli4V9r8HTS7oon...,,
3,b'gAAAAABhoySj-vCs5hXY-3WQjT4rynA7mxGhxrrUOFPz...,,
4,b'gAAAAABhoySjH6UUHkHL8Flu9SQFC1xF0THqmMEkUixq...,,
...,...,...,...
95,b'gAAAAABhoySj00CEOzvm6BLANslHqmaasMQeJ1IT7Ale...,,
96,b'gAAAAABhoySjYISDIOA0nUofTo8XAsuYTpPxSbhluPLH...,,
97,b'gAAAAABhoySjLa6erJRPgYIrItk0WLJd9rdOgWAoPQ7T...,,
98,b'gAAAAABhoySjggx8LDZFePyFUbTZjz5xn1r8vNXfyqEE...,,


In [15]:
# Display grab's file as loaded into df_grab by the previous cell.
print("grab's file:")
df_grab

grab's file:


Unnamed: 0,my_self_encrypted_set,other_party_encrypted_set,common_values
0,b'gAAAAABhoySjh9cWfH4PpH1rBOSdZ63PBsG1dKhVWdnq...,,
1,b'gAAAAABhoySj9qSCYeL_jdG-gmTFZZWwGAbOU2OoGLdc...,,
2,b'gAAAAABhoySjdCfDahrGziRK9FhuP0W32qaUxPfV7wsW...,,
3,b'gAAAAABhoySjHOX6YWxl8gW4_Xm3vroeWIy8dDs5148m...,,
4,b'gAAAAABhoySj9soRg2-JUmUxCHcQXZKL3tW5VW-A2FV-...,,
...,...,...,...
95,b'gAAAAABhoySjdqvdD2lWTTJoK8REPWLLIW4VXwkUEYf7...,,
96,b'gAAAAABhoySjGXIV2HeBUBWr5FUwfI9Bf43McNw-uRuK...,,
97,b'gAAAAABhoySjLSj-38cDr_p1Ghh39iKYEuFGy_LFgrkF...,,
98,b'gAAAAABhoySjJS2ZAKrfB2LNnsWvg9J76fs0egLOC3Zl...,,


## Step 3: encrypt other party's self-encrypted set with own private key


In [16]:
# clients encrypt other party's self encrypted set
# Each client reads the peer's "my_self_encrypted_set" column, raises every
# value to its own private key (mod p), keeps the result in
# other_party_encrypted_set and publishes it under the same column name.
grab.encrypt_set(True) # set is_other_party to true
gojek.encrypt_set(True)

### Clients' status after step 3 (value of variables in client)

In [17]:
# Show all sets computed so far. Note the cross-over in the last two entries:
# gojek's doubly encrypted set is the one grab computed (held in
# grab.other_party_encrypted_set), and vice versa.
d = {"gojek hashed set": gojek.my_hashed_set,
    "grab hashed set": grab.my_hashed_set,
    "gojek self-encrypted set": gojek.my_self_encrypted_set,
    "grab self-encrypted set": grab.my_self_encrypted_set,
    "gojek encrypted set": grab.other_party_encrypted_set,
    "grab encrypted set": gojek.other_party_encrypted_set}
df = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in d.items()]))
df

Unnamed: 0,gojek hashed set,grab hashed set,gojek self-encrypted set,grab self-encrypted set,gojek encrypted set,grab encrypted set
0,2768208393965946516720796860751640208381574662...,6083752070806636466132091408609415889437525399...,9557256798444998184220275153824514335332385363...,1392352159919810814017120681618309248468361322...,2300390704556221313577963135320936798852806232...,1281252686543961183522353778077229692196011855...
1,2971259318188054936238701069597951289131550069...,6555446118383117843414628061923602991736146568...,1460451436827093985196414316894890537894937362...,4622428122136317708189314036339472078217807601...,1317606303926197872121260195114249826297781271...,5220978494564481332492194367778287229525456463...
2,8972686745761283250343305795976962558216403891...,3062635494707074836246422469968877214990302591...,3571188632407501134124104523088980239946688176...,5942908640927549251917941758078932952822195230...,1457964589338857629679463557055145913663575234...,1194412300580734417891316323169870781702617511...
3,3931935781355731990509341551661525053343436962...,2835741223782226842638506406959946569416451500...,1538229197935480455180260588412252506182615469...,7198280742294667504621057866290344241933761279...,4254700741338299316419325059783470002322200468...,1391992782221417421400682169823026680844258431...
4,5239384073106902936597081070089405716452234004...,1959933681547377774404303441869569823311439691...,8258474891767245356657215479687971229284999379...,1131009300071130670438802381425857068233711464...,1147392936283044303244773272053784324332943926...,1559945421804806407529255730577771914920661804...
...,...,...,...,...,...,...
95,8697318885625964372170362596957150767783631535...,4848231873104288702523644346183711147463602694...,6986696944471524792448963798207600583551551379...,3486909962302736219700328330295447882837700248...,1315344486749542364637096843456616960500143193...,4211795640512906751845964021437945885979847844...
96,7960204644377876874342383407634690809702051347...,1087372183257775118984247408250577970017171312...,8890085851683032426554197986412694846922630676...,3305471807821912777104979274305894203182716863...,1167373885549945402991746346636214702234659921...,1161492368286201523110868484432013155677066211...
97,8828111265965186841295119336388642758526460672...,3948222474783499474687611776797431125471701320...,6454280183340631564446477051026150589875456465...,2496872812736358130577346816495859375019331280...,1418936840904404426977479423554871854963019439...,5472202978841524575524056747522742031454065689...
98,9196222794520213004035745078489116002064573260...,7018556158959834041117086565402060628499966280...,1134129216613459646024895431301762076502499298...,1108072423141012822398588241842464975475759935...,1420173906249052718176950494013899318006951201...,3210771110138912664108461007691016694100500200...


### Files' status after step 3  (value of variables in file - clients' variables encrypted with Fernet)

In [18]:
# Re-read both files after step 3; the "other_party_encrypted_set" column is
# now filled. gojek's file is displayed here, grab's in the next cell.
df_gojek = gojek.get_my_data()
df_grab = grab.get_my_data()
print("gojek's file:")
df_gojek



gojek's file:


Unnamed: 0,my_self_encrypted_set,other_party_encrypted_set,common_values
0,b'gAAAAABhoySjZBvx0K1XG9vicqBEX9youI8dYa3aXSHw...,b'gAAAAABhoySlDwATUma-bd9TeH1VPZyIC5CpFT_xzeR3...,
1,b'gAAAAABhoySjUx74pvRlDdTNrqMPvZjkU7salFkRflUV...,b'gAAAAABhoySli32ESxHtUFeK-Dd_kFPwfrKf6oXpsKBk...,
2,b'gAAAAABhoySjDSenxt2FZIm-GnAh3rli4V9r8HTS7oon...,b'gAAAAABhoySlOsWrwCs7NzCSHyYKXKSxsStz-EUISg1s...,
3,b'gAAAAABhoySj-vCs5hXY-3WQjT4rynA7mxGhxrrUOFPz...,b'gAAAAABhoySloWC120tcu340hKsancN8jwmttO-lDUkI...,
4,b'gAAAAABhoySjH6UUHkHL8Flu9SQFC1xF0THqmMEkUixq...,b'gAAAAABhoySlK5kNw1FqU9k2lNjAvwhRq9oHwGr_SWXj...,
...,...,...,...
95,b'gAAAAABhoySj00CEOzvm6BLANslHqmaasMQeJ1IT7Ale...,b'gAAAAABhoySlioRmvrTnGaKkvKWYCsDT9Vk2zIp8DJxX...,
96,b'gAAAAABhoySjYISDIOA0nUofTo8XAsuYTpPxSbhluPLH...,b'gAAAAABhoySlQ4NkWki3EMUo2fU721pfdOpMifKL-DXJ...,
97,b'gAAAAABhoySjLa6erJRPgYIrItk0WLJd9rdOgWAoPQ7T...,b'gAAAAABhoySlw_cwq46NRQWDNayLIZbggETEgIAf_Q2k...,
98,b'gAAAAABhoySjggx8LDZFePyFUbTZjz5xn1r8vNXfyqEE...,b'gAAAAABhoySlNZGFTJ2rDPOvbJUYvCcqK_OXcIAI4u2z...,


In [19]:
# Display grab's file as loaded into df_grab by the previous cell.
print("grab's file:")
df_grab

grab's file:


Unnamed: 0,my_self_encrypted_set,other_party_encrypted_set,common_values
0,b'gAAAAABhoySjh9cWfH4PpH1rBOSdZ63PBsG1dKhVWdnq...,b'gAAAAABhoySksUglydlS2N20BdxXZ7GOteNczuDyscSr...,
1,b'gAAAAABhoySj9qSCYeL_jdG-gmTFZZWwGAbOU2OoGLdc...,b'gAAAAABhoySknj9X980YfMd0cUloMRp_rnCKtyn0brYO...,
2,b'gAAAAABhoySjdCfDahrGziRK9FhuP0W32qaUxPfV7wsW...,b'gAAAAABhoySkgVdr0EEYoTz-x7TP4ynJHoXhhCQatHPk...,
3,b'gAAAAABhoySjHOX6YWxl8gW4_Xm3vroeWIy8dDs5148m...,b'gAAAAABhoySkRas-aEYojB2DkgWkksClCjyOBHnENHP5...,
4,b'gAAAAABhoySj9soRg2-JUmUxCHcQXZKL3tW5VW-A2FV-...,b'gAAAAABhoySkmJhWudmy1nCuGMfJpGHojfo9XVmixGqZ...,
...,...,...,...
95,b'gAAAAABhoySjdqvdD2lWTTJoK8REPWLLIW4VXwkUEYf7...,b'gAAAAABhoySkc9bJ_viiuO2SN4UX6ePmZJHjGFcD-_cq...,
96,b'gAAAAABhoySjGXIV2HeBUBWr5FUwfI9Bf43McNw-uRuK...,b'gAAAAABhoySkarX-tUwd0jQQV3x1zKPMoGYgZ3ioPPBA...,
97,b'gAAAAABhoySjLSj-38cDr_p1Ghh39iKYEuFGy_LFgrkF...,b'gAAAAABhoySk9hK5RGMhs4WZlMSMTsd2TSyTMsq-CEhJ...,
98,b'gAAAAABhoySjJS2ZAKrfB2LNnsWvg9J76fs0egLOC3Zl...,b'gAAAAABhoySkmcN_G5aLGHFpmlhCUnM2eDLdGbeQSDxe...,


## Step 4: find intersection


In [20]:
# clients find intersection
# Each client compares its own doubly encrypted set (read from the peer's
# file) with the peer's doubly encrypted set it computed in step 3, stores the
# matching phone numbers in common_values and publishes them.
grab.get_intersection()
gojek.get_intersection()

### Clients' status after step 4 (value of variables in client)

In [21]:
# Show every intermediate set plus the intersection each client found.
# The two intersection columns are shorter than the rest, so pd.Series
# wrapping is what allows them to sit in the same DataFrame.
d = {"gojek hashed set": gojek.my_hashed_set,
    "grab hashed set": grab.my_hashed_set,
    "gojek self-encrypted set": gojek.my_self_encrypted_set,
    "grab self-encrypted set": grab.my_self_encrypted_set,
    "gojek encrypted set": grab.other_party_encrypted_set,
    "grab encrypted set": gojek.other_party_encrypted_set,
    "gojek found intersection": gojek.common_values,
    "grab found intersection:": grab.common_values}
df = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in d.items()]))
df

Unnamed: 0,gojek hashed set,grab hashed set,gojek self-encrypted set,grab self-encrypted set,gojek encrypted set,grab encrypted set,gojek found intersection,grab found intersection:
0,2768208393965946516720796860751640208381574662...,6083752070806636466132091408609415889437525399...,9557256798444998184220275153824514335332385363...,1392352159919810814017120681618309248468361322...,2300390704556221313577963135320936798852806232...,1281252686543961183522353778077229692196011855...,96192082.0,96192082.0
1,2971259318188054936238701069597951289131550069...,6555446118383117843414628061923602991736146568...,1460451436827093985196414316894890537894937362...,4622428122136317708189314036339472078217807601...,1317606303926197872121260195114249826297781271...,5220978494564481332492194367778287229525456463...,99173089.0,94391128.0
2,8972686745761283250343305795976962558216403891...,3062635494707074836246422469968877214990302591...,3571188632407501134124104523088980239946688176...,5942908640927549251917941758078932952822195230...,1457964589338857629679463557055145913663575234...,1194412300580734417891316323169870781702617511...,94391128.0,99173089.0
3,3931935781355731990509341551661525053343436962...,2835741223782226842638506406959946569416451500...,1538229197935480455180260588412252506182615469...,7198280742294667504621057866290344241933761279...,4254700741338299316419325059783470002322200468...,1391992782221417421400682169823026680844258431...,81093373.0,81093373.0
4,5239384073106902936597081070089405716452234004...,1959933681547377774404303441869569823311439691...,8258474891767245356657215479687971229284999379...,1131009300071130670438802381425857068233711464...,1147392936283044303244773272053784324332943926...,1559945421804806407529255730577771914920661804...,99397525.0,99397525.0
...,...,...,...,...,...,...,...,...
95,8697318885625964372170362596957150767783631535...,4848231873104288702523644346183711147463602694...,6986696944471524792448963798207600583551551379...,3486909962302736219700328330295447882837700248...,1315344486749542364637096843456616960500143193...,4211795640512906751845964021437945885979847844...,,
96,7960204644377876874342383407634690809702051347...,1087372183257775118984247408250577970017171312...,8890085851683032426554197986412694846922630676...,3305471807821912777104979274305894203182716863...,1167373885549945402991746346636214702234659921...,1161492368286201523110868484432013155677066211...,,
97,8828111265965186841295119336388642758526460672...,3948222474783499474687611776797431125471701320...,6454280183340631564446477051026150589875456465...,2496872812736358130577346816495859375019331280...,1418936840904404426977479423554871854963019439...,5472202978841524575524056747522742031454065689...,,
98,9196222794520213004035745078489116002064573260...,7018556158959834041117086565402060628499966280...,1134129216613459646024895431301762076502499298...,1108072423141012822398588241842464975475759935...,1420173906249052718176950494013899318006951201...,3210771110138912664108461007691016694100500200...,,


Note: Unlike the other columns, the last 2 columns are not aligned row-by-row with the rest of the table, i.e. a value in either of the last 2 columns has no relation to the other values in the same row.

### Files' status after step 4  (value of variables in file - clients' variables encrypted with Fernet)

In [22]:
# Re-read both files after step 4; the "common_values" column is now filled.
# gojek's file is displayed here, grab's in the next cell.
df_gojek = gojek.get_my_data()
df_grab = grab.get_my_data()
print("gojek's file:")
df_gojek


gojek's file:


Unnamed: 0,my_self_encrypted_set,other_party_encrypted_set,common_values
0,b'gAAAAABhoySjZBvx0K1XG9vicqBEX9youI8dYa3aXSHw...,b'gAAAAABhoySlDwATUma-bd9TeH1VPZyIC5CpFT_xzeR3...,b'gAAAAABhoySl6yE1-nvTxEKzuPqo2GKwYFggfJygsvRW...
1,b'gAAAAABhoySjUx74pvRlDdTNrqMPvZjkU7salFkRflUV...,b'gAAAAABhoySli32ESxHtUFeK-Dd_kFPwfrKf6oXpsKBk...,b'gAAAAABhoySln_Hv3o0k3BfSy3WTmZImzT4Hzv6-8PDJ...
2,b'gAAAAABhoySjDSenxt2FZIm-GnAh3rli4V9r8HTS7oon...,b'gAAAAABhoySlOsWrwCs7NzCSHyYKXKSxsStz-EUISg1s...,b'gAAAAABhoySltZdtPPCRiQz8gQ-AD5Amr5yqU36UBMVw...
3,b'gAAAAABhoySj-vCs5hXY-3WQjT4rynA7mxGhxrrUOFPz...,b'gAAAAABhoySloWC120tcu340hKsancN8jwmttO-lDUkI...,b'gAAAAABhoySlvaK34884wQer0-vK8B59UxjSrP9RpqQw...
4,b'gAAAAABhoySjH6UUHkHL8Flu9SQFC1xF0THqmMEkUixq...,b'gAAAAABhoySlK5kNw1FqU9k2lNjAvwhRq9oHwGr_SWXj...,b'gAAAAABhoySlSv4cadB2zByYVQRYSnFsJKeyFUEifU2Y...
...,...,...,...
95,b'gAAAAABhoySj00CEOzvm6BLANslHqmaasMQeJ1IT7Ale...,b'gAAAAABhoySlioRmvrTnGaKkvKWYCsDT9Vk2zIp8DJxX...,
96,b'gAAAAABhoySjYISDIOA0nUofTo8XAsuYTpPxSbhluPLH...,b'gAAAAABhoySlQ4NkWki3EMUo2fU721pfdOpMifKL-DXJ...,
97,b'gAAAAABhoySjLa6erJRPgYIrItk0WLJd9rdOgWAoPQ7T...,b'gAAAAABhoySlw_cwq46NRQWDNayLIZbggETEgIAf_Q2k...,
98,b'gAAAAABhoySjggx8LDZFePyFUbTZjz5xn1r8vNXfyqEE...,b'gAAAAABhoySlNZGFTJ2rDPOvbJUYvCcqK_OXcIAI4u2z...,


In [23]:
# Display grab's file as loaded into df_grab by the previous cell.
print("grab's file:")
df_grab

grab's file:


Unnamed: 0,my_self_encrypted_set,other_party_encrypted_set,common_values
0,b'gAAAAABhoySjh9cWfH4PpH1rBOSdZ63PBsG1dKhVWdnq...,b'gAAAAABhoySksUglydlS2N20BdxXZ7GOteNczuDyscSr...,b'gAAAAABhoySlbbOSwxL9zOL_z8m3F0SnhK1VlopKqFhz...
1,b'gAAAAABhoySj9qSCYeL_jdG-gmTFZZWwGAbOU2OoGLdc...,b'gAAAAABhoySknj9X980YfMd0cUloMRp_rnCKtyn0brYO...,b'gAAAAABhoySl6fP_T-JTveUCnvc0ew1NBt42F8QemnU3...
2,b'gAAAAABhoySjdCfDahrGziRK9FhuP0W32qaUxPfV7wsW...,b'gAAAAABhoySkgVdr0EEYoTz-x7TP4ynJHoXhhCQatHPk...,b'gAAAAABhoySlwHQW-LPtHWguGer6EIyCQpNzDH_fYkCY...
3,b'gAAAAABhoySjHOX6YWxl8gW4_Xm3vroeWIy8dDs5148m...,b'gAAAAABhoySkRas-aEYojB2DkgWkksClCjyOBHnENHP5...,b'gAAAAABhoySlCtF7p_o6L_BOnC8b-Bq-5EyjfY9kjujD...
4,b'gAAAAABhoySj9soRg2-JUmUxCHcQXZKL3tW5VW-A2FV-...,b'gAAAAABhoySkmJhWudmy1nCuGMfJpGHojfo9XVmixGqZ...,b'gAAAAABhoySl7GV2vHNANXbQikERPPGIqyHjTjMujmA_...
...,...,...,...
95,b'gAAAAABhoySjdqvdD2lWTTJoK8REPWLLIW4VXwkUEYf7...,b'gAAAAABhoySkc9bJ_viiuO2SN4UX6ePmZJHjGFcD-_cq...,
96,b'gAAAAABhoySjGXIV2HeBUBWr5FUwfI9Bf43McNw-uRuK...,b'gAAAAABhoySkarX-tUwd0jQQV3x1zKPMoGYgZ3ioPPBA...,
97,b'gAAAAABhoySjLSj-38cDr_p1Ghh39iKYEuFGy_LFgrkF...,b'gAAAAABhoySk9hK5RGMhs4WZlMSMTsd2TSyTMsq-CEhJ...,
98,b'gAAAAABhoySjJS2ZAKrfB2LNnsWvg9J76fs0egLOC3Zl...,b'gAAAAABhoySkmcN_G5aLGHFpmlhCUnM2eDLdGbeQSDxe...,


# Check results

In [24]:
# get intersection found by the two parties
# These names are aliases of the clients' own lists, not copies.
gojek_found_intersection = gojek.common_values
grab_found_intersection = grab.common_values


# sort numbers for easier comparison
# list.sort() works in place, so gojek.common_values, grab.common_values and
# common_phone_numbers themselves end up sorted.
gojek_found_intersection.sort()
grab_found_intersection.sort()
common_phone_numbers.sort()

# summarize them in a dataframe
# "actual" is the ground truth fixed when the phone numbers were generated;
# the comparison is visual only, nothing is asserted here.
d = {"actual": common_phone_numbers,
    "gojek": gojek_found_intersection,
    "grab": grab_found_intersection}
df = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in d.items()]))
df

Unnamed: 0,actual,gojek,grab
0,80497694,80497694,80497694
1,81093373,81093373,81093373
2,86915509,86915509,86915509
3,89312048,89312048,89312048
4,94391128,94391128,94391128
5,95521626,95521626,95521626
6,96192082,96192082,96192082
7,96485172,96485172,96485172
8,99173089,99173089,99173089
9,99397525,99397525,99397525
