***

In [86]:
from phe import paillier
import time
import pandas as pd

***

In [31]:
def generate_keys():
    """Create a new Paillier key pair.

    Returns:
        tuple: ``(public_key, private_key)`` as produced by
        ``paillier.generate_paillier_keypair()``.
    """
    pub, priv = paillier.generate_paillier_keypair()
    return (pub, priv)

def encode_string(s, encoding_dict):
    """Look up the integer code for ``s``, assigning a new one on first sight.

    Args:
        s: Hashable value to encode; it is used as a dictionary key.
        encoding_dict: Mutable mapping of value -> code. It is updated in
            place when ``s`` has no entry yet.

    Returns:
        The code stored for ``s``. A newly assigned code equals
        ``len(encoding_dict) + 1`` at the moment of insertion, so a dictionary
        filled only through this function receives the codes 1, 2, 3, ...
    """
    # Computed up front; it is only stored when ``s`` is not already a key.
    candidate_code = len(encoding_dict) + 1
    return encoding_dict.setdefault(s, candidate_code)

def encrypt_value(value, public_key):
    """Encrypt ``value`` by delegating to ``public_key.encrypt``.

    Args:
        value: The plaintext to encrypt.
        public_key: Object exposing an ``encrypt(value)`` method.

    Returns:
        Whatever ``public_key.encrypt(value)`` returns.
    """
    ciphertext = public_key.encrypt(value)
    return ciphertext

def decrypt_value(encrypted_value, private_key):
    """Decrypt ``encrypted_value`` by delegating to ``private_key.decrypt``.

    Args:
        encrypted_value: The ciphertext to decrypt.
        private_key: Object exposing a ``decrypt(encrypted_value)`` method.

    Returns:
        Whatever ``private_key.decrypt(encrypted_value)`` returns.
    """
    plaintext = private_key.decrypt(encrypted_value)
    return plaintext

In [37]:
# Encrypt and search in dataset
def HE(dataset, query_value):
    """Encrypt ``dataset`` under a fresh key pair and look up ``query_value``.

    The following steps are all included in the reported time:

    1. generate a new key pair with ``generate_keys()``;
    2. map every dataset value to an integer code and encrypt each code;
    3. encode and encrypt the query value;
    4. decrypt the stored ciphertexts one at a time until a decrypted code
       equals the query's code (``any`` stops at the first match).

    Note that the comparison in step 4 is made on decrypted values using the
    private key; the encrypted query produced in step 3 is not used by it.

    Args:
        dataset: Iterable of hashable values to encode, encrypt and search.
        query_value: Hashable value to look for.

    Returns:
        None. The search result and the elapsed time are printed.

    Raises:
        TypeError: If ``dataset`` is None.
    """
    if dataset is None:
        # Iterating None below would raise TypeError anyway, but only after
        # the expensive key generation; fail early with a clearer message.
        raise TypeError("dataset must be an iterable of values, got None")

    # perf_counter() is monotonic, so the measured interval cannot be
    # distorted by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    # Generate keys
    public_key, private_key = generate_keys()

    # Encoding dictionary shared by the dataset and the query, so equal
    # values receive equal codes
    encoding_dict = {}

    # Encode and encrypt the dataset
    encoded_dataset = [encode_string(value, encoding_dict) for value in dataset]
    encrypted_dataset = [encrypt_value(value, public_key) for value in encoded_dataset]

    # Encode and encrypt the query value
    encoded_query = encode_string(query_value, encoding_dict)
    # NOTE: the ciphertext is not consulted by the search below; the call is
    # kept so the timed workload stays the same as in earlier runs.
    encrypted_query = encrypt_value(encoded_query, public_key)

    # Search: decrypt each stored value and compare it with the query's code
    search_result = any(
        decrypt_value(encrypted_value, private_key) == encoded_query
        for encrypted_value in encrypted_dataset
    )

    # Calculate the time taken
    time_taken = time.perf_counter() - start_time

    print(f"Search result for query value '{query_value}': {search_result}")
    print(f"Time taken to run the cell: {time_taken} seconds")

In [39]:
# Smoke test on a tiny hand-written list (ints and numeric strings mixed).
dataset = [
    1, 2, 3, 4,
    "5", "6", "7", "8", "9", "10",
]
query_value = "10"
HE(dataset=dataset, query_value=query_value)

Search result for query value '10': True
Time taken to run the cell: 3.6481218338012695 seconds


***

In [69]:
def extract_column_to_list(csv_filename, column_name, max_rows=None):
    """Read one column of a CSV file and return its values as a list.

    Args:
        csv_filename: Path of the CSV file to read.
        column_name: Header name of the column to extract.
        max_rows: Maximum number of data rows to read. ``None`` (the default)
            reads the whole file.

    Returns:
        list | None: The column's values for the rows that were read, or
        ``None`` when the file is not found or a ``ValueError`` occurs
        (including the column being absent). A message is printed in both
        failure cases.
    """
    try:
        # Parse only the requested column instead of every column in the file.
        # A callable ``usecols`` yields no columns when nothing matches rather
        # than raising, so the explicit check below still reports the problem
        # with this function's own message.
        df = pd.read_csv(
            csv_filename,
            nrows=max_rows,
            usecols=lambda name: name == column_name,
        )

        # Check if the column exists in the DataFrame
        if column_name not in df.columns:
            raise ValueError(f"Column '{column_name}' does not exist in the CSV file.")

        # Extract the column and convert it to a list
        return df[column_name].tolist()

    except FileNotFoundError:
        print(f"File '{csv_filename}' not found.")
        return None
    except ValueError as e:
        print(e)
        return None

***

In [110]:
# Data import configuration.
#
# Dataset source:
# https://www.kaggle.com/code/imranp/starter-synthetic-financial-datasets-cd6449a6-6

csv_filename = "SynFinData1.csv"  # CSV file read by the loading cells
column_name = "nameDest"          # column loaded for the string search tests

***

In [113]:
# Load up to the first 10 rows of the chosen column and show them.
max_rows = 10
dataset10 = extract_column_to_list(csv_filename, column_name, max_rows=max_rows)
print(dataset10)

In [114]:
# Load up to the first 50 rows of the chosen column.
max_rows = 50
dataset50 = extract_column_to_list(csv_filename, column_name, max_rows=max_rows)

In [115]:
# Load up to the first 100 rows of the chosen column.
max_rows = 100
dataset100 = extract_column_to_list(csv_filename, column_name, max_rows=max_rows)

In [116]:
# Load up to the first 500 rows of the chosen column.
max_rows = 500
dataset500 = extract_column_to_list(csv_filename, column_name, max_rows=max_rows)

In [117]:
# Load up to the first 1,000 rows of the chosen column.
max_rows = 1_000
dataset1000 = extract_column_to_list(csv_filename, column_name, max_rows=max_rows)

In [118]:
# Load up to the first 5,000 rows of the chosen column.
max_rows = 5_000
dataset5000 = extract_column_to_list(csv_filename, column_name, max_rows=max_rows)

In [119]:
# Load up to the first 10,000 rows of the chosen column.
max_rows = 10_000
dataset10000 = extract_column_to_list(csv_filename, column_name, max_rows=max_rows)

In [120]:
# Load up to the first 50,000 rows of the chosen column.
max_rows = 50_000
dataset50000 = extract_column_to_list(csv_filename, column_name, max_rows=max_rows)

In [121]:
# Load up to the first 100,000 rows of the chosen column.
max_rows = 100_000
dataset100000 = extract_column_to_list(csv_filename, column_name, max_rows=max_rows)

***

In [138]:
# String test: search the 10-row sample
query_value = "M1979787155"
HE(dataset=dataset10, query_value=query_value)

Search result for query value 'M1979787155': True
Time taken to run the cell: 3.5409061908721924 seconds


In [123]:
# String test: search the 50-row sample
query_value = "M1979787155"
HE(dataset=dataset50, query_value=query_value)

Search result for query value 'M1979787155': True
Time taken to run the cell: 12.957278966903687 seconds


In [124]:
# String test: search the 100-row sample
query_value = "M1979787155"
HE(dataset=dataset100, query_value=query_value)

Search result for query value 'M1979787155': True
Time taken to run the cell: 22.82736086845398 seconds


In [126]:
# String test: search the 500-row sample
query_value = "M1979787155"
HE(dataset=dataset500, query_value=query_value)

Search result for query value 'M1979787155': True
Time taken to run the cell: 108.46706032752991 seconds


In [127]:
# String test: search the 1,000-row sample
query_value = "M1979787155"
HE(dataset=dataset1000, query_value=query_value)

Search result for query value 'M1979787155': True
Time taken to run the cell: 223.77642345428467 seconds


In [128]:
# String test: search the 5,000-row sample
query_value = "M1979787155"
HE(dataset=dataset5000, query_value=query_value)

Search result for query value 'M1979787155': True
Time taken to run the cell: 1049.3617677688599 seconds


In [129]:
# String test: search the 10,000-row sample
query_value = "M1979787155"
HE(dataset=dataset10000, query_value=query_value)

Search result for query value 'M1979787155': True
Time taken to run the cell: 2081.393767118454 seconds


In [130]:
# String test: search the 50,000-row sample
query_value = "M1979787155"
HE(dataset=dataset50000, query_value=query_value)

Search result for query value 'M1979787155': True
Time taken to run the cell: 10439.68198800087 seconds


In [132]:
# String test: search the 100,000-row sample
query_value = "M1979787155"
HE(dataset=dataset100000, query_value=query_value)

Search result for query value 'M1979787155': True
Time taken to run the cell: 20963.100166082382 seconds


***

In [134]:
#The individual tests above were kept so that some results are visible in the notebook.
#For the remaining tests, a single cell is reused (run it, record the result, then change the variables) to keep the notebook compact.

In [147]:
# Second column under test; the integer search cell loads its data from it.
column_name2 = "oldbalanceDest"

In [149]:
# Int test: load up to the first 10 rows of the second column, then search it.
max_rows = 10
intdataset10 = extract_column_to_list(csv_filename, column_name2, max_rows=max_rows)

query_value = 21182
HE(dataset=intdataset10, query_value=query_value)

Search result for query value '21182': True
Time taken to run the cell: 3.833334445953369 seconds
