# ZKredible
This notebook illustrates the idea of ZK lending.

some introduction to be added


## Setup

Install and import the necessary libraries.

In [None]:
!pip install tenseal



In [None]:
import torch
import tenseal as ts
import pandas as pd
import random
from time import time


# those are optional and are not necessary for training
import numpy as np
import matplotlib.pyplot as plt

We use the credit score classification dataset from [Kaggle](https://www.kaggle.com/datasets/parisrohan/credit-score-classification/data?select=train.csv).

The data has already been cleaned; for clarity, we do not show the cleaning process here. You can find the data-cleaning code in our code base.


In [94]:
# Single source of truth for the seed so torch and `random` cannot drift apart.
SEED = 919
torch.random.manual_seed(SEED)  # seeds torch's default generator (model initialisation)
random.seed(SEED)               # seeds Python's `random`, used for the train/test shuffle


def split_train_test(x, y, test_ratio=0.3):
    """Randomly partition paired samples ``x`` / ``y`` into train and test parts.

    The row indices are shuffled with the global ``random`` module, the first
    ``int(len(x) * test_ratio)`` shuffled indices become the test set and the
    remaining ones the train set.

    Args:
        x: indexable collection of samples (must accept a list of indices).
        y: labels, indexed with the same indices as ``x``.
        test_ratio: fraction of the samples assigned to the test set.

    Returns:
        ``(x_train, y_train, x_test, y_test)``.
    """
    n_samples = len(x)
    order = list(range(n_samples))
    random.shuffle(order)
    # position in the shuffled order where the test part ends and train begins
    cut = int(n_samples * test_ratio)
    held_out = order[:cut]
    kept = order[cut:]
    return x[kept], y[kept], x[held_out], y[held_out]


def credit_data():
    """Load the cleaned credit dataset and split it into train/test tensors.

    Reads ``data/cleaned_data.csv``, uses the ``Credit_Score`` column as the
    label and every remaining column as a feature, converts both to
    single-precision tensors and hands them to ``split_train_test``.

    Returns:
        The result of ``split_train_test(x, y)``:
        ``(x_train, y_train, x_test, y_test)``.
    """
    data = pd.read_csv("data/cleaned_data.csv")
    # we use credit score as label; unsqueeze gives it shape (n_samples, 1)
    y = torch.tensor(data["Credit_Score"].values.astype(np.single)).unsqueeze(1)
    # Select the axis by keyword: the positional form drop(label, "columns")
    # is deprecated on pandas 1.x and raises a TypeError on pandas >= 2.0.
    features = data.drop(columns="Credit_Score")

    x = torch.tensor(features.values.astype(np.single))

    return split_train_test(x, y)

x_train, y_train, x_test, y_test = credit_data()

# Report the shape of each split, one per line.
print(
    f"x_train has shape: {x_train.shape}",
    f"y_train has shape: {y_train.shape}",
    f"x_test has shape: {x_test.shape}",
    f"y_test has shape: {y_test.shape}",
    sep="\n",
)


x_train has shape: torch.Size([1400, 39])
y_train has shape: torch.Size([1400, 1])
x_test has shape: torch.Size([600, 39])
y_test has shape: torch.Size([600, 1])


  data = data.drop("Credit_Score", "columns")


#### Update Rule

For updating the parameters, the usual rule is as follows, where $x_j^{(i)}$ is the $j$-th feature of the $i$-th input sample:

$$\theta_j = \theta_j - \alpha \; [ \frac{1}{m} \sum_{i=1}^m (\hat{y}^{(i)} - y^{(i)}) x_j^{(i)} + \frac{\lambda}{m} \theta_j]$$

However, due to the constraints of homomorphic encryption, we prefer to use $\alpha = 1$ to save a multiplication and set $\frac{\lambda}{m} = 0.05$, which gives the following update rule:

$$\theta_j = \theta_j - [ \frac{1}{m} \sum_{i=1}^m (\hat{y}^{(i)} - y^{(i)}) x_j^{(i)} + 0.05 \theta_j]$$

#### Sigmoid Approximation

Since we can't simply compute sigmoid on encrypted data, we need to approximate it using a low degree polynomial, the lower the degree the better, as we aim to perform as few multiplications as possible, to be able to use smaller parameters and thus optimize computation. This tutorial uses a degree 3 polynomial from https://eprint.iacr.org/2018/462.pdf, which approximates the sigmoid function in the range $[-5,5]$.

$$\sigma(x) = 0.5 + 0.197 x - 0.004 x^3$$

#### Homomorphic Encryption Parameters

From the input data to the parameter update, a ciphertext needs a multiplicative depth of 6: 1 for the dot product, 2 for the sigmoid approximation, and 3 for the backpropagation phase. (One of these is hidden in the `self._delta_w += enc_x * out_minus_y` operation in the `backward()` function: it multiplies a 1-sized vector with an n-sized one, which requires masking the first slot and replicating it n times in the first vector.) With a scale of around 20 bits, we need 6 coefficient moduli with the same bit-size as the scale, plus the last coefficient modulus, which needs more bits. This already exceeds what a polynomial modulus degree of 4096 allows (it requires a total coefficient-modulus bit count < 109 if we consider 128-bit security), so we use 8192. This lets us batch up to 4096 values in a single ciphertext; we are far from that limit, so we do not need to worry about it.


In [95]:
class EncryptedLR:
    """Logistic regression trained sample-by-sample with accumulated gradients.

    ``weight`` and ``bias`` start as plain Python lists; ``encrypt`` turns them
    into CKKS vectors and ``decrypt`` turns them back into plain lists.
    Gradients are accumulated by ``backward`` and applied by
    ``update_parameters``.
    """

    def __init__(self, n_features):
        """Initialise parameters from a freshly created ``torch.nn.Linear(n_features, 1)``."""
        linear = torch.nn.Linear(n_features, 1)
        self.weight = linear.weight.data.tolist()[0]
        self.bias = linear.bias.data.tolist()
        # gradient accumulators and the number of backward() calls since the last update
        self._delta_w = 0
        self._delta_b = 0
        self._count = 0

    def forward(self, enc_x):
        """Return the approximated sigmoid of ``enc_x . weight + bias``."""
        linear_out = enc_x.dot(self.weight) + self.bias
        return EncryptedLR.sigmoid(linear_out)

    def backward(self, enc_x, enc_out, enc_y):
        """Accumulate the gradient contribution of one sample.

        Args:
            enc_x: the sample passed to ``forward``.
            enc_out: the prediction returned by ``forward`` for ``enc_x``.
            enc_y: the label for ``enc_x``.
        """
        residual = enc_out - enc_y
        self._delta_w += enc_x * residual
        self._delta_b += residual
        self._count += 1

    def update_parameters(self):
        """Apply the averaged accumulated gradients, then reset the accumulators.

        Raises:
            RuntimeError: if no sample has been accumulated since the last update.
        """
        if self._count == 0:
            raise RuntimeError("You should at least run one forward iteration")
        averaging = 1 / self._count
        # The extra `weight * 0.05` term is a small regularization that keeps the
        # output of the linear layer in the range of the sigmoid approximation.
        self.weight -= self._delta_w * averaging + self.weight * 0.05
        self.bias -= self._delta_b * averaging
        # start the next accumulation round from scratch
        self._delta_w = self._delta_b = self._count = 0

    @staticmethod
    def sigmoid(enc_x):
        """Degree-3 polynomial stand-in for the sigmoid.

        sigmoid(x) = 0.5 + 0.197 * x - 0.004 * x^3, taken from
        https://eprint.iacr.org/2018/462.pdf; it fits the function
        pretty well in the range [-5,5].
        """
        coefficients = [0.5, 0.197, 0, -0.004]
        return enc_x.polyval(coefficients)

    def plain_accuracy(self, x_test, y_test):
        """Accuracy of the current (plain) parameters on the plain ``(x_test, y_test)`` set.

        A prediction counts as correct when it is within 0.5 of its label.
        """
        weight = torch.tensor(self.weight)
        bias = torch.tensor(self.bias)
        probs = torch.sigmoid(x_test.matmul(weight) + bias).reshape(-1, 1)
        hits = torch.abs(y_test - probs) < 0.5
        return hits.float().mean()

    def encrypt(self, context):
        """Replace ``weight`` and ``bias`` with CKKS vectors built from ``context``."""
        self.weight = ts.ckks_vector(context, self.weight)
        self.bias = ts.ckks_vector(context, self.bias)

    def decrypt(self):
        """Replace ``weight`` and ``bias`` with their decrypted values."""
        self.weight = self.weight.decrypt()
        self.bias = self.bias.decrypt()

    def __call__(self, *args, **kwargs):
        """Calling the model is a shorthand for ``forward``."""
        return self.forward(*args, **kwargs)


In [96]:
# parameters claimed to achieve 128-bit security
poly_mod_degree = 8192
# a 40-bit modulus at each end, six 21-bit moduli in between
coeff_mod_bit_sizes = [40] + [21] * 6 + [40]
# create the TenSEALContext used for training
ctx_training = ts.context(ts.SCHEME_TYPE.CKKS, poly_mod_degree, -1, coeff_mod_bit_sizes)
ctx_training.global_scale = 1 << 21  # same value as 2 ** 21
ctx_training.generate_galois_keys()

In [98]:
# Encrypt every training sample and every label as its own CKKS vector,
# timing the whole pass.
t_start = time()
enc_x_train = [
    ts.ckks_vector(ctx_training, sample.tolist())
    for sample in x_train
]
enc_y_train = [
    ts.ckks_vector(ctx_training, label.tolist())
    for label in y_train
]
t_end = time()
print(f"Encryption of the training_set took {int(t_end - t_start)} seconds")

Encryption of the training_set took 37 seconds


In [101]:
n_features = x_train.shape[1]
print(n_features)
encrypted_logistic = EncryptedLR(n_features)

# wall-clock duration of each encrypted training epoch, in seconds
times = []

print(x_test.type())

# TODO: temporary value, a single epoch only
EPOCHS=1

for epoch in range(EPOCHS):
    # parameters are decrypted at the end of every epoch (see below), so they
    # must be encrypted again before the next pass over the data
    encrypted_logistic.encrypt(ctx_training)

    # if you want to keep an eye on the distribution to make sure
    # the function approximation is still working fine
    # WARNING: this operation is time consuming
    # NOTE: encrypted_out_distribution is not defined in this notebook;
    # it has to be provided before uncommenting the next line
    # encrypted_out_distribution(encrypted_logistic, enc_x_train)

    t_start = time()
    # one forward/backward pass per encrypted sample; gradients are only
    # accumulated here and applied once per epoch by update_parameters()
    for enc_x, enc_y in zip(enc_x_train, enc_y_train):
        enc_out = encrypted_logistic.forward(enc_x)
        encrypted_logistic.backward(enc_x, enc_out, enc_y)
    encrypted_logistic.update_parameters()
    t_end = time()
    times.append(t_end - t_start)

    # decrypt so the accuracy can be evaluated on the plain test set
    encrypted_logistic.decrypt()
    # named `encrypted_accuracy` (not `accuracy`) so it does not share a name
    # with the `accuracy()` helper function defined later in the notebook
    encrypted_accuracy = encrypted_logistic.plain_accuracy(x_test, y_test)
    print(f"Accuracy at epoch #{epoch + 1} is {encrypted_accuracy}")


print(encrypted_logistic.weight)
print(encrypted_logistic.bias)


# Optional summary. The comparison needs `plain_accuracy`, which is only
# computed by the plain-model section further down.
# print(f"\nAverage time per epoch: {int(sum(times) / len(times))} seconds")
# print(f"Final accuracy is {encrypted_accuracy}")

# diff_accuracy = plain_accuracy - encrypted_accuracy
# print(f"Difference between plain and encrypted accuracies: {diff_accuracy}")
# if diff_accuracy < 0:
#     print("Oh! We got a better accuracy when training on encrypted data! The noise was on our side...")


39
torch.FloatTensor
Accuracy at epoch #1 is 0.7300000190734863
[105841671.45342314, 37590.63030239445, -19.736270493926774, 17195.110922071777, 15691.550939873187, -1454.721696131865, -969.1268472571969, -856.9531764324754, 6347.122610045073, 12710.216949949197, -2947.8271711040716, 10449.337807999636, 2970.5026684256563, 11814.95195592563, 302.9028142883926, 22088.588437983388, 24803.365632282497, 20864.870811068664, 2838.9520160034217, 8274.002528775658, 2117.929234589493, 4853.174173993653, 1327.5063719009408, 5715.735938563395, 5589.049024375214, 4156.481528692245, 7438.729170407365, 6402.973733492752, 3766.505956049865, 3644.533162017821, 1872.806829347397, 4954.057095472459, 1862.945450672039, 8527.567065488533, 14558.552761394449, 9585.949095984868, 5501.117199719318, 9306.489230837677, 17351.52238239749]
[24263.788741506833]


## Comparison with a plain logistic regression model
Now let's train a plain logistic regression model with PyTorch and compare the result with the model trained under homomorphic encryption.

In [115]:
class LogisticModel(torch.nn.Module):
    """Plain logistic regression: one linear layer followed by a sigmoid."""

    def __init__(self, n_features):
        """Create the single ``n_features -> 1`` linear layer."""
        super().__init__()
        self.linear = torch.nn.Linear(n_features, 1)

    def forward(self, x):
        """Return ``sigmoid(linear(x))``."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

# Build the plain model, then its loss function and optimizer
n_features = x_train.size(1)
plain_logistic = LogisticModel(n_features)
lossf = torch.nn.BCELoss()
optim = torch.optim.SGD(plain_logistic.parameters(), lr=0.4)

EPOCHS = 5

# Full-batch training: each epoch is a single optimizer step on all of x_train
for epoch in range(EPOCHS):
    optim.zero_grad()
    out = plain_logistic(x_train)
    loss = lossf(out, y_train)
    loss.backward()
    optim.step()
    print(f"Loss at epoch {epoch + 1}: {loss.data}")


# print(plain_logistic.linear.weight)
# print(plain_logistic.linear.bias)

# calculate the accuracy
def accuracy(model, x, y):
    """Fraction of samples whose prediction ``model(x)`` is within 0.5 of ``y``.

    Args:
        model: callable mapping ``x`` to predictions shaped like ``y``.
        x: input tensor.
        y: label tensor.

    Returns:
        A scalar tensor with the mean of the per-sample hits.
    """
    predictions = model(x)
    return (y - predictions).abs().lt(0.5).float().mean()

# Score the plain model on the held-out split
plain_accuracy = accuracy(model=plain_logistic, x=x_test, y=y_test)
print(f"Accuracy of plain model on test_set: {plain_accuracy}")




Loss at epoch 1: 31.254297256469727
Loss at epoch 2: 27.71476936340332
Loss at epoch 3: 27.71476936340332
Loss at epoch 4: 27.71476936340332
Loss at epoch 5: 27.71476936340332
Accuracy of plain model on test_set: 0.7300000190734863


# Convert the encrypted model to ONNX format so it can be fed into EZKL

Now we do the conversion. Unfortunately there is no elegant way to do this, as TenSEAL does not natively support exporting to the ONNX format. We therefore first build a PyTorch logistic model, manually assign its weight and bias from the encrypted model we just trained, and then export it using PyTorch's API.


In [116]:
# Fresh plain model that receives the parameters learned by the encrypted model
model_to_EZKL = LogisticModel(n_features)
# the weight is a flat list, so add a leading axis to get one output row
model_to_EZKL.linear.weight.data = torch.tensor(encrypted_logistic.weight).unsqueeze(0)
model_to_EZKL.linear.bias.data = torch.tensor(encrypted_logistic.bias)






In [117]:
!pip install onnx
!pip install ezkl



# EZKL jumped in

In [120]:
# define the path of all the necessary file
import os
import ezkl
import json
import onnx

# Paths of every artifact produced or consumed by the ezkl steps below
model_path = os.path.join('network.onnx')
compiled_model_path = os.path.join('network.compiled')
pk_path = os.path.join('test.pk')
vk_path = os.path.join('test.vk')
settings_path = os.path.join('settings.json')
srs_path = os.path.join('kzg.srs')
witness_path = os.path.join('witness.json')
data_path = os.path.join('input.json')


# After training, export to onnx (network.onnx) and create a data file (input.json).
# A random sample of the right width is used both as the tracing input for the
# export and as the content of the data file.
x = torch.randn([1, n_features], dtype=torch.float32)

# Export the model
model = model_to_EZKL

# set model to eval mode
model.eval()

torch.onnx.export(model,               # model being run
                  x,                   # model input (or a tuple for multiple inputs)
                  model_path,            # where to save the model (can be a file or file-like object)
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=10,          # the ONNX version to export the model to
                  do_constant_folding=True,  # whether to execute constant folding for optimization
                  input_names = ['input'],   # the model's input names
                  output_names = ['output'], # the model's output names
                  dynamic_axes={'input' : {0 : 'batch_size'},    # variable length axes
                                'output' : {0 : 'batch_size'}})

# flatten the sample into a plain list of floats
data_array = x.detach().numpy().reshape([-1]).tolist()

data_json = dict(input_data = [data_array])

# Write through a context manager so the file handle is flushed and closed
# deterministically instead of being left open until garbage collection.
with open(data_path, 'w') as data_file:
    json.dump(data_json, data_file)

verbose: False, log level: Level.ERROR



In [85]:
# Run arguments passed to ezkl.gen_settings below.
py_run_args = ezkl.PyRunArgs()

# Visibility of each part of the circuit: input private, output public,
# parameters "fixed".
# NOTE(review): the intent stated by the author is that the proof hides the
# user input and the model's parameters — confirm against the ezkl docs that
# "fixed" gives the desired guarantee for parameters.
py_run_args.input_visibility = "private"
py_run_args.output_visibility = "public"
py_run_args.param_visibility = "fixed"

# Generate the settings file from the exported ONNX model.
res = ezkl.gen_settings(model_path, settings_path, py_run_args=py_run_args)

assert res == True

# Compile the model with those settings into compiled_model_path.
res = ezkl.compile_circuit(model_path, compiled_model_path, settings_path)
assert res == True

# NOTE(review): the result of get_srs is overwritten by the next call without
# being checked, unlike the other steps.
res = ezkl.get_srs(srs_path, settings_path)
# Generate the witness from the input data file; success is only checked by
# the witness file existing on disk.
res = ezkl.gen_witness(data_path, compiled_model_path, witness_path)
assert os.path.isfile(witness_path)

In [86]:
# Run ezkl's setup step on the compiled circuit.
# Presumably this writes the verification key to vk_path and the proving key
# to pk_path (TODO confirm against the ezkl docs); the asserts below check
# that both files exist afterwards.



res = ezkl.setup(
        compiled_model_path,
        vk_path,
        pk_path,
        srs_path,
    )

assert res == True
assert os.path.isfile(vk_path)
assert os.path.isfile(pk_path)
# the settings file must also still be present
assert os.path.isfile(settings_path)

In [87]:
# Generate a proof from the witness, the compiled circuit, the pk file and
# the SRS file.


# file the proof is expected to be written to (checked by the assert below)
proof_path = os.path.join('test.pf')

res = ezkl.prove(
        witness_path,
        compiled_model_path,
        pk_path,
        proof_path,
        srs_path,
        "single",
    )

# show the value returned by ezkl.prove, then make sure the proof file exists
print(res)
assert os.path.isfile(proof_path)

{'instances': [[[8895048820941913435, 12590578388187041751, 10644156957292843428, 485414809211466155]]], 'proof': '00a5bab194cf112bb60ee87d8ce9e343a7c2660af0f8669f5efd075b8b0ed1f0175fa95b786e7d8048d57f3ad7d255dd004cb0d19467fb58dbd76c6884c960332f915fe155b59d96abc2cd3f27d21901c9262fa94cd159d04bb5464f6eb3e51f1289d610d01df1218e670a66c44f257f51774211567e026f462b7523a305bb191513f472809948a9e7907188332dfd26b1df37875ffab548b9faa449134732e61fe6852bc1b80a364ea7b590375b9fa211ba7ecf8cae629e05f8811a2612d6e306c814722fdb7410ac5c6b553f83d8f0b33cedce2a2c1afb3694f1c7a5851a8318c659a7a883340b81e0c61f490e775819b245c09bebec373d5c0bf8034b7c742ad63ea8b2c37e7db99e68e9a6bbf7f1e4f8568b29e48b46d75fb2936feb0fa904d4fc2b857c4d5967e10b595fc2efffb777aee411a283d66df7192323f5b3980f3bcb355851336132f37623d3bdedab1ccdb0dc9f6273a427c2aea358b7e5bd091e19cfcf97fe6adeedb5d243e0a504e7b3770a3222caf3961b9b1ee9e2c0ea25fba8819efe8572e66b3c469340734edc65e90217d70951034949c89bcdc00210c76dee6af6aaf9b717de84744d5aeda78052d188243211d3d0e

In [88]:
# VERIFY IT

res = ezkl.verify(
        proof_path,
        settings_path,
        vk_path,
        srs_path,
    )

assert res == True
print("verified")

verified
