# LR示例

以下示例仅用于说明LR的工作流程，目前仅支持单机测试。

PSI

In [None]:
import pandas as pd
import os
from PSI import PSICompany, PSIPartner

# The dataset directory is resolved relative to the parent of the current
# working directory (abspath('') is the cwd).
project_dir = os.path.dirname(os.path.abspath(''))
data_dir = os.path.join(project_dir, 'Datasets', 'data', 'data')

# Load each party's table from CSV.
host_data = pd.read_csv(os.path.join(data_dir, 'breast_hetero_host.csv'))
guest_data = pd.read_csv(os.path.join(data_dir, 'breast_hetero_guest.csv'))

# Split every table into its 'id' key (cast to str) and the remaining columns.
company_key = host_data['id'].astype(str)
company_features = host_data.drop(columns=['id'])
partner_key = guest_data['id'].astype(str)
partner_features = guest_data.drop(columns=['id'])

company = PSICompany(company_key, company_features)
partner = PSIPartner(partner_key, partner_features)

# Run the PSI message exchange. The call order below is kept exactly as is;
# the protocol classes live in the PSI module and may be stateful.
U_c, company_pk = company.exchange()
# NOTE(review): U_g is never used below and partner_pk is overwritten by the
# next call -- confirm whether this argument-less call is actually required.
U_g, partner_pk = partner.exchange()
E_c, U_p, partner_pk = partner.exchange(U_c, company_pk)
L, R_cI = company.compute_intersection(E_c, U_p, partner_pk)
R_pI = partner.output_shares(L)

# Keep only the first element of each party's result as that party's share.
company_share, partner_share = R_cI[0], R_pI[0]

此时 share 是 `np.ndarray` 类型。下面将其放入秘密共享设备 SPU 中。
目前尚未找到由 share 直接构造 `SPUObject` 的方法，暂时采用以下变通做法：先将双方的 share 分别移动到 SPU，再在 SPU 内部相加：

In [None]:
from common import MPCInitializer, sigmoid, softmax
import secretflow as sf
import numpy as np
import jax.numpy as jnp
mpc_init = MPCInitializer()
# NOTE: from here on `company` and `partner` are the handles exposed by
# MPCInitializer; they replace the PSICompany / PSIPartner objects that were
# bound to the same names earlier in this notebook.
company = mpc_init.company
partner = mpc_init.partner
coordinator = mpc_init.coordinator
spu = mpc_init.spu
# Assume the label y is held by company.
label_holder = company

def share2spu(X1: np.ndarray, X2: np.ndarray):
    """Combine two additive shares into a single value living on the SPU.

    Args:
        X1: the share held by Company.
        X2: the share held by Partner.

    Returns:
        The result of adding both shares inside the SPU (presumably an
        ``SPUObject`` -- confirm against the secretflow version in use).
    """
    # Both shares are converted to float32 JAX arrays first.
    company_part = jnp.array(X1, dtype=jnp.float32)
    partner_part = jnp.array(X2, dtype=jnp.float32)

    # Place each share on its owner's device, then move it to the SPU.
    company_on_spu = sf.to(company, company_part).to(spu)
    partner_on_spu = sf.to(partner, partner_part).to(spu)

    # Summing inside the SPU yields the secret-shared variable X.
    return spu(jnp.add)(company_on_spu, partner_on_spu)

划分训练集和测试集

In [None]:
from sklearn.model_selection import train_test_split


# Splitting both share arrays in one call keeps their rows aligned.
train_1, test_1, train_2, test_2 = train_test_split(company_share, partner_share)

# In every share the last column is taken as y; all other columns form X.
train_X1 = train_1[:, :-1]
train_y1 = train_1[:, -1]
train_X2 = train_2[:, :-1]
train_y2 = train_2[:, -1]
test_X1 = test_1[:, :-1]
test_y1 = test_1[:, -1]
test_X2 = test_2[:, :-1]
test_y2 = test_2[:, -1]

test_X = share2spu(test_X1, test_X2)
# In inference mode the model currently treats predictions as public, so the
# test-set y is reconstructed in the clear as well.
test_y = test_y1 + test_y2

训练集划分batch以实现批量训练

In [None]:
# train_X1 and train_X2 have the same number of samples.
num_samples, num_features = train_X1.shape

batch_size = 1024
Xs, ys = [], []
for start in range(0, num_samples, batch_size):
    # The final batch may be shorter than batch_size.
    stop = min(start + batch_size, num_samples)
    rows = slice(start, stop)
    # Features stay secret-shared on the SPU; labels are summed in the clear.
    X_batch = share2spu(train_X1[rows], train_X2[rows])
    y_batch = train_y1[rows] + train_y2[rows]
    Xs.append(X_batch)
    ys.append(y_batch)

训练指定的轮次

In [None]:
from LR import SSLR

# Create the SSLR model, passing the feature count taken from the training data.
model = SSLR(num_features)
# Train on the prepared batch lists for 10 epochs.
model.fit(Xs, ys, epochs=10)

或手动训练，并绘制测试集准确率曲线

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from tqdm import tqdm

n_iter = 20
accs = []  # test-set accuracy recorded after every epoch
max_acc = 0
for t in range(1, n_iter + 1):
    print(f"Epoch {t}")
    # The step size passed to backward() shrinks as 0.1 / t.
    learning_rate = 0.1 / t
    # zip() has no length, so give tqdm the batch count explicitly; otherwise
    # the progress bar cannot show a total or an ETA.
    for X, y in tqdm(zip(Xs, ys), total=len(Xs)):
        y_pred = model.forward(X)
        model.backward(X, y, y_pred, learning_rate)

    # Evaluate on the held-out set once per epoch.
    y_pred = model.predict(test_X)
    accuracy = accuracy_score(test_y, y_pred)
    # Only report epochs that improve on the best accuracy seen so far.
    if accuracy > max_acc:
        max_acc = accuracy
        print(f"Iteration {t}, Accuracy: {accuracy:.4f}")
    accs.append(accuracy)

# Plot accuracy against the 1-based epoch number; legend() is required for
# the "SSLR" label to actually appear on the figure.
plt.plot(range(1, n_iter + 1), accs, label="SSLR", color="blue")
plt.xlabel("Epoch")
plt.ylabel("Test accuracy")
plt.legend()
plt.show()