# XGBoost运行示例

以下示例仅用于说明XGBoost的工作流程，目前仅支持单机测试。

加载数据集

In [43]:
import pandas as pd
import os
from PSI import PSICompany, PSIPartner

# The datasets live under <parent of the working directory>/Datasets/data/data.
project_dir = os.path.dirname(os.path.abspath(''))
data_dir = os.path.join(project_dir, 'Datasets', 'data', 'data')
host_csv = os.path.join(data_dir, 'breast_hetero_host.csv')
guest_csv = os.path.join(data_dir, 'breast_hetero_guest.csv')

# The host file feeds the company side, the guest file feeds the partner side.
host_data = pd.read_csv(host_csv)
guest_data = pd.read_csv(guest_csv)

# 'id' is the join key (compared as strings); all remaining columns form the
# feature matrix of the respective party.
company_key = host_data['id'].astype(str)
partner_key = guest_data['id'].astype(str)
company_features = host_data.drop(columns=['id']).to_numpy()
partner_features = guest_data.drop(columns=['id']).to_numpy()

XGBoost需要在PSI开始之前预先分桶

In [None]:
from XGBoost import quantize_buckets
import numpy as np

# Value of `k` handed to quantize_buckets for both parties, kept in one place
# so the two calls cannot drift apart.
# NOTE(review): presumably the number of buckets per feature — confirm against
# XGBoost.quantize_buckets.
QUANTIZE_K = 50

# Bucket every company feature column.
Quantiles1, _, buckets_labels1 = quantize_buckets(company_features, k=QUANTIZE_K)
# The last partner column is the label y, so it is left out of the bucketing.
Quantiles2, _, buckets_labels2 = quantize_buckets(partner_features[:, :-1], k=QUANTIZE_K)

PSI

注意：此处将每条数据、每个属性的分桶标签作为`public_features`传入。

In [45]:
# The PSI endpoints get their own names. `company` / `partner` are bound to the
# MPC handles from MPCInitializer in a later cell; sharing one name for both
# kinds of object means re-running this cell would silently break every
# subsequent `sf.to(company, ...)` call.
psi_company = PSICompany(company_key, company_features, buckets_labels1)
psi_partner = PSIPartner(partner_key, partner_features, buckets_labels2)

# Message flow: company -> partner -> company -> partner.
U_c, company_pk = psi_company.exchange()
E_c, U_p, partner_pk = psi_partner.exchange(U_c, company_pk)
L, R_cI = psi_company.compute_intersection(E_c, U_p, partner_pk)
R_pI = psi_partner.output_shares(L)

# Element 0 of each party's result is its share; element 1 (the bucket labels)
# is read in the following cell.
company_share = R_cI[0]
partner_share = R_pI[0]

Computing masked company cipher
Computing masked partner cipher
Computing company shares
Computing partner shares


获取交集的分桶标签

In [46]:
# Element 1 of each PSI result carries that party's bucket labels for the intersection.
company_buckets_labels, partner_buckets_labels = R_cI[1], R_pI[1]
# Join both label blocks along axis 1: company columns first, partner columns after.
buckets_labels = np.concatenate([company_buckets_labels, partner_buckets_labels], axis=1)

此时两方的share都是`np.ndarray`类型。下面将其放入秘密共享设备spu中。
目前还没有找到由share直接构造`SPUObject`的方法，因此暂时采用以下变通做法：双方各自把自己的share送入spu，再在spu内部相加，得到秘密共享的变量：

In [47]:
from common import MPCInitializer, sigmoid, softmax
import secretflow as sf
import numpy as np
import jax.numpy as jnp
# Create the MPC environment once and expose its handles under short names.
mpc_init = MPCInitializer()
company = mpc_init.company
partner = mpc_init.partner
coordinator = mpc_init.coordinator
spu = mpc_init.spu
# Assume the label y is held by the company.
label_holder = company

def share2spu(X1 : np.ndarray, X2 : np.ndarray):
    """
    Combine two shares into a single secret-shared value inside the SPU.

    X1: the share held by the company.
    X2: the share held by the partner.

    Both shares are cast to float32, placed on their owner (`company` /
    `partner`), moved to `spu`, and added there. The result of that
    in-SPU addition is returned.
    """
    def _sum_shares(lhs, rhs):
        """Addition executed inside the SPU."""
        return lhs + rhs

    # Convert both inputs before anything is sent to a party.
    company_arr = jnp.array(X1, dtype=jnp.float32)
    partner_arr = jnp.array(X2, dtype=jnp.float32)
    # Each share goes to its owner first, then on to the SPU.
    company_on_spu = sf.to(company, company_arr).to(spu)
    partner_on_spu = sf.to(partner, partner_arr).to(spu)
    # Adding inside the SPU yields the secret-shared variable.
    return spu(_sum_shares)(company_on_spu, partner_on_spu)

划分训练集和测试集

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed: without it train_test_split reshuffles on every run, so the
# partition (and the accuracy reported at the end) would not be reproducible.
SPLIT_SEED = 42

# Both share matrices and the bucket labels are split in a single call so the
# same rows land in train/test for all three.
train_1, test_1, train_2, test_2, buckets_labels_train, buckets_labels_test = train_test_split(
    company_share, partner_share, buckets_labels, random_state=SPLIT_SEED)

# Last column of every share matrix -> label share (as a column vector);
# all preceding columns -> feature shares.
train_X1, train_y1 = train_1[:, :-1], train_1[:, -1].reshape(-1,1)
train_X2, train_y2 = train_2[:, :-1], train_2[:, -1].reshape(-1,1)
test_X1, test_y1 = test_1[:, :-1], test_1[:, -1].reshape(-1,1)
test_X2, test_y2 = test_2[:, :-1], test_2[:, -1].reshape(-1,1)

# Training features are combined inside the SPU.
train_X = share2spu(train_X1, train_X2)
from secretflow.data.ndarray import load, PartitionWay
# The test shares are written to .npy files because `load` is given one file
# path per party.
np.save("test_X1.npy", test_X1)
np.save("test_X2.npy", test_X2)
test_X = load({company: "test_X1.npy", partner: "test_X2.npy"})
# Training labels are combined inside the SPU and then moved to the label holder.
train_y = share2spu(train_y1, train_y2).to(label_holder)
# In inference mode the current model treats predictions as public, so the
# test labels are made public as well by adding the two shares in the clear.
test_y = test_y1 + test_y2

将分桶标签整理恢复为桶列表

桶列表中每个元素`bucket_j`是特征`j`的桶列表。`bucket_j`中的每个元素是一个一维数组，表示一个桶。桶里面保存每个元素在`X`中的索引。

In [49]:
from XGBoost import recover_buckets
# Rebuild the bucket lists for the training rows from their bucket labels.
# Per the notebook text: one list per feature j, and every bucket in it is a
# 1-D array holding indices into X.
buckets_train = recover_buckets(buckets_labels_train)

对分位点进行整理：需要分别整理为联邦数组形式和秘密共享形式。

In [50]:
# Federated form: each party's quantiles are saved to disk and loaded by path.
np.save("Quantiles1.npy", Quantiles1)
np.save("Quantiles2.npy", Quantiles2)
# NOTE(review): the federated copy is declared HORIZONTAL while the
# secret-shared copy below is concatenated along axis 1 — confirm that
# SSXGBoost expects exactly this combination.
FedQuantiles = load({company: "Quantiles1.npy", partner: "Quantiles2.npy"}, partition_way=PartitionWay.HORIZONTAL)

# Secret-shared form. The SPU copies get their own names so that Quantiles1 /
# Quantiles2 stay bound to the original arrays; rebinding them here would make
# a second run of this cell pass SPU objects to np.save / jnp.array.
ss_quantiles1 = sf.to(company, jnp.array(Quantiles1)).to(spu)
ss_quantiles2 = sf.to(partner, jnp.array(Quantiles2)).to(spu)

def concat(Quantiles1, Quantiles2):
    """
    Concatenate the two parties' quantile arrays along axis 1.
    """
    return jnp.concatenate((Quantiles1, Quantiles2), axis=1)

# Run the concatenation inside the SPU to obtain the secret-shared quantiles.
SSQuantiles = spu(concat)(ss_quantiles1, ss_quantiles2)

训练

In [None]:
from XGBoost import SSXGBoost
# NOTE(review): this cell relies on `company_features` from the data-loading
# cell. The NameError saved in this cell's output means it was last run in a
# kernel where that cell had not been executed — re-run the notebook top to
# bottom before trusting the result.
split_index = company_features.shape[1]  # split index: the boundary between company features and partner features
# First argument: number of feature columns in the training shares.
model = SSXGBoost(train_X1.shape[1], split_index)
model.fit(train_X, train_y, buckets_train, SSQuantiles, FedQuantiles)

# Predict on the test features loaded from the two parties' files.
y_pred = model.predict(test_X)

from sklearn.metrics import accuracy_score
# test_y is the plain sum of the two label shares of the test rows.
Accuracy = accuracy_score(test_y, y_pred)
print(f"Accuracy of SSXGBoost on breast dataset: {Accuracy:.4f}")

  from .autonotebook import tqdm as notebook_tqdm
2025-07-28 23:21:19,224	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-07-28 23:21:19,545	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
  return getattr(_posixsubprocess, original_name)(args, *other_args)
  return getattr(_posixsubprocess, original_name)(args, *other_args)
2025-07-28 23:21:43,058	ERROR services.py:1353 -- Failed to start the dashboard 
2025-07-28 23:21:43,059	ERROR services.py:1378 -- Error should be written to 'dashboard.log' or 'dashboard.err'. We are printing the last 20 lines for you. See 'https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#logging-directory-structure' to find where the log file is.
2025-07-28 23:21:43,060	ERROR services.py:1388 -- Couldn't read dashboard.log file. Err

NameError: name 'company_features' is not defined

