it works?, bug found health
Jaap-Meerhof committed Aug 15, 2023
1 parent 9838478 commit 5a3cbec
Showing 10 changed files with 56 additions and 34 deletions.
2 changes: 1 addition & 1 deletion run.sh
@@ -3,4 +3,4 @@ name="$1"
outputfile="output_${name}.txt"
errorfile="error_${name}.txt"

- rm $outputfile $errorfile & pip install . && bsub -J $name -q bio -n 12 -M 64G -R "rusage[mem=64G]" -o $outputfile -e $errorfile mpiexec -np 3 /home/meerhofj/.conda/envs/fedxgboost_mpi/bin/python /home/meerhofj/Documents/Federated_XGBoost_Python/tests/main.py
+ rm $outputfile $errorfile & pip install . && bsub -J $name -q bio -n 4 -M 64G -R "rusage[mem=64G]" -o $outputfile -e $errorfile mpiexec -np 3 /home/meerhofj/.conda/envs/fedxgboost_mpi/bin/python /home/meerhofj/Documents/Federated_XGBoost_Python/tests/main.py
6 changes: 3 additions & 3 deletions src/SFXGBoost/MemberShip.py
@@ -33,9 +33,9 @@ def f_random(D_Train_Shadow, D_Out_Shadow):
D_Train_Shadow (Tuple(np.ndarray)): holds X and y
D_Out_Shadow (Tuple(np.ndarray)): holds X and y
"""
- max_lenght = np.max((D_Train_Shadow[0].shape[0], D_Out_Shadow[0].shape[0])) # make it such that the concatenated list is 50/50 split
- X_Train_Shadow = D_Train_Shadow[0][:max_lenght, :]
- X_Out_Shadow = D_Out_Shadow[0][:max_lenght, :]
+ min_lenght = np.min((D_Train_Shadow[0].shape[0], D_Out_Shadow[0].shape[0])) # make it such that the concatenated list is 50/50 split
+ X_Train_Shadow = D_Train_Shadow[0][:min_lenght, :]
+ X_Out_Shadow = D_Out_Shadow[0][:min_lenght, :]

# add an extra column with 1 if Train_Shadow else 0
X_Train_Shadow = np.hstack( (X_Train_Shadow, np.ones((X_Train_Shadow.shape[0], 1))))
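Note: truncating both shadow sets to the shorter length is what actually yields the balanced (50/50) membership set the comment asks for; with np.max, NumPy slicing would silently return the shorter array unchanged and the labels would be imbalanced. A minimal standalone sketch of the construction, with illustrative names rather than the module's API:

```python
import numpy as np

def balanced_membership_set(X_in, X_out):
    # Truncate both to the shorter length so members (label 1)
    # and non-members (label 0) contribute equally.
    n = min(X_in.shape[0], X_out.shape[0])
    X = np.vstack((X_in[:n], X_out[:n]))
    y = np.concatenate((np.ones(n), np.zeros(n)))
    return X, y
```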
26 changes: 14 additions & 12 deletions src/SFXGBoost/Model.py
@@ -62,14 +62,17 @@ def boost(self, init_Probas):
####################
if rank == PARTY_ID.SERVER:
splits = comm.recv(source=1, tag=MSG_ID.INITIAL_QUANTILE_SPLITS)

+ self.logger.warning("splits:")
+ for i, split_k in enumerate(splits):
+     self.logger.warning(f"{self.fName[i]} = {split_k}")

for t in range(self.config.max_tree):
print("#####################")
# print("#####################")
print(f"> Busy with Tree {t} <")
print("#####################")
# print("#####################")

nodes = [[self.trees[c][t].root] for c in range(self.config.nClasses)]
- for l in range(self.config.max_depth):
+ for l in range(self.config.max_depth+1):
G = [[ [] for _ in range(len(nodes[c])) ] for c in range(self.config.nClasses)]
H = [[ [] for _ in range(len(nodes[c])) ] for c in range(self.config.nClasses)]
for Pi in range(1, comm.Get_size()):
@@ -87,7 +90,7 @@
G[c][i] += Gpi[c][i]
H[c][i] += Hpi[c][i]
splittingInfos = [[] for _ in range(self.config.nClasses)]
print("got gradients")
# print("got gradients")
for c in range(self.config.nClasses):
# def test(i):
# print(f"working on node c={c} i={i}")
@@ -99,7 +102,7 @@
# results = list(executor.map(test, range(len(nodes[c]))))
# print(results)
for i, n in enumerate(nodes[c]): #TODO multithread!
- split_cn = self.find_split(splits, G[c][i], H[c][i], l+1 == self.config.max_depth)
+ split_cn = self.find_split(splits, G[c][i], H[c][i], l == self.config.max_depth)
splittingInfos[c].append(split_cn)

for Pi in range(1, comm.Get_size()):
@@ -134,12 +137,11 @@ def boost(self, init_Probas):
y = self.y
G, H = None, None
for t in range(self.config.max_tree):
- print(y_pred)
G, H = getGradientHessians(np.argmax(y, axis=1), y_pred) # nUsers, nClasses
G, H = np.array(G).T, np.array(H).T # (nClasses, nUsers)
nodes = [[self.trees[c][t].root] for c in range(self.config.nClasses)]

- for l in range(self.config.max_depth):
+ for l in range(self.config.max_depth+1):
Gnodes = [[] for _ in range(self.config.nClasses)]
Hnodes = [[] for _ in range(self.config.nClasses)]

@@ -151,7 +153,7 @@
Gnodes[c].append(gcn)
Hnodes[c].append(hcn)
# send the gradients for every class's tree, the different nodes that have to be updated in that tree and the
print(f"sending gradients as rank {rank} on level {l}")
# print(f"sending gradients as rank {rank} on level {l}")
comm.send((Gnodes,Hnodes), PARTY_ID.SERVER, tag=MSG_ID.RESPONSE_GRADIENTS)
splits = comm.recv(source=PARTY_ID.SERVER, tag=MSG_ID.SPLIT_UPDATE)
nodes = self.update_trees(nodes, splits, l) # also update Instances
@@ -235,8 +237,8 @@ def find_split(self, splits, gradient, hessian, is_last_level):
Hl += hessian[k][v]
Gr = G-Gl
Hr = H-Hl
- score = L(G, H, Gl, Gr, Hl, Hr, self.config.lam, self.config.gamma)
- if score > maxScore:
+ score = L(G, H, Gl, Gr, Hl, Hr, self.config.lam, self.config.alpha)
+ if score > maxScore and Hl > 1 and Hr > 1: # TODO 1 = min_child_weight
value = splits[k][v]
feature = k
featureName = self.fName[k]
@@ -258,7 +260,7 @@ def update_trees(self, last_level_nodes:List[List[FLTreeNode]], splits:List[List
for c in range(self.config.nClasses):
for n, node in enumerate(last_level_nodes[c]):
splitInfo = splits[c][n]
- if splitInfo.isValid and depth+1 < self.config.max_depth:
+ if splitInfo.isValid and depth < self.config.max_depth:
node.splittingInfo = splitInfo
node.leftBranch = FLTreeNode()
node.rightBranch = FLTreeNode()
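Note: the `range(self.config.max_depth+1)` bound together with the `l == self.config.max_depth` flag follows the usual level-wise convention: the root sits at depth 0, splits are proposed through depth max_depth-1, and one extra pass only converts the remaining frontier into leaves. A simplified single-class sketch of that control flow (illustrative names, not the repo's API):

```python
class Node:
    def __init__(self):
        self.split, self.left, self.right, self.weight = None, None, None, None

def grow_level_wise(root, max_depth, find_split, leaf_weight):
    # find_split(node) -> split info or None; leaf_weight(node) -> float
    frontier = [root]
    for l in range(max_depth + 1):
        is_last_level = (l == max_depth)
        next_frontier = []
        for node in frontier:
            split = None if is_last_level else find_split(node)
            if split is None:            # depth exhausted or no valid split: leaf
                node.weight = leaf_weight(node)
            else:                        # internal node: split and descend
                node.split = split
                node.left, node.right = Node(), Node()
                next_frontier += [node.left, node.right]
        frontier = next_frontier
```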
24 changes: 16 additions & 8 deletions src/SFXGBoost/common/XGBoostcommon.py
@@ -4,7 +4,21 @@
from SFXGBoost.common.BasicTypes import Direction
from SFXGBoost.config import rank

- L = lambda G,H, GL, GR, HL, HR, lamb, gamma: 1/2 * ((GL*GL / (HL + lamb)) + (GR*GR / (HR + lamb)) - (G*G / (H + lamb))) - gamma
+ def ThresholdL1(g, alpha):
+     if g > alpha:
+         return g - alpha
+     elif g < -alpha:
+         return g + alpha
+     else:
+         return 0.0
+
+ # L = lambda G,H, GL, GR, HL, HR, lamb, gamma: 1/2 * ((ThresholdL1(GL*GL) / (HL + lamb)) + (ThresholdL1(GR*GR) / (HR + lamb)) - (ThresholdL1(G*G) / (H + lamb))) - gamma
+ L = lambda G,H, GL, GR, HL, HR, lamb, alpha: ((ThresholdL1(GL*GL, alpha) / (HL + lamb)) + (ThresholdL1(GR*GR, alpha) / (HR + lamb)) - (ThresholdL1(G*G, alpha) / (H + lamb)))
+
+ # L = lambda G,H, GL, GR, HL, HR, lamb, gamma, alpha: 1/2 * (ThresholdL1(GL, alpha) / (HL + lamb)) + (GR*GR / (HR + lamb)) - (G*G / (H + lamb))) - gamma

# def computeSplitScore(Gl, Gr)

class PARTY_ID:
ACTIVE_PARTY = 1
@@ -82,13 +96,7 @@ def weights_to_probas(y_pred):
y_pred[rowid, :] = np.exp(row-wmax) / wsum
return y_pred

- def ThresholdL1(g, alpha):
-     if g > alpha:
-         return g - alpha
-     elif g < -alpha:
-         return g + alpha
-     else:
-         return 0.0


def compute_splitting_score(SM, GVec, HVec, lamb, gamma):
G = sum(GVec)
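Note: the new L drops the 1/2 factor and the -gamma pruning term of the textbook XGBoost gain and instead soft-thresholds each squared gradient sum by alpha (for comparison, stock XGBoost applies the threshold to the gradient sum before squaring, roughly T_alpha(G)^2 / (H + lambda)). A self-contained numeric check of the formula as written, assuming the package is importable (run.sh does `pip install .`):

```python
from SFXGBoost.common.XGBoostcommon import ThresholdL1, L

# Toy numbers: a split that separates positive from negative gradients scores well.
G, H   = 0.0, 8.0          # parent gradient/hessian sums
GL, HL = 4.0, 4.0          # left child
GR, HR = G - GL, H - HL    # right child: -4.0, 4.0
lam, alpha = 1.0, 0.0      # alpha=0 makes ThresholdL1 the identity
print(L(G, H, GL, GR, HL, HR, lam, alpha))  # 16/5 + 16/5 - 0 = 6.4
```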
3 changes: 2 additions & 1 deletion src/SFXGBoost/config.py
@@ -45,7 +45,8 @@ def __init__(self, config:Config):
logger = logging.getLogger()
day = date.today().strftime("%b-%d-%Y")

- curTime = round(time.time())
+ # curTime = round(time.time())
+ curTime = time.strftime("%H:%M", time.localtime())

logName = 'Log/{}/{}/{}_{}/Rank_{}.log'.format(config.nameTest, str(day), str(curTime), str(config.model), str(rank))
os.makedirs(os.path.dirname(logName), exist_ok=True)
2 changes: 1 addition & 1 deletion src/SFXGBoost/data_structure/treestructure.py
@@ -118,6 +118,6 @@ def compute_leaf_param(gVec, hVec, lamb, alpha):
hI = np.sum(hVec)
# print(f"gI = {gI}")
# print(f"hI = {hI}")
- weight = -1.0 * ThresholdL1(gI / (hI + lamb), alpha)
+ weight = -1.0 * ThresholdL1(gI, alpha) / (hI + lamb)
score = 1/2 * weight * gI
return weight, score
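Note: the corrected line matches the usual L1-regularized leaf weight, w* = -T_alpha(sum g) / (sum h + lambda), where the threshold shrinks the gradient sum before the division rather than the finished ratio (as in XGBoost's CalcWeight). A tiny sketch reusing the repo's ThresholdL1:

```python
import numpy as np
from SFXGBoost.common.XGBoostcommon import ThresholdL1

def leaf_weight(gVec, hVec, lamb, alpha):
    G, H = np.sum(gVec), np.sum(hVec)
    return -ThresholdL1(G, alpha) / (H + lamb)  # shrink G first, then divide

# e.g. G=3.0, H=2.0, lamb=1.0, alpha=0.5  ->  -(3.0-0.5)/(2.0+1.0) = -0.8333...
```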
8 changes: 4 additions & 4 deletions src/SFXGBoost/dataset/datasetRetrieval.py
@@ -136,7 +136,7 @@ def getHealthcare(paths): # https://www.kaggle.com/datasets/nehaprabhavalkar/av-
train_size = 100_000
test_size = 30_000
random_state = 420
- shadow_size = 150_000
+ shadow_size = 150_000

def returnfunc():
train = check_mul_paths_csv("AV_HealthcareAnalyticsII/train_data", paths)
Expand All @@ -145,7 +145,7 @@ def returnfunc():
sample = check_mul_paths_csv("AV_HealthcareAnalyticsII/sample_sub", paths)
strings = ['Hospital_type_code', 'Hospital_region_code', 'Department', 'Ward_Type', 'Ward_Facility_Code', 'Type of Admission', 'Severity of Illness', 'Age', 'Stay']
train = train.dropna()


# def complex_factorize(df, col):
# ser = pd.Series(df[col].unique())
Expand All @@ -159,8 +159,8 @@ def returnfunc():

# train[strings] = train[strings].apply(lambda x: pd.factorize(x)[0])
# train = train.apply(lambda x: pd.factorize(x)[0])
- fName = train.columns.tolist()[1:]
- X = train.values[:, 1:]
+ fName = train.columns.tolist()[1:17]
+ X = train.values[:, 1:17]
y = makeOneHot(y = train.values[:, 17].reshape(-1,1))
X_train = X[:train_size]
y_train = y[:train_size]
2 changes: 1 addition & 1 deletion src/SFXGBoost/loss/softmax.py
@@ -9,7 +9,7 @@ def getGradientHessians(y, y_pred, case_weight=None):
grad = np.zeros((y_pred.shape), dtype=float) # for multi-class
hess = np.zeros((y_pred.shape), dtype=float) # for multi-class
for rowid in range(y_pred.shape[0]):
- wmax = max(y_pred[rowid]) # line 100 multiclass_obj.cu
+ wmax = max(y_pred[rowid]) # line 10s0 multiclass_obj.cu
wsum =0.0
for i in y_pred[rowid] : wsum += np.exp(i - wmax)
for c in range(y_pred.shape[1]):
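Note: the surrounding loop is the standard multiclass softmax objective the comment points at (XGBoost's multiclass_obj.cu): shift by the row max for numerical stability, turn margins into probabilities, then grad_c = p_c - 1{c == y} and, in XGBoost's convention, hess_c = 2 * p_c * (1 - p_c). A vectorized sketch assuming integer labels:

```python
import numpy as np

def softmax_grad_hess(y, y_pred):
    # y: (nUsers,) integer labels; y_pred: (nUsers, nClasses) raw margins
    z = y_pred - y_pred.max(axis=1, keepdims=True)  # max-shift for stability
    p = np.exp(z) / np.exp(z).sum(axis=1, keepdims=True)
    grad = p - np.eye(y_pred.shape[1])[y]           # p_c - 1{c == y}
    hess = 2.0 * p * (1.0 - p)
    return grad, hess
```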
2 changes: 1 addition & 1 deletion src/SFXGBoost/view/TreeRender.py
@@ -8,7 +8,7 @@
class FLVisNode():
def __init__(self, logger, FLnode: FLTreeNode):
self.key = FLnode.owner
- self.key = str(FLnode.splittingInfo.featureName) + " " + str(FLnode.splittingInfo.splitValue)
+ self.key = str(FLnode.splittingInfo.featureName) + " " + f"{FLnode.splittingInfo.splitValue:.{2}f}"
self.weight = FLnode.weight
self.right = FLVisNode(logger, FLnode.rightBranch) if(FLnode.rightBranch is not None) else None
self.left = FLVisNode(logger, FLnode.leftBranch) if(FLnode.leftBranch is not None) else None
15 changes: 13 additions & 2 deletions tests/main.py
@@ -67,6 +67,8 @@ def test_global(config:Config, logger:Logger, model: SFXGBoostClassifierBase, ge
learning_rate=0.3, n_estimators=config.max_tree, gamma=config.gamma, reg_alpha=0, reg_lambda=config.lam)
xgboostmodel.fit(X_train, np.argmax(y_train, axis=1))
from sklearn.metrics import accuracy_score
+ print(np.shape(X_test))

y_pred_xgb = xgboostmodel.predict(X_test)
print(f"Accuracy xgboost normal = {accuracy_score(np.argmax(y_test, axis=1), y_pred_xgb)}")

@@ -102,12 +104,21 @@ def main():
shadow_model = SFXGBoost(config, logger)
# shadow_model = xgb.XGBClassifier(ax_depth=config.max_depth, objective="multi:softmax", tree_method="approx",
# learning_rate=0.3, n_estimators=config.max_tree, gamma=config.gamma, reg_alpha=0, reg_lambda=config.lam)
- attack_model = xgb.XGBClassifier(tree_method="exact", objective='binary:logistic', max_depth=10, n_estimators=30, learning_rate=0.3)
+ # attack_model = xgb.XGBClassifier(tree_method="exact", objective='binary:logistic', max_depth=10, n_estimators=30, learning_rate=0.3)
# attack_model = DecisionTreeClassifier(max_depth=6,max_leaf_nodes=10)
# attack_model = MLPClassifier(hidden_layer_sizes=(10,10), activation='relu', solver='adam', learning_rate_init=0.01, max_iter=2000)
+ attack_model = MLPClassifier(hidden_layer_sizes=(20,11,11), activation='relu', solver='adam', learning_rate_init=0.01, max_iter=2000)

# if isSaved(config.nameTest, config):
# shadow_model = retrieve("model", config)
+ # TODO target_model = train_model()
+ # TODO shadow_model = train_model()
+ # that way I can save the model reuse it and apply different attack_models on it.
+ # TODO SFXGBoost().getGradients.

# X_train, y_train, X_test, y_test, fName, X_shadow, y_shadow = getDataBase(config.dataset, POSSIBLE_PATHS)()
# log_distribution(logger, X_train, y_train, y_test)
# model.fit(X_train, y_train, fName)

X, y, y_pred_org, y_test, model, X_shadow, y_shadow, fName = test_global(config, logger, model, getDataBase(config.dataset, POSSIBLE_PATHS))

preform_attack_centralised(config, (X_shadow, y_shadow), model, shadow_model, attack_model, X, y, fName)
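Note: for orientation, main() wires up the classic shadow-model membership-inference attack (Shokri et al.): a shadow model mimics the target on held-out data, its outputs on member vs. non-member rows train the attack model, and the attack model then judges membership from the target's outputs. A heavily hedged outline, since preform_attack_centralised's internals aren't shown in this diff (hypothetical names and methods):

```python
import numpy as np

# Hypothetical sketch of the attack flow, not the repo's exact API:
shadow_model.fit(X_shadow_in, y_shadow_in, fName)    # D_Train_Shadow
p_in  = shadow_model.predict_proba(X_shadow_in)      # member outputs
p_out = shadow_model.predict_proba(X_shadow_out)     # non-member outputs
X_att = np.vstack((p_in, p_out))
y_att = np.concatenate((np.ones(len(p_in)), np.zeros(len(p_out))))
attack_model.fit(X_att, y_att)                       # the MLPClassifier above
membership_guess = attack_model.predict(model.predict_proba(X))
```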
