it works?, bug found health
Jaap-Meerhof committed Aug 15, 2023
1 parent 9838478 commit 5a3cbec
Showing 10 changed files with 56 additions and 34 deletions.
2 changes: 1 addition & 1 deletion run.sh
@@ -3,4 +3,4 @@ name="$1"
outputfile="output_${name}.txt"
errorfile="error_${name}.txt"

- rm $outputfile $errorfile & pip install . && bsub -J $name -q bio -n 12 -M 64G -R "rusage[mem=64G]" -o $outputfile -e $errorfile mpiexec -np 3 /home/meerhofj/.conda/envs/fedxgboost_mpi/bin/python /home/meerhofj/Documents/Federated_XGBoost_Python/tests/main.py
+ rm $outputfile $errorfile & pip install . && bsub -J $name -q bio -n 4 -M 64G -R "rusage[mem=64G]" -o $outputfile -e $errorfile mpiexec -np 3 /home/meerhofj/.conda/envs/fedxgboost_mpi/bin/python /home/meerhofj/Documents/Federated_XGBoost_Python/tests/main.py
6 changes: 3 additions & 3 deletions src/SFXGBoost/MemberShip.py
@@ -33,9 +33,9 @@ def f_random(D_Train_Shadow, D_Out_Shadow):
D_Train_Shadow (Tuple(np.ndarray)): holds X and y
D_Out_Shadow (Tuple(np.ndarray)): holds X and y
"""
- max_lenght = np.max((D_Train_Shadow[0].shape[0], D_Out_Shadow[0].shape[0])) # make it such that the concatenated list is 50/50 split
- X_Train_Shadow = D_Train_Shadow[0][:max_lenght, :]
- X_Out_Shadow = D_Out_Shadow[0][:max_lenght, :]
+ min_lenght = np.min((D_Train_Shadow[0].shape[0], D_Out_Shadow[0].shape[0])) # make it such that the concatenated list is 50/50 split
+ X_Train_Shadow = D_Train_Shadow[0][:min_lenght, :]
+ X_Out_Shadow = D_Out_Shadow[0][:min_lenght, :]

# add an extra column with 1 if Train_Shadow else 0
X_Train_Shadow = np.hstack( (X_Train_Shadow, np.ones((X_Train_Shadow.shape[0], 1))))
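Note: truncating both shadow sets to the shorter length is what actually yields the balanced (50/50) membership set the comment asks for; with np.max, NumPy slicing would silently return the shorter array unchanged and the labels would be imbalanced. A minimal standalone sketch of the construction, with illustrative names rather than the module's API:

```python
import numpy as np

def balanced_membership_set(X_in, X_out):
    # Truncate both to the shorter length so members (label 1)
    # and non-members (label 0) contribute equally.
    n = min(X_in.shape[0], X_out.shape[0])
    X = np.vstack((X_in[:n], X_out[:n]))
    y = np.concatenate((np.ones(n), np.zeros(n)))
    return X, y
```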
26 changes: 14 additions & 12 deletions src/SFXGBoost/Model.py
@@ -62,14 +62,17 @@ def boost(self, init_Probas):
####################
if rank == PARTY_ID.SERVER:
splits = comm.recv(source=1, tag=MSG_ID.INITIAL_QUANTILE_SPLITS)

+ self.logger.warning("splits:")
+ for i, split_k in enumerate(splits):
+     self.logger.warning(f"{self.fName[i]} = {split_k}")

for t in range(self.config.max_tree):
print("#####################")
# print("#####################")
print(f"> Busy with Tree {t} <")
print("#####################")
# print("#####################")

nodes = [[self.trees[c][t].root] for c in range(self.config.nClasses)]
- for l in range(self.config.max_depth):
+ for l in range(self.config.max_depth+1):
G = [[ [] for _ in range(len(nodes[c])) ] for c in range(self.config.nClasses)]
H = [[ [] for _ in range(len(nodes[c])) ] for c in range(self.config.nClasses)]
for Pi in range(1, comm.Get_size()):
@@ -87,7 +90,7 @@
G[c][i] += Gpi[c][i]
H[c][i] += Hpi[c][i]
splittingInfos = [[] for _ in range(self.config.nClasses)]
print("got gradients")
# print("got gradients")
for c in range(self.config.nClasses):
# def test(i):
# print(f"working on node c={c} i={i}")
@@ -99,7 +102,7 @@
# results = list(executor.map(test, range(len(nodes[c]))))
# print(results)
for i, n in enumerate(nodes[c]): #TODO multithread!
- split_cn = self.find_split(splits, G[c][i], H[c][i], l+1 == self.config.max_depth)
+ split_cn = self.find_split(splits, G[c][i], H[c][i], l == self.config.max_depth)
splittingInfos[c].append(split_cn)

for Pi in range(1, comm.Get_size()):
@@ -134,12 +137,11 @@ def boost(self, init_Probas):
y = self.y
G, H = None, None
for t in range(self.config.max_tree):
- print(y_pred)
G, H = getGradientHessians(np.argmax(y, axis=1), y_pred) # nUsers, nClasses
G, H = np.array(G).T, np.array(H).T # (nClasses, nUsers)
nodes = [[self.trees[c][t].root] for c in range(self.config.nClasses)]

- for l in range(self.config.max_depth):
+ for l in range(self.config.max_depth+1):
Gnodes = [[] for _ in range(self.config.nClasses)]
Hnodes = [[] for _ in range(self.config.nClasses)]

@@ -151,7 +153,7 @@
Gnodes[c].append(gcn)
Hnodes[c].append(hcn)
# send the gradients for every class's tree, the different nodes that have to be updated in that tree and the
print(f"sending gradients as rank {rank} on level {l}")
# print(f"sending gradients as rank {rank} on level {l}")
comm.send((Gnodes,Hnodes), PARTY_ID.SERVER, tag=MSG_ID.RESPONSE_GRADIENTS)
splits = comm.recv(source=PARTY_ID.SERVER, tag=MSG_ID.SPLIT_UPDATE)
nodes = self.update_trees(nodes, splits, l) # also update Instances
@@ -235,8 +237,8 @@ def find_split(self, splits, gradient, hessian, is_last_level):
Hl += hessian[k][v]
Gr = G-Gl
Hr = H-Hl
- score = L(G, H, Gl, Gr, Hl, Hr, self.config.lam, self.config.gamma)
- if score > maxScore:
+ score = L(G, H, Gl, Gr, Hl, Hr, self.config.lam, self.config.alpha)
+ if score > maxScore and Hl > 1 and Hr > 1: # TODO 1 = min_child_weight
value = splits[k][v]
feature = k
featureName = self.fName[k]
@@ -258,7 +260,7 @@ def update_trees(self, last_level_nodes:List[List[FLTreeNode]], splits:List[List
for c in range(self.config.nClasses):
for n, node in enumerate(last_level_nodes[c]):
splitInfo = splits[c][n]
- if splitInfo.isValid and depth+1 < self.config.max_depth:
+ if splitInfo.isValid and depth < self.config.max_depth:
node.splittingInfo = splitInfo
node.leftBranch = FLTreeNode()
node.rightBranch = FLTreeNode()
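Note: the `range(self.config.max_depth+1)` bound together with the `l == self.config.max_depth` flag follows the usual level-wise convention: the root sits at depth 0, splits are proposed through depth max_depth-1, and one extra pass only converts the remaining frontier into leaves. A simplified single-class sketch of that control flow (illustrative names, not the repo's API):

```python
class Node:
    def __init__(self):
        self.split, self.left, self.right, self.weight = None, None, None, None

def grow_level_wise(root, max_depth, find_split, leaf_weight):
    # find_split(node) -> split info or None; leaf_weight(node) -> float
    frontier = [root]
    for l in range(max_depth + 1):
        is_last_level = (l == max_depth)
        next_frontier = []
        for node in frontier:
            split = None if is_last_level else find_split(node)
            if split is None:            # depth exhausted or no valid split: leaf
                node.weight = leaf_weight(node)
            else:                        # internal node: split and descend
                node.split = split
                node.left, node.right = Node(), Node()
                next_frontier += [node.left, node.right]
        frontier = next_frontier
```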
24 changes: 16 additions & 8 deletions src/SFXGBoost/common/XGBoostcommon.py
@@ -4,7 +4,21 @@
from SFXGBoost.common.BasicTypes import Direction
from SFXGBoost.config import rank

- L = lambda G,H, GL, GR, HL, HR, lamb, gamma: 1/2 * ((GL*GL / (HL + lamb)) + (GR*GR / (HR + lamb)) - (G*G / (H + lamb))) - gamma
+ def ThresholdL1(g, alpha):
+     if g > alpha:
+         return g - alpha
+     elif g < -alpha:
+         return g + alpha
+     else:
+         return 0.0
+
+ # L = lambda G,H, GL, GR, HL, HR, lamb, gamma: 1/2 * ((ThresholdL1(GL*GL) / (HL + lamb)) + (ThresholdL1(GR*GR) / (HR + lamb)) - (ThresholdL1(G*G) / (H + lamb))) - gamma
+ L = lambda G,H, GL, GR, HL, HR, lamb, alpha: ((ThresholdL1(GL*GL, alpha) / (HL + lamb)) + (ThresholdL1(GR*GR, alpha) / (HR + lamb)) - (ThresholdL1(G*G, alpha) / (H + lamb)))
+
+ # L = lambda G,H, GL, GR, HL, HR, lamb, gamma, alpha: 1/2 * (ThresholdL1(GL, alpha) / (HL + lamb)) + (GR*GR / (HR + lamb)) - (G*G / (H + lamb))) - gamma

# def computeSplitScore(Gl, Gr)

class PARTY_ID:
ACTIVE_PARTY = 1
@@ -82,13 +96,7 @@ def weights_to_probas(y_pred):
y_pred[rowid, :] = np.exp(row-wmax) / wsum
return y_pred

- def ThresholdL1(g, alpha):
-     if g > alpha:
-         return g - alpha
-     elif g < -alpha:
-         return g + alpha
-     else:
-         return 0.0


def compute_splitting_score(SM, GVec, HVec, lamb, gamma):
G = sum(GVec)
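Note: the new L drops the 1/2 factor and the -gamma pruning term of the textbook XGBoost gain and instead soft-thresholds each squared gradient sum by alpha (for comparison, stock XGBoost applies the threshold to the gradient sum before squaring, roughly T_alpha(G)^2 / (H + lambda)). A self-contained numeric check of the formula as written, assuming the package is importable (run.sh does `pip install .`):

```python
from SFXGBoost.common.XGBoostcommon import ThresholdL1, L

# Toy numbers: a split that separates positive from negative gradients scores well.
G, H   = 0.0, 8.0          # parent gradient/hessian sums
GL, HL = 4.0, 4.0          # left child
GR, HR = G - GL, H - HL    # right child: -4.0, 4.0
lam, alpha = 1.0, 0.0      # alpha=0 makes ThresholdL1 the identity
print(L(G, H, GL, GR, HL, HR, lam, alpha))  # 16/5 + 16/5 - 0 = 6.4
```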
3 changes: 2 additions & 1 deletion src/SFXGBoost/config.py
@@ -45,7 +45,8 @@ def __init__(self, config:Config):
logger = logging.getLogger()
day = date.today().strftime("%b-%d-%Y")

- curTime = round(time.time())
+ # curTime = round(time.time())
+ curTime = time.strftime("%H:%M", time.localtime())

logName = 'Log/{}/{}/{}_{}/Rank_{}.log'.format(config.nameTest, str(day), str(curTime), str(config.model), str(rank))
os.makedirs(os.path.dirname(logName), exist_ok=True)
2 changes: 1 addition & 1 deletion src/SFXGBoost/data_structure/treestructure.py
@@ -118,6 +118,6 @@ def compute_leaf_param(gVec, hVec, lamb, alpha):
hI = np.sum(hVec)
# print(f"gI = {gI}")
# print(f"hI = {hI}")
- weight = -1.0 * ThresholdL1(gI / (hI + lamb), alpha)
+ weight = -1.0 * ThresholdL1(gI, alpha) / (hI + lamb)
score = 1/2 * weight * gI
return weight, score
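Note: the corrected line matches the usual L1-regularized leaf weight, w* = -T_alpha(sum g) / (sum h + lambda), where the threshold shrinks the gradient sum before the division rather than the finished ratio (as in XGBoost's CalcWeight). A tiny sketch reusing the repo's ThresholdL1:

```python
import numpy as np
from SFXGBoost.common.XGBoostcommon import ThresholdL1

def leaf_weight(gVec, hVec, lamb, alpha):
    G, H = np.sum(gVec), np.sum(hVec)
    return -ThresholdL1(G, alpha) / (H + lamb)  # shrink G first, then divide

# e.g. G=3.0, H=2.0, lamb=1.0, alpha=0.5  ->  -(3.0-0.5)/(2.0+1.0) = -0.8333...
```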
8 changes: 4 additions & 4 deletions src/SFXGBoost/dataset/datasetRetrieval.py
@@ -136,7 +136,7 @@ def getHealthcare(paths): # https://www.kaggle.com/datasets/nehaprabhavalkar/av-
train_size = 100_000
test_size = 30_000
random_state = 420
- shadow_size = 150_000
+ shadow_size = 150_000

def returnfunc():
train = check_mul_paths_csv("AV_HealthcareAnalyticsII/train_data", paths)
Expand All @@ -145,7 +145,7 @@ def returnfunc():
sample = check_mul_paths_csv("AV_HealthcareAnalyticsII/sample_sub", paths)
strings = ['Hospital_type_code', 'Hospital_region_code', 'Department', 'Ward_Type', 'Ward_Facility_Code', 'Type of Admission', 'Severity of Illness', 'Age', 'Stay']
train = train.dropna()


# def complex_factorize(df, col):
# ser = pd.Series(df[col].unique())
Expand All @@ -159,8 +159,8 @@ def returnfunc():

# train[strings] = train[strings].apply(lambda x: pd.factorize(x)[0])
# train = train.apply(lambda x: pd.factorize(x)[0])
- fName = train.columns.tolist()[1:]
- X = train.values[:, 1:]
+ fName = train.columns.tolist()[1:17]
+ X = train.values[:, 1:17]
y = makeOneHot(y = train.values[:, 17].reshape(-1,1))
X_train = X[:train_size]
y_train = y[:train_size]
2 changes: 1 addition & 1 deletion src/SFXGBoost/loss/softmax.py
@@ -9,7 +9,7 @@ def getGradientHessians(y, y_pred, case_weight=None):
grad = np.zeros((y_pred.shape), dtype=float) # for multi-class
hess = np.zeros((y_pred.shape), dtype=float) # for multi-class
for rowid in range(y_pred.shape[0]):
- wmax = max(y_pred[rowid]) # line 100 multiclass_obj.cu
+ wmax = max(y_pred[rowid]) # line 10s0 multiclass_obj.cu
wsum =0.0
for i in y_pred[rowid] : wsum += np.exp(i - wmax)
for c in range(y_pred.shape[1]):
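Note: the surrounding loop is the standard multiclass softmax objective the comment points at (XGBoost's multiclass_obj.cu): shift by the row max for numerical stability, turn margins into probabilities, then grad_c = p_c - 1{c == y} and, in XGBoost's convention, hess_c = 2 * p_c * (1 - p_c). A vectorized sketch assuming integer labels:

```python
import numpy as np

def softmax_grad_hess(y, y_pred):
    # y: (nUsers,) integer labels; y_pred: (nUsers, nClasses) raw margins
    z = y_pred - y_pred.max(axis=1, keepdims=True)  # max-shift for stability
    p = np.exp(z) / np.exp(z).sum(axis=1, keepdims=True)
    grad = p - np.eye(y_pred.shape[1])[y]           # p_c - 1{c == y}
    hess = 2.0 * p * (1.0 - p)
    return grad, hess
```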
2 changes: 1 addition & 1 deletion src/SFXGBoost/view/TreeRender.py
@@ -8,7 +8,7 @@
class FLVisNode():
def __init__(self, logger, FLnode: FLTreeNode):
self.key = FLnode.owner
- self.key = str(FLnode.splittingInfo.featureName) + " " + str(FLnode.splittingInfo.splitValue)
+ self.key = str(FLnode.splittingInfo.featureName) + " " + f"{FLnode.splittingInfo.splitValue:.{2}f}"
self.weight = FLnode.weight
self.right = FLVisNode(logger, FLnode.rightBranch) if(FLnode.rightBranch is not None) else None
self.left = FLVisNode(logger, FLnode.leftBranch) if(FLnode.leftBranch is not None) else None
15 changes: 13 additions & 2 deletions tests/main.py
@@ -67,6 +67,8 @@ def test_global(config:Config, logger:Logger, model: SFXGBoostClassifierBase, ge
learning_rate=0.3, n_estimators=config.max_tree, gamma=config.gamma, reg_alpha=0, reg_lambda=config.lam)
xgboostmodel.fit(X_train, np.argmax(y_train, axis=1))
from sklearn.metrics import accuracy_score
+ print(np.shape(X_test))

y_pred_xgb = xgboostmodel.predict(X_test)
print(f"Accuracy xgboost normal = {accuracy_score(np.argmax(y_test, axis=1), y_pred_xgb)}")

@@ -102,12 +104,21 @@ def main():
shadow_model = SFXGBoost(config, logger)
# shadow_model = xgb.XGBClassifier(ax_depth=config.max_depth, objective="multi:softmax", tree_method="approx",
# learning_rate=0.3, n_estimators=config.max_tree, gamma=config.gamma, reg_alpha=0, reg_lambda=config.lam)
- attack_model = xgb.XGBClassifier(tree_method="exact", objective='binary:logistic', max_depth=10, n_estimators=30, learning_rate=0.3)
+ # attack_model = xgb.XGBClassifier(tree_method="exact", objective='binary:logistic', max_depth=10, n_estimators=30, learning_rate=0.3)
# attack_model = DecisionTreeClassifier(max_depth=6,max_leaf_nodes=10)
# attack_model = MLPClassifier(hidden_layer_sizes=(10,10), activation='relu', solver='adam', learning_rate_init=0.01, max_iter=2000)
+ attack_model = MLPClassifier(hidden_layer_sizes=(20,11,11), activation='relu', solver='adam', learning_rate_init=0.01, max_iter=2000)

# if isSaved(config.nameTest, config):
# shadow_model = retrieve("model", config)
+ # TODO target_model = train_model()
+ # TODO shadow_model = train_model()
+ # that way I can save the model reuse it and apply different attack_models on it.
+ # TODO SFXGBoost().getGradients.

# X_train, y_train, X_test, y_test, fName, X_shadow, y_shadow = getDataBase(config.dataset, POSSIBLE_PATHS)()
# log_distribution(logger, X_train, y_train, y_test)
# model.fit(X_train, y_train, fName)

X, y, y_pred_org, y_test, model, X_shadow, y_shadow, fName = test_global(config, logger, model, getDataBase(config.dataset, POSSIBLE_PATHS))

preform_attack_centralised(config, (X_shadow, y_shadow), model, shadow_model, attack_model, X, y, fName)
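Note: for orientation, main() wires up the classic shadow-model membership-inference attack (Shokri et al.): a shadow model mimics the target on held-out data, its outputs on member vs. non-member rows train the attack model, and the attack model then judges membership from the target's outputs. A heavily hedged outline, since preform_attack_centralised's internals aren't shown in this diff (hypothetical names and methods):

```python
import numpy as np

# Hypothetical sketch of the attack flow, not the repo's exact API:
shadow_model.fit(X_shadow_in, y_shadow_in, fName)    # D_Train_Shadow
p_in  = shadow_model.predict_proba(X_shadow_in)      # member outputs
p_out = shadow_model.predict_proba(X_shadow_out)     # non-member outputs
X_att = np.vstack((p_in, p_out))
y_att = np.concatenate((np.ones(len(p_in)), np.zeros(len(p_out))))
attack_model.fit(X_att, y_att)                       # the MLPClassifier above
membership_guess = attack_model.predict(model.predict_proba(X))
```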
