Commit
add code for IEEE paper
mitrailer committed Jan 21, 2021
1 parent 0be5f35 commit f787e5e
Showing 8 changed files with 273 additions and 144 deletions.
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -70,7 +70,7 @@ nodes can be found at the [Utils folder](utils).

* **David Lopez** [mitrailer](https://github.com/mitrailer)
* **Bilal Farooq** [billjee](https://github.com/billjee/)
* **Ranwa Al Mallah**
* **Ranwa Al Mallah** [ranwaalmallah](https://github.com/ranwaalmallah)
* **Ali Yazdizadeh** [Ali-TRIPLab](https://github.com/Ali-TRIPLab)

## License
Expand Down
131 changes: 131 additions & 0 deletions use_cases/untargeted_poisoning/Attestedfl.py
@@ -0,0 +1,131 @@
import numpy as np
import tensorflow as tf
import math
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from scipy import stats


def attestedfl_1(step, worker, warm_up):
    """AttestedFL-1: assesses a worker from the history of Euclidean distances between
    its local model and the global model."""
    previous_step = step - 1
    n_matrix = np.load('data_test/' + worker + '/local_model_' + str(step) + '.npy', allow_pickle=True)
    global_m = np.load('data_test/global_model_' + str(previous_step) + '.npy', allow_pickle=True)
    # Load the Euclidean distances recorded up to the previous step, if they exist
    try:
        euclidean_distances = np.load('data_test/' + worker + '/euclidean_distances_' + str(previous_step) + '.npy',
                                      allow_pickle=True)
    except FileNotFoundError:
        print("step:", step)
    # Compute the Euclidean distance for the current step
    if step == 1:
        # First iteration: start a new distance history
        euclidean_distance = tf.norm(n_matrix - global_m, ord='euclidean')
        e_d_array = np.asarray([[euclidean_distance]])
        np.save('data_test/' + worker + '/euclidean_distances_' + str(step) + '.npy', e_d_array)
    else:
        euclidean_distance = tf.norm(n_matrix - global_m, ord='euclidean')
        e_d_array = np.asarray([[euclidean_distance]])
        euclidean_distances = np.append(euclidean_distances, e_d_array)
        np.save('data_test/' + worker + '/euclidean_distances_' + str(step) + '.npy', euclidean_distances)

    if step > warm_up:
        euclidean_distances = np.load('data_test/' + worker + '/euclidean_distances_' + str(step) + '.npy',
                                      allow_pickle=True)
        c = step - warm_up
        # Distances recorded after the warm-up period (the last one has no successor, so it is excluded)
        euclidean_distance_to_test = euclidean_distances[warm_up:-1]
        delta_array = []
        for idx, e_d in enumerate(euclidean_distance_to_test):
            delta = e_d
            delta_1 = euclidean_distances[warm_up + idx + 1]
            t = warm_up + idx
            delta_sum = 1 - math.exp(t / (c * (delta_1 + delta)))
            delta_array.append(delta_sum)
        delta_avg = np.sum(delta_array) / c
        delta_mean = np.mean(delta_array)
        delta_std = np.std(delta_array)

        if delta_avg <= delta_mean - 4 * delta_std:
            return True
        else:
            return False
    return True


def attestedfl_2(step, worker, warm_up):
    """
    The attestedFL_2 algorithm checks the cosine similarity on the last layer of the CNN model
    """
    if step > warm_up:
        previous_step = step - 1
        reliable = False
        n_1_matrix = np.load('data_test/' + worker + '/local_model_' + str(previous_step) + '.npy', allow_pickle=True)
        n_matrix = np.load('data_test/' + worker + '/local_model_' + str(step) + '.npy', allow_pickle=True)
        global_m = np.load('data_test/global_model_' + str(previous_step) + '.npy', allow_pickle=True)
        first = []
        second = []
        # Last layer (index 6) of the previous local, current local and global models
        n_1 = n_1_matrix[6].reshape(1, -1)
        n = n_matrix[6].reshape(1, -1)
        g = global_m[6].reshape(1, -1)
        # Cosine similarity of the previous and current local models with respect to the global model
        similarities = cosine_similarity(n_1, g)
        similarities_two = cosine_similarity(n, g)
        first.append(abs(similarities))
        second.append(abs(similarities_two))
        total = np.array([first, second])
        # print(total)
        chi2_stat, p_val, dof, ex = stats.chi2_contingency(total)
        logger = open('data_paper/logs/cosine_attacker_' + worker + '.csv', "a")
        logger.write("{},{},{},{}".format(step, worker, float(abs(similarities)), float(abs(similarities_two))) + '\n')
        logger.close()
        if p_val < 0.1:
            reliable = False
            print(str(worker) + ' is not reliable')
            return reliable
        else:
            reliable = True
            return reliable
    else:
        return True


def attestedfl_3(step, worker, warm_up):
    reliable = True
    # For the sake of the example we consider that a worker is training as follows:
    # 1. Get the errors at each iteration (epoch)
    # 2. Fit a logarithmic curve to the errors (y-axis) over the iterations (x-axis)
    # 3. If the slope of the logarithmic curve is negative or small (less than .4), the worker is training.
    #    A negative or small slope means that the errors are approaching a small number. In any other case,
    #    the worker is not training.
    if step > warm_up:
        reliable = False
        errors_table = pd.read_csv('data_paper/logs/attestedFL-3/errors_' + worker + '.csv', header=None)
        iteration = errors_table[0]
        errors = errors_table[2]
        # Fit the errors to a logarithmic curve: error ~ a * log(iteration) + b
        fittedParameters = np.polyfit(np.log(iteration), errors, 1)
        first_prediction = np.polyval(fittedParameters, 1)
        last_prediction = np.polyval(fittedParameters, step)
        # Average change of the fitted curve between the first and the current step
        slope = (last_prediction - first_prediction) / (step - 1)

        if slope <= 0:
            reliable = True
            return reliable
        else:
            if slope <= .4:
                reliable = True
                return reliable
            else:
                reliable = False
                return reliable
    return reliable


def attestedfl(step, worker):
    # For the sake of the example let us assume that the warm-up period is 30 epochs
    warm_up = 30
    reliable = False
    # A worker is reliable only if it passes the three lines of defense
    if attestedfl_1(step, worker, warm_up):
        if attestedfl_2(step, worker, warm_up):
            if attestedfl_3(step, worker, warm_up):
                reliable = True
                return reliable
    return reliable
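
For intuition about the AttestedFL-3 check above, the short sketch below reruns the same slope test on synthetic loss values instead of the CSV log written by the hook: a worker whose loss keeps decreasing yields a negative slope and passes, while a steadily growing loss produces a slope above the 0.4 tolerance and is flagged. The synthetic series and the helper are invented purely for illustration.

```python
import numpy as np


def log_fit_slope(iterations, errors, step):
    # Same test as attestedfl_3: fit errors ~ a * log(iteration) + b and
    # measure the average change of the fitted curve up to the current step
    fitted = np.polyfit(np.log(iterations), errors, 1)
    first_prediction = np.polyval(fitted, 1)
    last_prediction = np.polyval(fitted, step)
    return (last_prediction - first_prediction) / (step - 1)


iterations = np.arange(1, 41)
decreasing_loss = 2.0 / np.sqrt(iterations)      # a worker that is actually training
increasing_loss = 0.5 + 0.05 * iterations        # a worker drifting away from the goal

print(log_fit_slope(iterations, decreasing_loss, step=40))  # negative -> passes the check
print(log_fit_slope(iterations, increasing_loss, step=40))  # above 0.4 -> flagged as not training
```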
42 changes: 28 additions & 14 deletions use_cases/untargeted_poisoning/README.md
@@ -1,8 +1,24 @@
# Reference
Code of the paper submitted to the [USENIX summer call](https://www.usenix.org/conference/usenixsecurity21)
Code of the paper submitted to the [IEEE Transactions on Dependable and Secure Computing](https://www.computer.org/csdl/journal/tq)

**Untargeted Poisoning Attack Detection via Blockchain in Federated Learning** by Ranwa Al Mallah, David López,
Bilal Farooq and Ali Yazdizadeh
**Untargeted Poisoning Attack Detection in Federated Learning via Behavior Attestation** by
Ranwa Al Mallah, David López and Bilal Farooq

# Abstract
Federated Learning (FL) is a paradigm in Machine Learning (ML) that addresses critical issues such as data privacy,
security, access rights and access to heterogeneous information by training a global model using distributed nodes.
Despite its advantages, there is an increased potential for cyberattacks on FL-based ML techniques that can undermine
the benefits. Model-poisoning attacks on FL target the availability of the model. The adversarial objective is to
disrupt the training. We propose attestedFL, a defense mechanism that monitors the training of individual nodes
through state persistence in order to detect a malicious _worker_. A fine-grained assessment of the history of
the _worker_ permits the evaluation of its behavior in time and results in innovative detection strategies.
We present three lines of defense that aim at assessing if the _worker_ is reliable by observing if the node
is really training and advancing towards a goal. Our defense exposes an attacker's malicious behavior and removes
unreliable nodes from the aggregation process so that the FL process converges faster. We present promising results
on the impact of our defense on the accuracy that the model reaches under the adversarial setting. Through extensive
evaluations against various adversarial settings, attestedFL increased the accuracy of the model by
12% to 58% under different attack scenarios: attacks performed at different stages of convergence, colluding
attackers, and continuous attacks.

# Summary
This experiment runs targeted and untargeted attacks on a Federated Learning (FL) process
Expand All @@ -12,26 +28,25 @@ The experiment follows the next steps.
2. The *worker* nodes re-train the model with their local data and send the results to the *chief* node
3. At a designated EPOCH, malicious *workers* send targeted or untargeted attacks
4. The *chief* node averages the results (including the malicious model) and sends the average to all *workers*
4. A defense mechanism called REALLY!? algorithm may be used to defend the FL from being poisoned
5. A defense mechanism called the AttestedFL algorithm may be used to defend the FL process from being poisoned

All transactions are recorded in the BSMD and we use sockets for p2p data transfers.
You must have at least one Iroha node running.
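
Below is a minimal sketch of how the defense slots into the *chief* node's aggregation step: before averaging, each *worker*'s update is kept only if `attestedfl(step, worker)` from [Attestedfl.py](Attestedfl.py) reports the worker as reliable. This is an illustration rather than the actual [hook](hook.py) code; the `local_models` mapping of worker names to per-layer NumPy weight matrices and the helper name are assumptions made for the example.

```python
import numpy as np

from Attestedfl import attestedfl  # defense added in this commit


def aggregate_reliable_updates(step, local_models):
    """Average only the updates of workers that pass the AttestedFL checks.

    `local_models` maps a worker name to its local model (a sequence of per-layer
    NumPy arrays). Both the mapping and this helper are illustrative, not part of hook.py.
    """
    reliable_updates = []
    for worker, update in local_models.items():
        if attestedfl(step, worker):
            reliable_updates.append(update)
        else:
            print(worker + ' excluded from aggregation at step ' + str(step))
    if not reliable_updates:
        raise RuntimeError('no reliable updates received at step ' + str(step))
    # FedAvg-style average, computed layer by layer over the reliable updates only
    n_layers = len(reliable_updates[0])
    return [np.mean([update[i] for update in reliable_updates], axis=0) for i in range(n_layers)]
```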

# Setup
1. To set up the the Blockchain follow [this procedure](../../utils/README.md)
2. To set up the the Federated learning go to the 'setup'
1. To set up the Blockchain follow [this procedure](../../utils/README.md)
2. To set up the Federated learning go to the 'setup'
section of [this procedure](../federated_learning/README.md)
3. In the repository we provide a malicious matrix for the targeted attack, however you can creat your own
3. In the repository we provide a malicious matrix for the targeted attack, however you can create your own
by running the script [create_MM](create_MM.py). Note: you will need to run the experiment at least for
one EPOCH for this script to function. The thing is that it need an example 'good' matrix to recreate a
malicious one.
one EPOCH for this script to function.

# Run experiment
You can try targeted and untargeted at different stages. To do so go to line 565 in the [hook](hook.py) file
and follow the instructions. To defend the FL process from the attack go to line 492 in the [hook](hook.py)
You can try targeted and untargeted attacks at different stages. To do so, go to line 571 in the [hook](hook.py) file
and follow the instructions. To defend the FL process from the attack, go to line 493 in the [hook](hook.py)
file and follow the instructions.

You can run the experiment in up to 20 machines. However you can also run the experiment in different shells.
You can run the experiment in up to 20 machines. However, you can also run the experiment in different shells.

On the chief-computer (shell) run
```bash
Expand Down Expand Up @@ -63,7 +78,6 @@ python3 federated_classifier.py --is_chief=False --worker_name=worker19 --domain
```

## Authors
* **Ranwa Al Mallah**
* **Ranwa Al Mallah** [ranwaalmallah](https://github.com/ranwaalmallah)
* **David Lopez** [mitrailer](https://github.com/mitrailer)
* **Bilal Farooq** [billjee](https://github.com/billjee/)
* **Ali Yazdizadeh** [Ali-TRIPLab](https://github.com/Ali-TRIPLab)
10 changes: 8 additions & 2 deletions use_cases/untargeted_poisoning/attacker.py
@@ -1,5 +1,6 @@
import numpy as np
import tensorflow as tf
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)

def node_attacking_targeted(step_value):
    file = step_value - 1
Expand All @@ -9,10 +10,15 @@ def node_attacking_targeted(step_value):
    learning_rate = 0.001
    send_malicious = ((total_workers/learning_rate)*(malicious_model - global_model)) + global_model
    print('Attack!!!!!')
    # print(malicious_model)
    # print(global_model)
    # print(send_malicious)
    return send_malicious

def node_attacking_un_targeted(step_value):

def node_attacking_un_targeted():
    global_m = np.load('data_test/global_model_1.npy', allow_pickle=True)
    # Create the malicious matrix
    malicious_matrix = []
    for i in range(0, 7):
        shape = global_m[i].shape
Expand All @@ -21,4 +27,4 @@ def node_attacking_un_targeted(step_value):
        matrix = np.random.normal(mean, std, shape)
        malicious_matrix.append(matrix)
    print('Attack!!!!!')
    return malicious_matrix
    return malicious_matrix
1 change: 1 addition & 0 deletions use_cases/untargeted_poisoning/create_MM.py
Expand Up @@ -12,3 +12,4 @@
malicious_matrix.append(matrix)
np.save('data_test/malicious_matrix.npy', malicious_matrix, allow_pickle=True)
print('done')
#print(global_m)
16 changes: 12 additions & 4 deletions use_cases/untargeted_poisoning/federated_classifier.py
Expand Up @@ -11,7 +11,7 @@
from iroha_config import CHIEF_PRIVATE_IP, CHIEF_PUBLIC_IP, BATCH_SIZE, EPOCHS, INTERVAL_STEPS, WAIT_TIME
from time import time
import tensorflow as tf

np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
np.set_printoptions(threshold=sys.maxsize)

flags = tf.app.flags
Expand Down Expand Up @@ -167,9 +167,14 @@ def after_run(self, run_context, run_values):
print("Epoch {}/{} - loss: {:.4f} - acc: {:.4f}".format(int(step_value / N_BATCHES), EPOCHS,
self._total_loss / N_BATCHES,
self._total_acc / N_BATCHES))
print("Epoch {}/{} - loss: {:.4f} - acc: {:.4f}".format(int(step_value / N_BATCHES), EPOCHS,
self._total_loss / N_BATCHES,
self._total_acc / N_BATCHES))
else:
logger = open('data_paper/logs/attestedFL-3/errors_' + FLAGS.worker_name + '.csv', "a")
logger.write("{},{},{}".format(step_value / N_BATCHES, FLAGS.worker_name,
self._total_loss / N_BATCHES) + '\n')
logger.close()
# print("Epoch {}/{} - loss: {:.4f} - acc: {:.4f}".format(int(step_value / N_BATCHES), EPOCHS,
# self._total_loss / N_BATCHES,
# self._total_acc / N_BATCHES))
self._total_loss = 0
self._total_acc = 0

Expand All @@ -191,3 +196,6 @@ def after_create_session(self, session, coord):
mon_sess.run(train_op)

end_time = time()

# logger.write('Total time: ' + str(end_time - start_time) + '\n')
# logger.close()
