# Nagharjun Mathi Mariappan
# nm4074

# ML for Cyber Security

## Importing Libraries

In [11]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import backend as K
from keras.models import Model
import h5py
from google.colab import drive
import warnings
from tabulate import tabulate
warnings.filterwarnings("ignore")

In [12]:
# Mount Google Drive at /content/drive; the data and model paths used below live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Function to read the data

In [13]:
def dataLoader(path):
    """Read the image and label arrays stored in an HDF5 file.

    Args:
        path: Location of an HDF5 file containing the datasets 'data' and 'label'.

    Returns:
        A tuple (xData, yData). xData is the 'data' dataset with its axes
        reordered by (0, 2, 3, 1); yData is the 'label' dataset unchanged.
        Both are in-memory numpy arrays.
    """
    # The context manager guarantees the file handle is released even if a
    # dataset is missing; np.array copies the contents into memory first, so
    # the returned arrays stay valid after the file is closed.
    with h5py.File(path, 'r') as data:
        xData = np.array(data['data'])
        yData = np.array(data['label'])
    # Move axis 1 to the end (presumably channels-first -> channels-last, to
    # match the (55, 47, 3) model input shown in the summary — confirm).
    xData = xData.transpose((0,2,3,1))
    return xData, yData

## Initializing data and model paths and reading the data

In [14]:
# Single place to change when the lab folder moves; every path below is derived from it.
labDirectory = '/content/drive/MyDrive/mlforcybersec_nm4074_lab4'

# Datasets: clean validation, clean test, and test images carrying the backdoor trigger.
cleanValidDataPath = f'{labDirectory}/valid.h5'
cleanTestDataPath = f'{labDirectory}/test.h5'
poisonedTestDataPath = f'{labDirectory}/bd_test.h5'

# Backdoored network: architecture file and its weights file.
badModelPath = f'{labDirectory}/bd_net.h5'
badModelWeightsPath = f'{labDirectory}/bd_weights.h5'

cleanValidX, cleanValidY = dataLoader(cleanValidDataPath)
cleanTestX, cleanTestY = dataLoader(cleanTestDataPath)
poisonedTestX, poisonedTestY = dataLoader(poisonedTestDataPath)

## Loading the badnet and creating a copy of it

In [15]:
# Build two independent instances from the same file: badModel is kept as the
# unmodified reference, badModelCopy is the instance whose weights are edited
# during pruning.
badModel, badModelCopy = (keras.models.load_model(badModelPath) for _ in range(2))

# Both instances start from the same weights file.
for network in (badModel, badModelCopy):
    network.load_weights(badModelWeightsPath)

## Badnet accuracy on clean validation data

In [16]:
# Baseline for the pruning loop: percentage of clean validation samples the
# unpruned network labels correctly.
cleanValidPredictions = np.asarray(badModel(cleanValidX)).argmax(axis=1)
cleanValidAccuracy = 100 * np.mean(cleanValidPredictions == cleanValidY)

baselineMessage = "Accuracy on clean validation data before pruning {0:3.6f}".format(cleanValidAccuracy)
print(baselineMessage)

# Reset Keras' global state before the next cells run.
K.clear_session()

Accuracy on clean validation data before pruning 98.649000


## Model Summary

In [17]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
badModel.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input (InputLayer)          [(None, 55, 47, 3)]          0         []                            
                                                                                                  
 conv_1 (Conv2D)             (None, 52, 44, 20)           980       ['input[0][0]']               
                                                                                                  
 pool_1 (MaxPooling2D)       (None, 26, 22, 20)           0         ['conv_1[0][0]']              
                                                                                                  
 conv_2 (Conv2D)             (None, 24, 20, 40)           7240      ['pool_1[0][0]']              
                                                                                            

## Main part of the code which prunes the model

In [18]:
# Channel pruning of the backdoored network.
#
# Channels of the 'pool_3' output are ranked by their mean activation on the
# clean validation set. The matching filters of badModelCopy.layers[5] are then
# zeroed one at a time, least active first, and a snapshot of the model is saved
# the first time the clean validation accuracy has dropped by each threshold.

# Accuracy-drop thresholds in percentage points; each one produces 'model<threshold>.h5'.
accuracyDropThresholds = (2, 4, 10)

# Sub-model exposing the 'pool_3' activations of the unpruned network
intermediateModel = Model(inputs=badModel.inputs, outputs=badModel.get_layer('pool_3').output)

# Activations of 'pool_3' for every clean validation image
intermediateFeatureMaps = intermediateModel.predict(cleanValidX)

# Average over the batch axis and both spatial axes, leaving one value per channel
averageIntermediateActivations = np.mean(intermediateFeatureMaps, axis=(0,1,2))

# Ascending order: the least activated channels are pruned first
indexesToPrune = np.argsort(averageIntermediateActivations)

# NOTE(review): index 5 is assumed to be the convolution feeding 'pool_3' — confirm against the model summary.
layerToPrune = badModelCopy.layers[5]

# get_weights() returns copies, so fetch kernel and bias once and edit them locally
lastConvLayerWeights, lastConvLayerBiases = layerToPrune.get_weights()[:2]

# Guard against pruning the wrong layer: its output channels must line up with
# the channels that were ranked above.
if lastConvLayerWeights.shape[-1] != averageIntermediateActivations.shape[0]:
  raise ValueError(
      "Layer at index 5 has {} output channels but 'pool_3' has {}".format(
          lastConvLayerWeights.shape[-1], averageIntermediateActivations.shape[0]))

# savedModel[i] becomes True once the snapshot for accuracyDropThresholds[i] has been written
savedModel = np.zeros(len(accuracyDropThresholds), dtype=bool)

for channelIndex in indexesToPrune:

  # Zero the filter and bias of this channel so it no longer contributes
  lastConvLayerWeights[:,:,:,channelIndex] = 0
  lastConvLayerBiases[channelIndex] = 0
  layerToPrune.set_weights([lastConvLayerWeights, lastConvLayerBiases])

  # Re-evaluate the pruned copy on the clean validation set
  cleanValidUpdatedPredictions = np.argmax(badModelCopy(cleanValidX), axis=1)
  cleanValidUpdatedAccuracy = np.mean(np.equal(cleanValidUpdatedPredictions, cleanValidY)) * 100
  accuracyDrop = cleanValidAccuracy - cleanValidUpdatedAccuracy

  # Write each snapshot once, the first time its threshold is reached. A single
  # pruning step may cross several thresholds; each of them is saved.
  for slot, threshold in enumerate(accuracyDropThresholds):
    if accuracyDrop >= threshold and not savedModel[slot]:
      print("The accuracy drops at least {}%, saved the model".format(threshold))
      badModelCopy.save('model{}.h5'.format(threshold))
      savedModel[slot] = True

  # Stop pruning once every snapshot exists
  if savedModel.all():
    break

The accuracy drops at least 2%, saved the model
The accuracy drops at least 4%, saved the model
The accuracy drops at least 10%, saved the model


In [19]:
class G(tf.keras.Model):
    """Combine two classifiers and flag the inputs on which they disagree.

    For each input, the arg-max class of `model` and of `modelPrime` is
    computed. When both agree, that class is kept; when they differ, the input
    is assigned to the extra class index `numClasses`. The result is returned
    as a one-hot matrix with `numClasses + 1` columns.

    Args:
        model: First classifier, called as `model(data)`.
        modelPrime: Second classifier, called as `modelPrime(data)`.
        numClasses: Index used for the extra "disagreement" class. The default
            of 1283 reproduces the original hard-coded behaviour (1284 output
            columns).
    """

    def __init__(self, model, modelPrime, numClasses=1283):
        super(G, self).__init__()
        self.model = model
        self.modelPrime = modelPrime
        self.numClasses = numClasses

    def _agreementOneHot(self, data):
        """Return the one-hot agreement matrix of shape (len(data), numClasses + 1)."""
        y = np.argmax(self.model(data), axis=1)
        yPrime = np.argmax(self.modelPrime(data), axis=1)
        # Vectorised selection; unlike a Python list comprehension it keeps an
        # integer dtype for an empty batch, so the indexing below still works.
        labels = np.where(y == yPrime, y, self.numClasses)
        res = np.zeros((labels.shape[0], self.numClasses + 1))
        res[np.arange(labels.size), labels] = 1
        return res

    def predict(self,data):
        """One-hot agreement matrix for `data` (same result as calling the model)."""
        return self._agreementOneHot(data)

    def call(self,data):
        """One-hot agreement matrix for `data`; returns a numpy array, not a tensor."""
        return self._agreementOneHot(data)

## Loading pruned models

In [20]:
# The pruning cell writes its snapshots with relative names ('model2.h5',
# 'model4.h5', 'model10.h5'), i.e. into the current working directory. Loading
# them through the same relative names keeps this cell working when the
# notebook is not run from /content.
pruned2AccuracyDropPath = 'model2.h5'
pruned4AccuracyDropPath = 'model4.h5'
pruned10AccuracyDropPath = 'model10.h5'


pruned2AccuracyDropModel = keras.models.load_model(pruned2AccuracyDropPath)
pruned4AccuracyDropModel = keras.models.load_model(pruned4AccuracyDropPath)
pruned10AccuracyDropModel = keras.models.load_model(pruned10AccuracyDropPath)

## 2% Accuracy Threshold Model

#### Clean Test Data

In [21]:
# Percentage of clean test samples the 2%-threshold pruned model labels correctly.
cleanTestPredictions = pruned2AccuracyDropModel.predict(cleanTestX).argmax(axis=1)
cleanTestAccuracy2 = 100 * np.mean(cleanTestPredictions == cleanTestY)
print('Classification accuracy on clean test data for the 2% accuracy threshold model:', cleanTestAccuracy2)

Classification accuracy on clean test data for the 2% accuracy threshold model: 95.90023382696803


#### Poisoned Test Data

In [22]:
# Percentage of poisoned test samples whose prediction equals poisonedTestY,
# reported as the attack success rate of the 2%-threshold pruned model.
poisonedTestPredictions = pruned2AccuracyDropModel.predict(poisonedTestX).argmax(axis=1)
poisonedTestAccuracy2 = 100 * np.mean(poisonedTestPredictions == poisonedTestY)
print('Attack Success Rate on poisoned test data for the 2% accuracy threshold model:', poisonedTestAccuracy2)

Attack Success Rate on poisoned test data for the 2% accuracy threshold model: 100.0


## 4% Accuracy Threshold Model

#### Clean Test Data

In [23]:
# Percentage of clean test samples the 4%-threshold pruned model labels correctly.
cleanTestPredictions = pruned4AccuracyDropModel.predict(cleanTestX).argmax(axis=1)
cleanTestAccuracy4 = 100 * np.mean(cleanTestPredictions == cleanTestY)
print('Classification accuracy on clean test data for the 4% accuracy threshold model:', cleanTestAccuracy4)

Classification accuracy on clean test data for the 4% accuracy threshold model: 92.29150428682775


#### Poisoned Test Data

In [24]:
# Percentage of poisoned test samples whose prediction equals poisonedTestY,
# reported as the attack success rate of the 4%-threshold pruned model.
poisonedTestPredictions = pruned4AccuracyDropModel.predict(poisonedTestX).argmax(axis=1)
poisonedTestAccuracy4 = 100 * np.mean(poisonedTestPredictions == poisonedTestY)
print('Attack Success Rate on poisoned test data for the 4% accuracy threshold model:', poisonedTestAccuracy4)

Attack Success Rate on poisoned test data for the 4% accuracy threshold model: 99.98441153546376


## 10% Accuracy Threshold Model

#### Clean Test Data

In [25]:
# Percentage of clean test samples the 10%-threshold pruned model labels correctly.
cleanTestPredictions = pruned10AccuracyDropModel.predict(cleanTestX).argmax(axis=1)
cleanTestAccuracy10 = 100 * np.mean(cleanTestPredictions == cleanTestY)
print('Classification accuracy on clean test data for the 10% accuracy threshold model:', cleanTestAccuracy10)

Classification accuracy on clean test data for the 10% accuracy threshold model: 84.54403741231489


#### Poisoned Test Data

In [26]:
# Percentage of poisoned test samples whose prediction equals poisonedTestY,
# reported as the attack success rate of the 10%-threshold pruned model.
poisonedTestPredictions = pruned10AccuracyDropModel.predict(poisonedTestX).argmax(axis=1)
poisonedTestAccuracy10 = 100 * np.mean(poisonedTestPredictions == poisonedTestY)
print('Attack Success Rate on poisoned test data for the 10% accuracy threshold model:', poisonedTestAccuracy10)

Attack Success Rate on poisoned test data for the 10% accuracy threshold model: 77.20966484801247


## Unpruned Model

#### Clean Test Data

In [27]:
# Reference point: percentage of clean test samples the unpruned model labels correctly.
cleanTestPredictions = badModel.predict(cleanTestX).argmax(axis=1)
cleanTestAccuracyBadModel = 100 * np.mean(cleanTestPredictions == cleanTestY)
print('Classification accuracy on clean test data for the unpruned bad model:', cleanTestAccuracyBadModel)

Classification accuracy on clean test data for the unpruned bad model: 98.62042088854248


#### Poisoned Test Data

In [28]:
# Reference point: percentage of poisoned test samples whose prediction equals
# poisonedTestY for the unpruned model, reported as its attack success rate.
poisonedTestPredictions = badModel.predict(poisonedTestX).argmax(axis=1)
poisonedTestAccuracyBadModel = 100 * np.mean(poisonedTestPredictions == poisonedTestY)
print('Attack Success Rate on poisoned test data for the unpruned bad model:', poisonedTestAccuracyBadModel)

Attack Success Rate on poisoned test data for the unpruned bad model: 100.0


## Repairing the model

In [29]:
# Pair the unpruned network with each pruned snapshot; every G instance keeps a
# prediction only when both of its networks agree on it.
repaired2AccuracyDropModel, repaired4AccuracyDropModel, repaired10AccuracyDropModel = (
    G(badModel, prunedModel)
    for prunedModel in (pruned2AccuracyDropModel, pruned4AccuracyDropModel, pruned10AccuracyDropModel)
)

## Repaired threshold 2% model

#### Clean Test Data

In [30]:
# Percentage of clean test samples the repaired 2%-threshold model labels correctly.
cleanTestPredictions = np.asarray(repaired2AccuracyDropModel(cleanTestX)).argmax(axis=1)
cleanTestAccuracyRepaired2 = 100 * np.mean(cleanTestPredictions == cleanTestY)
print('Classification accuracy on clean test data for the repaired 2% accuracy threshold model:', cleanTestAccuracyRepaired2)

Classification accuracy on clean test data for the repaired 2% accuracy threshold model: 95.74434918160561


#### Poisoned Test Data

In [31]:
# Percentage of poisoned test samples whose prediction equals poisonedTestY,
# reported as the attack success rate of the repaired 2%-threshold model.
poisonedTestPredictions = np.asarray(repaired2AccuracyDropModel(poisonedTestX)).argmax(axis=1)
poisonedTestAccuracyRepaired2 = 100 * np.mean(poisonedTestPredictions == poisonedTestY)
print('Attack Success Rate on poisoned test data for the repaired 2% accuracy threshold model:', poisonedTestAccuracyRepaired2)

Attack Success Rate on poisoned test data for the repaired 2% accuracy threshold model: 100.0


## Repaired threshold 4% model

#### Clean Test Data

In [32]:
# Percentage of clean test samples the repaired 4%-threshold model labels correctly.
cleanTestPredictions = np.asarray(repaired4AccuracyDropModel(cleanTestX)).argmax(axis=1)
cleanTestAccuracyRepaired4 = 100 * np.mean(cleanTestPredictions == cleanTestY)
print('Classification accuracy on clean test data for the repaired 4% accuracy threshold model:', cleanTestAccuracyRepaired4)

Classification accuracy on clean test data for the repaired 4% accuracy threshold model: 92.1278254091972


#### Poisoned Test Data

In [33]:
# Percentage of poisoned test samples whose prediction equals poisonedTestY,
# reported as the attack success rate of the repaired 4%-threshold model.
poisonedTestPredictions = np.asarray(repaired4AccuracyDropModel(poisonedTestX)).argmax(axis=1)
poisonedTestAccuracyRepaired4 = 100 * np.mean(poisonedTestPredictions == poisonedTestY)
print('Attack Success Rate on poisoned test data for the repaired 4% accuracy threshold model:', poisonedTestAccuracyRepaired4)

Attack Success Rate on poisoned test data for the repaired 4% accuracy threshold model: 99.98441153546376


## Repaired threshold 10% model

#### Clean Test Data

In [34]:
# Percentage of clean test samples the repaired 10%-threshold model labels correctly.
cleanTestPredictions = np.asarray(repaired10AccuracyDropModel(cleanTestX)).argmax(axis=1)
cleanTestAccuracyRepaired10 = 100 * np.mean(cleanTestPredictions == cleanTestY)
print('Classification accuracy on clean test data for the repaired 10% accuracy threshold model:', cleanTestAccuracyRepaired10)

Classification accuracy on clean test data for the repaired 10% accuracy threshold model: 84.3335931410756


#### Poisoned Test Data

In [35]:
# Percentage of poisoned test samples whose prediction equals poisonedTestY,
# reported as the attack success rate of the repaired 10%-threshold model.
poisonedTestPredictions = np.asarray(repaired10AccuracyDropModel(poisonedTestX)).argmax(axis=1)
poisonedTestAccuracyRepaired10 = 100 * np.mean(poisonedTestPredictions == poisonedTestY)
print('Attack Success Rate on poisoned test data for the repaired 10% accuracy threshold model:', poisonedTestAccuracyRepaired10)

Attack Success Rate on poisoned test data for the repaired 10% accuracy threshold model: 77.20966484801247


## Final Results

In [36]:
# Summary table: one row per repaired model with its clean test accuracy and
# its attack success rate on the poisoned test set.
head = ["Model", "Repaired Clean Accuracy", "Attack Rate"]

data = [
    list(row)
    for row in (
        ("2% Repaired", cleanTestAccuracyRepaired2, poisonedTestAccuracyRepaired2),
        ("4% Repaired", cleanTestAccuracyRepaired4, poisonedTestAccuracyRepaired4),
        ("10% Repaired", cleanTestAccuracyRepaired10, poisonedTestAccuracyRepaired10),
    )
]

print(tabulate(data, headers=head, tablefmt="grid"))

+--------------+---------------------------+---------------+
| Model        |   Repaired Clean Accuracy |   Attack Rate |
| 2% Repaired  |                   95.7443 |      100      |
+--------------+---------------------------+---------------+
| 4% Repaired  |                   92.1278 |       99.9844 |
+--------------+---------------------------+---------------+
| 10% Repaired |                   84.3336 |       77.2097 |
+--------------+---------------------------+---------------+
