Detecting Dormant Neurons

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.models import alexnet
from PIL import Image
import os
import random
import pandas as pd 
import matplotlib.pyplot as plt

class MaskedModel(nn.Module):
    def __init__(self, original_model, dormant_neurons):
        super(MaskedModel, self).__init__()
        self.mask = nn.Parameter(torch.ones_like(torch.cat((dormant_neurons, torch.tensor([1]))), dtype=torch.float32), requires_grad=False)
        self.mask[dormant_neurons] = 0
        self.original_model = original_model

    def forward(self, x):
        x = self.original_model(x)
        x *= self.mask
        return x


def load_model(model_path, num_classes=43, device="cpu"):
    model = alexnet(weights=None, num_classes=43).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()  # Set to evaluation mode
    return model

def process_image(image_path):
    # Load the image
    image = Image.open(image_path)

    # Define transformations
    transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Apply transformations
    image_tensor = transform(image).unsqueeze(0)
    return image_tensor, image

def predict(image_path, model, device="cpu"):
    # No need to reload the model every time you predict. Use the passed model.
    # model = load_model(model_path, device=device)

    image_tensor, image = process_image(image_path)  # Assume process_image returns (tensor, image)
    image_tensor = image_tensor.to(device)

    activations = []
    # Forward pass to obtain activations
    with torch.no_grad():
        # Get activations from convolutional layers (features)
        x = image_tensor
        for layer in model.features:
            x = layer(x)
            activations.append(x)

        for layer in [model.avgpool]:
            x = layer(x)
            activations.append(x)

        # Get activations from linear layers (classifier)
        x = x.view(x.size(0), -1)  # Flatten the output before feeding to linear layers
        for layer in model.classifier:
            x = layer(x)
            activations.append(x)
        
        _, predicted_class_index = torch.max(x, 1)

    return predicted_class_index.item(), image, activations

def predict_on_directory(directory_path, model_path, misclss, num_images_to_test=0, device="cpu", backdoored=False, model_name="ran_sqr_sin_01"):
    model = load_model(model_path, device=device)
    
    # Assuming you have class names to interpret the outputs (you need to define this list)
    class_names = pd.read_csv('/home/cc7486/Desktop/Research/MLLsecurity/gtsrb-german-traffic-sign/labels.csv')['Name'].tolist()
    tests = pd.read_csv('/home/cc7486/Desktop/Research/MLLsecurity/gtsrb-german-traffic-sign/Test.csv')
    # List all files in the directory
    filenames = os.listdir(directory_path)
    random.shuffle(filenames)
    #count number of correct prediction
    correct_count=0
    #initialize the ground truth class id
    #for now, I am checking if the model can predict the backdoored stop sign which belong to class 14
    # Limit the number of files processed
    if num_images_to_test:
        filenames = filenames[:num_images_to_test]
    else:
        num_images_to_test = 780

    neurons = []
    for filename in filenames:
        image_path = os.path.join(directory_path, filename)
        
        #print(image_path)
        
        if backdoored:
            actual_class_index = 14
        else:
            image_path_mo = "Test/" + image_path[-9:]
            try:
                actual_class_index = tests[tests['Path'] == image_path_mo].iloc[0]['ClassId']
            except:
                pass
        
        # Process only .png images (according to your error traceback)
        if image_path.lower().endswith('.png'):
            predicted_class_index, image, activations = predict(image_path, model, device)

            # threshold = 1e-50
            # activation_means = activations.mean(dim=0)
            # dormant_neurons = (activation_means < threshold).nonzero().squeeze()
            neurons.append(activations)

            predicted_class_name = class_names[predicted_class_index]
            
            if predicted_class_index == actual_class_index:
                correct_count += 1
            elif backdoored:
                lst = misclss.get(model_name,[])
                lst.append(image_path)
                misclss[model_name] = lst

            
    # masked_model = MaskedModel(model, dormant_neurons)
    # print("model masked")
                
    if backdoored:
        rsl = "backdoored"
    else:
        rsl = "clean"
    print(f'Accuracy on {rsl} images is: {correct_count / num_images_to_test * 100:.2f}%')
    # return masked_model
    return neurons

In [2]:
# Run predictions on 10 random images from the 'test' directory
models = ['ran_sqr_sin_01','ran_sqr_mul_01','ran_sqr_sin_001','ran_sqr_mul_001', 'fixed_sqr_sin_01', 'fixed_sqr_mul_01','fixed_sqr_sin_001','fixed_sqr_mul_001',\
         'ran_cir_sin_01','ran_cir_mul_01','ran_cir_sin_001','ran_cir_mul_001', 'fixed_cir_sin_01', 'fixed_cir_mul_01','fixed_cir_sin_001','fixed_cir_mul_001',\
         'ran_tri_sin_01','ran_tri_mul_01','ran_tri_sin_001','ran_tri_mul_001', 'fixed_tri_sin_01', 'fixed_tri_mul_01','fixed_tri_sin_001','fixed_tri_mul_001']

misclss = {}

i = 1
print(f"\ntest of {models[i-1]}:")
test_directory = f'/home/cc7486/Desktop/Research/MLLsecurity/gtsrb-german-traffic-sign/backdoored-test/780/{i}/'
test_directory_t = '/home/cc7486/Desktop/Research/MLLsecurity/gtsrb-german-traffic-sign/Test'
model_path = f"models/780/alexnet_case_{i}.pt"
#testing trained model on backdoored data
neurons = predict_on_directory(test_directory, model_path, misclss, device="cpu", backdoored=True, model_name=i)
#testing the trained model on clean data
neurons = predict_on_directory(test_directory_t, model_path, misclss, num_images_to_test=1000, device="cpu",model_name=i)


test of ran_sqr_sin_01:


Accuracy on backdoored images is: 96.92%
Accuracy on clean images is: 98.10%


In [5]:
raw_data = []
for i in range(len(neurons[0])):
    for j in range(len(neurons)):
        if len(raw_data) == i:
           raw_data.append(torch.zeros_like(neurons[j][i], dtype=torch.float32))
        raw_data[i] += neurons[j][i]
raw_data

[tensor([[[[  86.2131,    5.7070,    5.3876,  ...,    8.1121,    7.4441,
             111.1645],
           [  21.8466,   17.7083,   16.3431,  ...,   18.1718,   18.7438,
             306.5416],
           [  21.1464,   15.9920,   15.0037,  ...,   17.1955,   17.7089,
             306.5114],
           ...,
           [  29.0519,   17.6462,   16.8579,  ...,   11.1522,   12.4207,
             361.4222],
           [  29.0478,   17.4794,   17.5383,  ...,   12.9220,   11.0631,
             362.3803],
           [  25.6222,   16.8679,   17.9970,  ...,   11.8560,   11.1575,
             331.6951]],
 
          [[ 283.3716,  105.8061,  102.9403,  ...,   87.3850,   87.0571,
              36.1992],
           [ 383.1995,  148.6514,  151.8698,  ...,  125.0524,  124.4082,
              41.6303],
           [ 378.8278,  143.7583,  150.0648,  ...,  123.7678,  123.7522,
              40.2201],
           ...,
           [ 250.3475,   98.3166,   92.9514,  ...,  101.7746,   90.6761,
              26.00

In [4]:
import numpy as np
dormant_data = []
for j in range(len(raw_data)):
    dormant_data.append(np.where(np.abs(raw_data[j]) <= 5, 0, 1).astype(float))
dormant_data

[array([[[[1., 1., 1., ..., 1., 1., 1.],
          [1., 1., 1., ..., 1., 1., 1.],
          [1., 1., 1., ..., 1., 1., 1.],
          ...,
          [1., 1., 1., ..., 1., 1., 1.],
          [1., 1., 1., ..., 1., 1., 1.],
          [1., 1., 1., ..., 1., 1., 1.]],
 
         [[1., 1., 1., ..., 1., 1., 1.],
          [1., 1., 1., ..., 1., 1., 1.],
          [1., 1., 1., ..., 1., 1., 1.],
          ...,
          [1., 1., 1., ..., 1., 1., 1.],
          [1., 1., 1., ..., 1., 1., 1.],
          [1., 1., 1., ..., 1., 1., 1.]],
 
         [[1., 1., 1., ..., 1., 1., 1.],
          [1., 1., 1., ..., 1., 1., 1.],
          [1., 1., 1., ..., 1., 1., 1.],
          ...,
          [1., 1., 1., ..., 1., 1., 1.],
          [1., 1., 1., ..., 1., 1., 1.],
          [1., 1., 1., ..., 1., 1., 1.]],
 
         ...,
 
         [[1., 1., 1., ..., 1., 1., 1.],
          [1., 1., 1., ..., 1., 1., 1.],
          [1., 1., 1., ..., 1., 1., 1.],
          ...,
          [1., 1., 1., ..., 1., 1., 1.],
          [1.

Pruning Dormant Neurons

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.models import alexnet
from PIL import Image
import os
import random
import pandas as pd 
import matplotlib.pyplot as plt

class MaskedModel(nn.Module):
    def __init__(self, original_model, dormant_neurons):
        super(MaskedModel, self).__init__()
        self.mask = nn.Parameter(torch.ones_like(torch.cat((dormant_neurons, torch.tensor([1]))), dtype=torch.float32), requires_grad=False)
        self.mask[dormant_neurons] = 0
        self.original_model = original_model

    def forward(self, x):
        x = self.original_model(x)
        x *= self.mask
        return x


def load_model(model_path, num_classes=43, device="cpu"):
    model = alexnet(weights=None, num_classes=43).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()  # Set to evaluation mode
    return model

def process_image(image_path):
    # Load the image
    image = Image.open(image_path)

    # Define transformations
    transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Apply transformations
    image_tensor = transform(image).unsqueeze(0)
    return image_tensor, image

def predict(image_path, model, device="cpu"):
    # No need to reload the model every time you predict. Use the passed model.
    # model = load_model(model_path, device=device)

    image_tensor, image = process_image(image_path)  # Assume process_image returns (tensor, image)
    image_tensor = image_tensor.to(device)

    i = 0
    # Forward pass to obtain activations
    with torch.no_grad():
        # Get activations from convolutional layers (features)
        x = image_tensor
        bias_type = model.features[0].bias.dtype
        for layer in model.features:
            x = layer(x)
            x *= dormant_data[i]
            x = x.to(bias_type)
            i += 1

        for layer in [model.avgpool]:
            x = layer(x)
            x *= dormant_data[i]
            x = x.to(bias_type)
            i += 1

        # Get activations from linear layers (classifier)
        x = x.view(x.size(0), -1)  # Flatten the output before feeding to linear layers
        for layer in model.classifier:
            x = layer(x)
            x *= dormant_data[i]
            x = x.to(bias_type)
            i += 1
        
        _, predicted_class_index = torch.max(x, 1)

    return predicted_class_index.item(), image

def predict_on_directory(directory_path, model_path, misclss, num_images_to_test=0, device="cpu", backdoored=False, model_name="ran_sqr_sin_01"):
    model = load_model(model_path, device=device)
    
    # Assuming you have class names to interpret the outputs (you need to define this list)
    class_names = pd.read_csv('/home/cc7486/Desktop/Research/MLLsecurity/gtsrb-german-traffic-sign/labels.csv')['Name'].tolist()
    tests = pd.read_csv('/home/cc7486/Desktop/Research/MLLsecurity/gtsrb-german-traffic-sign/Test.csv')
    # List all files in the directory
    filenames = os.listdir(directory_path)
    random.shuffle(filenames)
    #count number of correct prediction
    correct_count=0
    #initialize the ground truth class id
    #for now, I am checking if the model can predict the backdoored stop sign which belong to class 14
    # Limit the number of files processed
    if num_images_to_test:
        filenames = filenames[:num_images_to_test]
    else:
        num_images_to_test = 780

    for filename in filenames:
        image_path = os.path.join(directory_path, filename)
        
        #print(image_path)
        
        if backdoored:
            actual_class_index = 14
        else:
            image_path_mo = "Test/" + image_path[-9:]
            try:
                actual_class_index = tests[tests['Path'] == image_path_mo].iloc[0]['ClassId']
            except:
                pass
        
        # Process only .png images (according to your error traceback)
        if image_path.lower().endswith('.png'):
            predicted_class_index, image = predict(image_path, model, device)

            predicted_class_name = class_names[predicted_class_index]
            
            if predicted_class_index == actual_class_index:
                correct_count += 1
            elif backdoored:
                lst = misclss.get(model_name,[])
                lst.append(image_path)
                misclss[model_name] = lst

            
    # masked_model = MaskedModel(model, dormant_neurons)
    # print("model masked")
                
    if backdoored:
        rsl = "backdoored"
    else:
        rsl = "clean"
    print(f'Accuracy on {rsl} images is: {correct_count / num_images_to_test * 100:.2f}%')
    # return masked_model
    return neurons, model, correct_count / num_images_to_test

In [6]:
# Run predictions on 10 random images from the 'test' directory
models = ['ran_sqr_sin_01','ran_sqr_mul_01','ran_sqr_sin_001','ran_sqr_mul_001', 'fixed_sqr_sin_01', 'fixed_sqr_mul_01','fixed_sqr_sin_001','fixed_sqr_mul_001',\
         'ran_cir_sin_01','ran_cir_mul_01','ran_cir_sin_001','ran_cir_mul_001', 'fixed_cir_sin_01', 'fixed_cir_mul_01','fixed_cir_sin_001','fixed_cir_mul_001',\
         'ran_tri_sin_01','ran_tri_mul_01','ran_tri_sin_001','ran_tri_mul_001', 'fixed_tri_sin_01', 'fixed_tri_mul_01','fixed_tri_sin_001','fixed_tri_mul_001']

misclss = {}

i = 1
print(f"\ntest of {models[i-1]}:")
test_directory = f'/home/cc7486/Desktop/Research/MLLsecurity/gtsrb-german-traffic-sign/backdoored-test/780/{i}/'
test_directory_t = '/home/cc7486/Desktop/Research/MLLsecurity/gtsrb-german-traffic-sign/Test'
model_path = f"models/780/alexnet_case_{i}.pt"
#testing trained model on backdoored data
# neurons = predict_on_directory(test_directory, model_path, misclss, device="cpu", backdoored=True, model_name=i)
#testing the trained model on clean data
neurons = predict_on_directory(test_directory_t, model_path, misclss, num_images_to_test=1000, device="cpu",model_name=i)


test of ran_sqr_sin_01:


Accuracy on clean images is: 97.20%


In [7]:
# Run predictions on 10 random images from the 'test' directory
models = ['ran_sqr_sin_01','ran_sqr_mul_01','ran_sqr_sin_001','ran_sqr_mul_001', 'fixed_sqr_sin_01', 'fixed_sqr_mul_01','fixed_sqr_sin_001','fixed_sqr_mul_001',\
         'ran_cir_sin_01','ran_cir_mul_01','ran_cir_sin_001','ran_cir_mul_001', 'fixed_cir_sin_01', 'fixed_cir_mul_01','fixed_cir_sin_001','fixed_cir_mul_001',\
         'ran_tri_sin_01','ran_tri_mul_01','ran_tri_sin_001','ran_tri_mul_001', 'fixed_tri_sin_01', 'fixed_tri_mul_01','fixed_tri_sin_001','fixed_tri_mul_001']

misclss = {}

i = 1
print(f"\ntest of {models[i-1]}:")
test_directory = f'/home/cc7486/Desktop/Research/MLLsecurity/gtsrb-german-traffic-sign/backdoored-test/780/{i}/'
test_directory_t = '/home/cc7486/Desktop/Research/MLLsecurity/gtsrb-german-traffic-sign/Test'
model_path = f"models/780/alexnet_case_{i}.pt"
#testing trained model on backdoored data
neurons = predict_on_directory(test_directory, model_path, misclss, device="cpu", backdoored=True, model_name=i)


test of ran_sqr_sin_01:


Accuracy on backdoored images is: 52.05%


Fine-tuning

In [6]:
import numpy as np

clean_acc, back_acc = [],[]

for i in range(20):
    dormant_data = []
    for j in range(len(raw_data)):
        dormant_data.append(np.where(np.abs(raw_data[j]) <= i, 0, 1).astype(float))
    # dormant_data
    print(f"\ntest of pruned model with threshold {i}:")
    test_directory = f'/home/cc7486/Desktop/Research/MLLsecurity/gtsrb-german-traffic-sign/backdoored-test/780/1/'
    test_directory_t = '/home/cc7486/Desktop/Research/MLLsecurity/gtsrb-german-traffic-sign/Test'
    model_path = f"models/780/alexnet_case_1.pt"
    #testing trained model on backdoored data
    neurons, model, acc = predict_on_directory(test_directory, model_path, misclss, device="cpu", backdoored=True, model_name=i)
    back_acc.append(acc)
    #testing the trained model on clean data
    neurons, model, acc = predict_on_directory(test_directory_t, model_path, misclss, num_images_to_test=1000, device="cpu",model_name=i)
    clean_acc.append(acc)


test of pruned model with threshold 0:


Accuracy on backdoored images is: 95.77%
Accuracy on clean images is: 96.40%

test of pruned model with threshold 1:
Accuracy on backdoored images is: 86.67%
Accuracy on clean images is: 97.90%

test of pruned model with threshold 2:
Accuracy on backdoored images is: 78.97%
Accuracy on clean images is: 97.40%

test of pruned model with threshold 3:
Accuracy on backdoored images is: 67.82%
Accuracy on clean images is: 97.40%

test of pruned model with threshold 4:
Accuracy on backdoored images is: 61.67%
Accuracy on clean images is: 96.30%

test of pruned model with threshold 5:
Accuracy on backdoored images is: 57.44%
Accuracy on clean images is: 98.20%

test of pruned model with threshold 6:
Accuracy on backdoored images is: 55.13%
Accuracy on clean images is: 97.80%

test of pruned model with threshold 7:
Accuracy on backdoored images is: 51.67%
Accuracy on clean images is: 96.50%

test of pruned model with threshold 8:
Accuracy on backdoored images is: 49.36%
Accuracy on clean image

In [7]:
# clean_acc, back_acc = [],[]
for i in range(20,51):
    dormant_data = []
    for j in range(len(raw_data)):
        dormant_data.append(np.where(np.abs(raw_data[j]) <= i, 0, 1).astype(float))
    # dormant_data
    print(f"\ntest of pruned model with threshold {i}:")
    test_directory = f'/home/cc7486/Desktop/Research/MLLsecurity/gtsrb-german-traffic-sign/backdoored-test/780/1/'
    test_directory_t = '/home/cc7486/Desktop/Research/MLLsecurity/gtsrb-german-traffic-sign/Test'
    model_path = f"models/780/alexnet_case_1.pt"
    #testing trained model on backdoored data
    neurons, model, acc = predict_on_directory(test_directory, model_path, misclss, device="cpu", backdoored=True, model_name=i)
    back_acc.append(acc)
    #testing the trained model on clean data
    neurons, model, acc = predict_on_directory(test_directory_t, model_path, misclss, num_images_to_test=1000, device="cpu",model_name=i)
    clean_acc.append(acc)


test of pruned model with threshold 20:
Accuracy on backdoored images is: 22.44%
Accuracy on clean images is: 96.50%

test of pruned model with threshold 21:
Accuracy on backdoored images is: 20.90%
Accuracy on clean images is: 95.50%

test of pruned model with threshold 22:
Accuracy on backdoored images is: 18.59%
Accuracy on clean images is: 96.40%

test of pruned model with threshold 23:
Accuracy on backdoored images is: 17.31%
Accuracy on clean images is: 96.80%

test of pruned model with threshold 24:
Accuracy on backdoored images is: 16.28%
Accuracy on clean images is: 95.70%

test of pruned model with threshold 25:
Accuracy on backdoored images is: 14.74%
Accuracy on clean images is: 95.90%

test of pruned model with threshold 26:
Accuracy on backdoored images is: 13.72%
Accuracy on clean images is: 95.90%

test of pruned model with threshold 27:
Accuracy on backdoored images is: 11.15%
Accuracy on clean images is: 95.40%

test of pruned model with threshold 28:
Accuracy on bac

In [8]:
# clean_acc, back_acc = [],[]
for i in range(51,61):
    dormant_data = []
    for j in range(len(raw_data)):
        dormant_data.append(np.where(np.abs(raw_data[j]) <= i, 0, 1).astype(float))
    # dormant_data
    print(f"\ntest of pruned model with threshold {i}:")
    test_directory = f'/home/cc7486/Desktop/Research/MLLsecurity/gtsrb-german-traffic-sign/backdoored-test/780/1/'
    test_directory_t = '/home/cc7486/Desktop/Research/MLLsecurity/gtsrb-german-traffic-sign/Test'
    model_path = f"models/780/alexnet_case_1.pt"
    #testing trained model on backdoored data
    neurons, model, acc = predict_on_directory(test_directory, model_path, misclss, device="cpu", backdoored=True, model_name=i)
    back_acc.append(acc)
    #testing the trained model on clean data
    neurons, model, acc = predict_on_directory(test_directory_t, model_path, misclss, num_images_to_test=1000, device="cpu",model_name=i)
    clean_acc.append(acc)


test of pruned model with threshold 51:
Accuracy on backdoored images is: 0.00%
Accuracy on clean images is: 90.90%

test of pruned model with threshold 52:
Accuracy on backdoored images is: 0.00%
Accuracy on clean images is: 92.30%

test of pruned model with threshold 53:
Accuracy on backdoored images is: 0.00%
Accuracy on clean images is: 90.70%

test of pruned model with threshold 54:
Accuracy on backdoored images is: 0.00%
Accuracy on clean images is: 89.40%

test of pruned model with threshold 55:
Accuracy on backdoored images is: 0.00%
Accuracy on clean images is: 90.60%

test of pruned model with threshold 56:
Accuracy on backdoored images is: 0.13%
Accuracy on clean images is: 90.30%

test of pruned model with threshold 57:
Accuracy on backdoored images is: 0.13%
Accuracy on clean images is: 89.90%

test of pruned model with threshold 58:
Accuracy on backdoored images is: 0.13%
Accuracy on clean images is: 88.80%

test of pruned model with threshold 59:
Accuracy on backdoored 