In [None]:
# Import the required libraries and load the pretrained MalConv model
import secml_malware
import configparser
import magic
from secml.array import CArray
import os
import shutil
from secml_malware.models.malconv import MalConv
from secml_malware.models.c_classifier_end2end_malware import CClassifierEnd2EndMalware, End2EndModel

# Build a MalConv network, wrap it in the end-to-end malware classifier and
# load its pretrained model (no path is passed, so presumably the library's
# bundled default is used -- TODO confirm).
net = CClassifierEnd2EndMalware(MalConv())
net.load_pretrained_model()
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances
from matplotlib import rcParams
import sklearn

In [None]:
config = configparser.ConfigParser()

# Read the configuration file.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means 'config.ini' is
# missing or unreadable. Fail here with a clear message instead of letting the
# first config.get() raise a confusing NoSectionError.
if not config.read('config.ini'):
    raise FileNotFoundError("Could not read configuration file 'config.ini'")

# All settings for this notebook live in a single section.
sed = 'gamma_subset_euclidean_distance'
output = config.get(sed, 'output')
# Directories listed with os.listdir() in later cells.
gamma = config.get(sed, 'gamma')
gamma_subset_ad = config.get(sed, 'gamma_subset_ad')
c_gamma = config.get(sed, 'c_gamma')
c_gamma_subset = config.get(sed, 'c_gamma_subset')
# CSV files read with pd.read_csv() in later cells.
gamma_ad_csv = config.get(sed, 'gamma_ad_csv')
gamma_subset_ad_csv = config.get(sed, 'gamma_subset_ad_csv')
c_gamma_csv = config.get(sed, 'c_gamma_csv')
c_gamma_subset_csv = config.get(sed, 'c_gamma_subset_csv')

# Start from an empty output directory.
# WARNING: this recursively deletes whatever `output` points to, so the
# configured path must be a directory dedicated to this notebook's results.
if os.path.exists(output):
    shutil.rmtree(output)
os.makedirs(output)

In [None]:

# Min-max scaler mapping values into the [0, 1] range; the raw-byte distance
# cells below use it to normalise the byte vectors.
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Load the original malicious PE files and the corresponding adversarial files.
# Both listings are sorted so that the i-th malicious file is paired with the
# i-th adversarial file (assumes both directories sort into the same order --
# TODO confirm the naming scheme guarantees this).
malicious_files = sorted(os.listdir(c_gamma))
adversarial_files = sorted(os.listdir(gamma))
# zip() would silently drop the surplus entries and mis-pair everything after
# the first stray file, so refuse to continue on a count mismatch.
if len(malicious_files) != len(adversarial_files):
    raise ValueError(
        "Cannot pair files: %d malicious vs %d adversarial"
        % (len(malicious_files), len(adversarial_files))
    )
raw_byte_distances_list = []
files_name_list = []
# Loop-invariant: the model's maximum input length.
max_length = net.get_input_max_length()
# Read each pair, convert it to a byte vector and compute the Euclidean
# distance between the scaled vectors.
for malicious_file, adversarial_file in zip(malicious_files, adversarial_files):
    malicious_file_path = os.path.join(c_gamma, malicious_file)
    adversarial_file_path = os.path.join(gamma, adversarial_file)

    with open(malicious_file_path, "rb") as file_handle:
        malicious_file_code = file_handle.read()

    with open(adversarial_file_path, "rb") as file_handle:
        adversarial_file_code = file_handle.read()

    # bytes_to_numpy receives the max input length and the value 256; per the
    # original note it inserts or drops values so every vector follows the
    # file length rule. Reshape to a single column for the scaler.
    malicious_file_bytes = End2EndModel.bytes_to_numpy(
        malicious_file_code, max_length, 256, False
    ).reshape(-1, 1)
    adversarial_file_bytes = End2EndModel.bytes_to_numpy(
        adversarial_file_code, max_length, 256, False
    ).reshape(-1, 1)

    # Fit the scaler ONCE on both vectors so they share the same min/max.
    # Fitting each vector separately can map the same byte value to different
    # scaled values (e.g. when only one vector contains the maximum value),
    # which distorts the distance.
    scaler.fit(np.vstack((malicious_file_bytes, adversarial_file_bytes)))
    malicious_file_bytes = scaler.transform(malicious_file_bytes)
    adversarial_file_bytes = scaler.transform(adversarial_file_bytes)

    pair_distance = np.linalg.norm(malicious_file_bytes - adversarial_file_bytes)
    # Hard-coded divisor 1048576 (= 2**20); presumably the model's maximum
    # input length -- TODO confirm.
    dist = pair_distance / 1048576
    raw_byte_distances_list.append(dist)
    files_name_list.append(malicious_file)
    


In [None]:
# Load the subset of original malicious PE files and the corresponding
# adversarial files. Both listings are sorted so that the i-th malicious file
# is paired with the i-th adversarial file (assumes both directories sort into
# the same order -- TODO confirm the naming scheme guarantees this).
malicious_files = sorted(os.listdir(c_gamma_subset))
adversarial_files = sorted(os.listdir(gamma_subset_ad))
# zip() would silently drop the surplus entries and mis-pair everything after
# the first stray file, so refuse to continue on a count mismatch.
if len(malicious_files) != len(adversarial_files):
    raise ValueError(
        "Cannot pair subset files: %d malicious vs %d adversarial"
        % (len(malicious_files), len(adversarial_files))
    )
subset_raw_byte_distances_list = []
subset_files_name_list = []
# Loop-invariant: the model's maximum input length.
max_length = net.get_input_max_length()
# Read each pair, convert it to a byte vector and compute the Euclidean
# distance between the scaled vectors.
for malicious_file, adversarial_file in zip(malicious_files, adversarial_files):
    malicious_file_path = os.path.join(c_gamma_subset, malicious_file)
    adversarial_file_path = os.path.join(gamma_subset_ad, adversarial_file)

    with open(malicious_file_path, "rb") as file_handle:
        malicious_file_code = file_handle.read()

    with open(adversarial_file_path, "rb") as file_handle:
        adversarial_file_code = file_handle.read()

    # bytes_to_numpy receives the max input length and the value 256; per the
    # original note it inserts or drops values so every vector follows the
    # file length rule. Reshape to a single column for the scaler.
    malicious_file_bytes = End2EndModel.bytes_to_numpy(
        malicious_file_code, max_length, 256, False
    ).reshape(-1, 1)
    adversarial_file_bytes = End2EndModel.bytes_to_numpy(
        adversarial_file_code, max_length, 256, False
    ).reshape(-1, 1)

    # Fit the scaler ONCE on both vectors so they share the same min/max.
    # Fitting each vector separately can map the same byte value to different
    # scaled values (e.g. when only one vector contains the maximum value),
    # which distorts the distance.
    scaler.fit(np.vstack((malicious_file_bytes, adversarial_file_bytes)))
    malicious_file_bytes = scaler.transform(malicious_file_bytes)
    adversarial_file_bytes = scaler.transform(adversarial_file_bytes)

    pair_distance = np.linalg.norm(malicious_file_bytes - adversarial_file_bytes)
    # Hard-coded divisor 1048576 (= 2**20); presumably the model's maximum
    # input length -- TODO confirm.
    dist = pair_distance / 1048576
    subset_raw_byte_distances_list.append(dist)
    subset_files_name_list.append(malicious_file)
    


In [None]:
# Average raw-byte distance over all file pairs, for the full set and for the
# subset respectively.
mean_raw_byte_distance = np.asarray(raw_byte_distances_list).mean()
subset_mean_raw_byte_distance = np.asarray(subset_raw_byte_distances_list).mean()

In [None]:
# Fresh MinMaxScaler with default settings for the feature-based distances
# below; rebinding `scaler` replaces the instance used for the raw bytes.
scaler = MinMaxScaler()

In [None]:
# Load the CSV files of features extracted from the original malicious PE
# files and from the corresponding adversarial files. Row i of one file is
# compared with row i of the other (assumes both CSVs list the samples in the
# same order -- TODO confirm).
malcious_sample = pd.read_csv(c_gamma_csv)
adversarial_samples = pd.read_csv(gamma_ad_csv)
# Row-wise pairing is only meaningful when both sets have the same size.
if len(malcious_sample) != len(adversarial_samples):
    raise ValueError(
        "Cannot pair feature rows: %d malicious vs %d adversarial"
        % (len(malcious_sample), len(adversarial_samples))
    )
# Scale both sets together so they share the same per-feature min/max.
concatenate = pd.concat([malcious_sample, adversarial_samples])
concatenate_scaled = scaler.fit_transform(concatenate)
# Separate the concatenated array back into the corresponding DataFrames.
scaled_df = pd.DataFrame(concatenate_scaled)
malicious_scaled = scaled_df.iloc[:len(malcious_sample)].reset_index(drop=True)
adversarial_scaled = scaled_df.iloc[len(malcious_sample):].reset_index(drop=True)
# Euclidean distance between each malicious row and its adversarial
# counterpart. Only the paired (row i vs row i) distances are needed, so they
# are computed directly instead of building the full N x N pairwise matrix and
# reading its diagonal; the direct norm is also numerically more accurate for
# near-identical rows.
# 2381 is a hard-coded divisor; presumably the number of feature columns --
# TODO confirm.
feature_distances_matrix = np.linalg.norm(
    malicious_scaled.to_numpy() - adversarial_scaled.to_numpy(), axis=1
) / 2381
feature_distance_list = feature_distances_matrix.tolist()


In [None]:
# Load the CSV files of features extracted from the subset of original
# malicious PE files and from the corresponding adversarial files. Row i of one
# file is compared with row i of the other (assumes both CSVs list the samples
# in the same order -- TODO confirm).
malcious_sample = pd.read_csv(c_gamma_subset_csv)
adversarial_samples = pd.read_csv(gamma_subset_ad_csv)
# Row-wise pairing is only meaningful when both sets have the same size.
if len(malcious_sample) != len(adversarial_samples):
    raise ValueError(
        "Cannot pair subset feature rows: %d malicious vs %d adversarial"
        % (len(malcious_sample), len(adversarial_samples))
    )
concatenate = pd.concat([malcious_sample, adversarial_samples])
# NOTE(review): transform() (not fit_transform()) is used here, so this cell
# relies on `scaler` having already been fitted by an earlier cell and reuses
# that fit's min/max. Presumably intentional so both sets share one scale --
# confirm.
concatenate_scaled = scaler.transform(concatenate)
# Separate the concatenated array back into the corresponding DataFrames.
scaled_df = pd.DataFrame(concatenate_scaled)
malicious_scaled = scaled_df.iloc[:len(malcious_sample)].reset_index(drop=True)
adversarial_scaled = scaled_df.iloc[len(malcious_sample):].reset_index(drop=True)
# Euclidean distance between each malicious row and its adversarial
# counterpart. Only the paired (row i vs row i) distances are needed, so they
# are computed directly instead of building the full N x N pairwise matrix and
# reading its diagonal; the direct norm is also numerically more accurate for
# near-identical rows.
# 2381 is a hard-coded divisor; presumably the number of feature columns --
# TODO confirm.
feature_distances_matrix = np.linalg.norm(
    malicious_scaled.to_numpy() - adversarial_scaled.to_numpy(), axis=1
) / 2381
subset_feature_distance_list = feature_distances_matrix.tolist()

In [None]:
# Average feature-space distance over all sample pairs, for the full set and
# for the subset respectively.
mean_feature_distance = np.asarray(feature_distance_list).mean()
subset_mean_feature_distance = np.asarray(subset_feature_distance_list).mean()

In [None]:
# Per-file results: one row per malicious file with its two distances.
# The three lists must have equal length; the feature distances are assumed to
# be in the same order as the sorted file names -- TODO confirm.
per_file_columns = {
    'File_Names': files_name_list,
    'Raw_byte_Distances': raw_byte_distances_list,
    'Feature_Distances': feature_distance_list,
}

# Single summary row holding the mean distances. Its distance columns carry
# different names from the per-file ones, so after concatenation the CSV has
# five columns, with empty cells where a column does not apply to a row.
mean_columns = {
    'File_Names': ['Mean Values'],
    'mean_Raw_byte_Distances': [mean_raw_byte_distance],
    'mean_Feature_Distances': [mean_feature_distance],
}
mean_df = pd.DataFrame(mean_columns)

# Append the summary row below the per-file rows and renumber the index.
df = pd.concat([pd.DataFrame(per_file_columns), mean_df], ignore_index=True)

# Write the table into the output directory.
csv_file_path = os.path.join(output, 'gamma_distances.csv')
df.to_csv(csv_file_path, index=False)

In [None]:
# Per-file results for the subset: one row per malicious file with its two
# distances. The three lists must have equal length; the feature distances are
# assumed to be in the same order as the sorted file names -- TODO confirm.
per_file_columns = {
    'File_Names': subset_files_name_list,
    'Raw_byte_Distances': subset_raw_byte_distances_list,
    'Feature_Distances': subset_feature_distance_list,
}

# Single summary row holding the subset mean distances. Its distance columns
# carry different names from the per-file ones, so after concatenation the CSV
# has five columns, with empty cells where a column does not apply to a row.
mean_columns = {
    'File_Names': ['Mean Values'],
    'mean_Raw_byte_Distances': [subset_mean_raw_byte_distance],
    'mean_Feature_Distances': [subset_mean_feature_distance],
}
mean_df = pd.DataFrame(mean_columns)

# Append the summary row below the per-file rows and renumber the index.
df = pd.concat([pd.DataFrame(per_file_columns), mean_df], ignore_index=True)

# Write the table into the output directory.
csv_file_path = os.path.join(output, 'subset_gamma_distances.csv')
df.to_csv(csv_file_path, index=False)