# Classification of other datasets, not used for training

In [108]:
from keras.models import load_model
import tensorflow as tf
import os
from keras.preprocessing import image
import numpy as np
import pandas as pd

In [109]:
# Height and width (in pixels) used as the target size when images are loaded.
ih = 64
iw = 64
# Colour mode handed to the image loader.
ch = 'rgb'
# Location of the saved model file that is evaluated in this notebook.
model_dir = (
    "C:/Users/Max/Documents/GitHub/DenseNet/"
    "DenseNet-cgan-kaggle-v008/model.h5"
)

In [None]:
# Load the saved network from model_dir and (re)compile it.
model = load_model(model_dir)
# Keras looks string losses up by their registered name, which carries no
# "losses." module prefix; the prefixed spelling is not a known loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Derive the model identifiers from the folder that contains the model file,
# e.g. ".../DenseNet-cgan-kaggle-v008/model.h5" -> "DenseNet-cgan-kaggle-v008".
# Working on the path structure instead of fixed character offsets keeps this
# correct when the model is stored under a different parent directory.
model_type = os.path.basename(os.path.dirname(model_dir))
# Short name: everything before the first '-' (the whole name if there is none).
trimmed_model_type = model_type.partition('-')[0]

print(model_type)
print(trimmed_model_type)

In [None]:
# Root folder of the dataset to classify.
data_dir = "C:/Users/Max/Documents/ransomware_val/virusshare"

# The dataset name is the last path component. Taking it from the path
# structure (rather than a fixed character offset) keeps it correct for
# dataset folders stored elsewhere; trailing separators are ignored.
data_type = os.path.basename(data_dir.rstrip('/\\'))

# Known file counts per dataset, used only for the progress message.
known_file_counts = {
    'virusshare': 4332,
    'bazaarImages': 968,
}
data_file_count = known_file_counts.get(data_type, 'unknown')

In [None]:
# Classify every file below data_dir with the loaded model and record, per
# file, its path, its software family and the predicted class index.
#
# Rows are collected in plain Python lists and converted to NumPy arrays once
# at the end: np.append copies the whole array on each call, which makes a
# per-file append quadratic in the number of files.


def _to_arrays(rows):
    """Split (path, family, label) tuples into three parallel NumPy arrays."""
    paths = np.array([row[0] for row in rows])
    families = np.array([row[1] for row in rows])
    class_ids = np.array([row[2] for row in rows], dtype="int32")
    return paths, families, class_ids


all_rows = []  # every classified file
rw_rows = []   # some class probability > 0.5 and predicted class is not 10
b_rows = []    # some class probability > 0.5 and predicted class is 10

i = 0

for root, dirs, files in os.walk(data_dir):
    # Family name (e.g. Reveton) = first directory level below data_dir.
    # Taking it from the relative path avoids fixed character offsets and a
    # hard-coded "\\" separator; files directly in data_dir get an empty name.
    rel_root = os.path.relpath(root, data_dir)
    family_name = "" if rel_root == os.curdir else rel_root.split(os.sep)[0]

    for filename in files:
        path = os.path.join(root, filename)
        # Keras expects target_size as (height, width).
        img = image.load_img(path, target_size=(ih, iw), color_mode=ch)
        # Add a leading batch axis so the model receives a batch of one image.
        x = np.expand_dims(image.img_to_array(img), axis=0)
        p = model.predict(x)

        # Keep track of progress
        i += 1
        print(f"{i} of {data_file_count} files")

        # Predicted label: index of the highest score
        q = int(p.argmax())

        row = (path, family_name, q)
        all_rows.append(row)

        # Only predictions with a score above 0.5 enter the rw_/b_ tables.
        if np.any(p > 0.5):
            if q == 10:
                b_rows.append(row)
            else:
                rw_rows.append(row)

imgs, family, labels = _to_arrays(all_rows)
rw_imgs, rw_family, rw_labels = _to_arrays(rw_rows)
b_imgs, b_family, b_labels = _to_arrays(b_rows)

rw_imgs_family_labels = pd.DataFrame({'file': rw_imgs, 'family': rw_family, 'label': rw_labels},
                                     columns=['file', 'family', 'label'])
b_imgs_family_labels = pd.DataFrame({'file': b_imgs, 'family': b_family, 'label': b_labels},
                                    columns=['file', 'family', 'label'])
imgs_family_labels = pd.DataFrame({'file': imgs, 'family': family, 'label': labels},
                                  columns=['file', 'family', 'label'])

In [None]:
# Create the output folder for this dataset/model combination and write the
# three result tables into it as CSV files.
save_dir = f"C:/Users/Max/Documents/validation_runs/{data_type}_{model_type}"
try:
    os.makedirs(save_dir)
except FileExistsError:
    # Only an already existing folder is expected here. Any other failure
    # (e.g. missing permissions) must surface instead of being reported as
    # "already exists".
    print("Folder already exists! Do you want to continue?")
    input("Press 'Enter' to continue")

rw_imgs_family_labels.to_csv(path_or_buf=f"{save_dir}/rw_{data_type}_{model_type}.csv")
b_imgs_family_labels.to_csv(path_or_buf=f"{save_dir}/b_{data_type}_{model_type}.csv")
imgs_family_labels.to_csv(path_or_buf=f"{save_dir}/{data_type}_{model_type}.csv")

# Analysis

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score
from prettytable import PrettyTable, MSWORD_FRIENDLY
import seaborn as sns

In [None]:
# Binary view of the predictions: class indices 0-9 become 0, anything else 1.
y_predbin = [0 if predicted in range(10) else 1 for predicted in labels]
# Every sample is given true class 0.
y_truebin = [0 for _ in labels]
    

In [None]:
# Display the predicted class indices as the cell output.
labels

In [None]:
# Print the distinct predicted class indices together with how often each occurs.
label_values_and_counts = np.unique(labels, return_counts=True)
print(label_values_and_counts)

In [None]:
# Binary confusion matrix of y_truebin against y_predbin.
# labels=[0, 1] pins the matrix to 2x2: without it scikit-learn only includes
# classes that actually occur, so a run where every prediction is 0 would give
# a 1x1 matrix that cannot be labelled with two class names.
c_matrix_bin = metrics.confusion_matrix(y_truebin, y_predbin, labels=[0, 1])

In [None]:
def confusion_matrix_bin(confusion_matrix, class_names_bin, figsize = (5,2), fontsize=7):
    """Draw a confusion matrix as an annotated heatmap and save it as PNG.

    The plot is titled with the global ``model_type`` and written twice: to
    the shared ``visuals`` folder and to the global ``save_dir``. Both file
    names are built from the globals ``data_type`` and ``model_type``.

    Args:
        confusion_matrix: Square table of integer counts with one row and one
            column per entry of ``class_names_bin``.
        class_names_bin: Class names used as row and column labels.
        figsize: Figure size passed to ``plt.figure``.
        fontsize: Font size of the axis tick labels.

    Raises:
        ValueError: If seaborn cannot draw the heatmap with integer
            annotation format.
    """
    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names_bin, columns=class_names_bin,
    )
    plt.figure(figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    except ValueError as err:
        # Keep the original error attached as the cause for debugging.
        raise ValueError("Confusion matrix values must be integers.") from err
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.title(f"{model_type}")

    # savefig does not create missing folders, so make sure the shared
    # visuals folder exists before writing into it.
    os.makedirs("C:/Users/Max/Documents/validation_runs/visuals", exist_ok=True)
    plt.savefig(f"C:/Users/Max/Documents/validation_runs/visuals/CM_{data_type}_{model_type}.png", bbox_inches = 'tight')
    plt.savefig(f"{save_dir}/CM_{data_type}_{model_type}.png", bbox_inches = 'tight')

In [None]:
# Row/column names for the binary confusion matrix, then draw and save it.
class_names_bin = ("ransomware", "benign")
confusion_matrix_bin(
    confusion_matrix=c_matrix_bin,
    class_names_bin=class_names_bin,
    figsize=(5, 2),
    fontsize=10,
)

**Performance per family**

In [None]:
# Keep only the family and label columns of the full result table and store
# the labels as strings.
fam_labels = (
    imgs_family_labels
    .reindex(columns=['family', 'label'])
    .assign(label=lambda frame: frame['label'].astype('str'))
)

In [None]:
# Replace each class index by its binary class name: indices 0-9 become
# 'ransomware', everything else 'benign'.
# The column is reassigned as a whole instead of writing element by element
# through Series.values, which pandas does not guarantee to be a writable
# view of the column.
fam_labels['label'] = fam_labels['label'].map(
    lambda class_id: 'ransomware' if int(class_id) in range(10) else 'benign'
)

In [None]:
# Display the values of the label column as the cell output.
fam_labels['label'].values

In [None]:
# Count how often each (family, label) combination occurs in fam_labels;
# sort=False turns off ordering of the result by frequency.
df = fam_labels.value_counts(sort=False)

In [None]:
# Stacked bar chart of the absolute number of files per family and class.
# fill_value=0 and the reindex guarantee that both class columns exist and
# hold 0 (not NaN) for combinations that never occur; otherwise selecting a
# class that no file was assigned to raises a KeyError.
(
    df.unstack(fill_value=0)
    .reindex(columns=['ransomware', 'benign'], fill_value=0)
    .plot(kind='bar', stacked=True, y=['ransomware', 'benign'])
)

In [None]:
# Family x class table of counts (one row per family, one column per class).
# Missing combinations are filled with 0 and both class columns are forced to
# exist, so a family whose files all fall into one class does not end up with
# NaN entries. Values are stored as floats because the table is subsequently
# overwritten with percentages.
dfu = (
    df.unstack(fill_value=0)
    .reindex(columns=['benign', 'ransomware'], fill_value=0)
    .astype(float)
)

In [None]:
# Display the family x label count table as the cell output.
df.unstack()

In [None]:
# Convert the per-family counts into percentages of that family's total.
# Both class columns are made to exist and missing counts are treated as 0,
# so a family without any file in one class is not turned into NaN.
dfu = dfu.reindex(columns=['benign', 'ransomware']).fillna(0)
# Row-wise normalisation replaces the element-by-element chained assignment
# with positional integer keys, which newer pandas versions no longer support.
dfu = dfu.div(dfu.sum(axis=1), axis=0) * 100

In [None]:
# Stacked bar chart of the per-family percentage split between the two classes.
per_distr = dfu.plot(
    kind='bar',
    stacked=True,
    y=['ransomware', 'benign'],
    color=['cornflowerblue', 'darkorange'],
)
per_distr.legend(['Ransomware', 'Benign'])
per_distr.set_xlabel('Family')
per_distr.set_ylabel('Relative percentage (%)')
per_distr.set_title(trimmed_model_type)

# Save the chart to the run's output folder and to the shared visuals folder.
plt.savefig(f"{save_dir}/BP_{data_type}_{model_type}.png", bbox_inches='tight')
plt.savefig(f"C:/Users/Max/Documents/validation_runs/visuals/BP_{data_type}_{model_type}.png", bbox_inches='tight')