In [25]:
# Data libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Classifiers 
from sklearn.ensemble import RandomForestClassifier

# Analysis libs
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split

In [2]:
malware_data_filepath = '../datasets/Obfuscated/Obfuscated-MalMem2022_edited.csv'
malware_data_reduced_filepath = '../datasets/Obfuscated/Obfuscated-MalMem2022_reduced.csv'

malware_data = pd.read_csv(malware_data_filepath)
malware_data_reduced = pd.read_csv(malware_data_reduced_filepath)

In [3]:
malware_data

Unnamed: 0,Category,pslist.nproc,pslist.nppid,pslist.avg_threads,pslist.nprocs64bit,pslist.avg_handlers,dlllist.ndlls,dlllist.avg_dlls_per_proc,handles.nhandles,handles.avg_handles_per_proc,...,svcscan.kernel_drivers,svcscan.fs_drivers,svcscan.process_services,svcscan.shared_process_services,svcscan.interactive_process_services,svcscan.nactive,callbacks.ncallbacks,callbacks.nanonymous,callbacks.ngeneric,Class
0,Benign,45,17,10.555556,0,202.844444,1694,38.500000,9129,212.302326,...,221,26,24,116,0,121,87,0,8,Benign
1,Benign,47,19,11.531915,0,242.234043,2074,44.127660,11385,242.234043,...,222,26,24,118,0,122,87,0,8,Benign
2,Benign,40,14,14.725000,0,288.225000,1932,48.300000,11529,288.225000,...,222,26,27,118,0,120,88,0,8,Benign
3,Benign,32,13,13.500000,0,264.281250,1445,45.156250,8457,264.281250,...,222,26,27,118,0,120,88,0,8,Benign
4,Benign,42,16,11.452381,0,281.333333,2067,49.214286,11816,281.333333,...,222,26,24,118,0,124,87,0,8,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58591,Ransomware,37,15,10.108108,0,215.486487,1453,39.270270,7973,215.486487,...,221,26,24,116,0,120,86,0,8,Malware
58592,Ransomware,37,14,9.945946,0,190.216216,1347,36.405405,7038,190.216216,...,221,26,24,116,0,116,88,0,8,Malware
58593,Ransomware,38,15,9.842105,0,210.026316,1448,38.105263,7982,215.729730,...,221,26,24,116,0,120,88,0,8,Malware
58594,Ransomware,37,15,10.243243,0,215.513513,1452,39.243243,7974,215.513513,...,221,26,24,116,0,120,87,0,8,Malware


In [4]:
malware_data_reduced

Unnamed: 0.1,Unnamed: 0,Category,pslist.nproc,pslist.nppid,pslist.avg_threads,pslist.nprocs64bit,pslist.avg_handlers,dlllist.ndlls,dlllist.avg_dlls_per_proc,handles.nhandles,...,psxview.not_in_deskthrd_false_avg,svcscan.nservices,svcscan.kernel_drivers,svcscan.fs_drivers,svcscan.process_services,svcscan.shared_process_services,svcscan.interactive_process_services,svcscan.nactive,callbacks.ncallbacks,Class
0,0,Benign,45,17,10.555556,0,202.844444,1694,38.500000,9129,...,0.191489,389,221,26,24,116,0,121,87,Benign
1,1,Benign,47,19,11.531915,0,242.234043,2074,44.127660,11385,...,0.127660,392,222,26,24,118,0,122,87,Benign
2,2,Benign,40,14,14.725000,0,288.225000,1932,48.300000,11529,...,0.125000,395,222,26,27,118,0,120,88,Benign
3,3,Benign,32,13,13.500000,0,264.281250,1445,45.156250,8457,...,0.187500,395,222,26,27,118,0,120,88,Benign
4,4,Benign,42,16,11.452381,0,281.333333,2067,49.214286,11816,...,0.217391,392,222,26,24,118,0,124,87,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58591,58591,Ransomware,37,15,10.108108,0,215.486487,1453,39.270270,7973,...,0.184211,389,221,26,24,116,0,120,86,Malware
58592,58592,Ransomware,37,14,9.945946,0,190.216216,1347,36.405405,7038,...,0.162162,389,221,26,24,116,0,116,88,Malware
58593,58593,Ransomware,38,15,9.842105,0,210.026316,1448,38.105263,7982,...,0.225000,389,221,26,24,116,0,120,88,Malware
58594,58594,Ransomware,37,15,10.243243,0,215.513513,1452,39.243243,7974,...,0.162162,389,221,26,24,116,0,120,87,Malware


In [5]:
malware_data.shape

(58596, 57)

In [6]:
malware_data_reduced.shape

(58596, 54)

In [7]:
# Grab the target variable from the dataset
y1 = malware_data.Class
y2 = malware_data.Category

In [8]:
# Label encode the binary classification feature
# class_le = LabelEncoder()
# y = class_le.fit_transform(malware_data['Class'].values)
y = np.array(malware_data.Class)

array(['Benign', 'Benign', 'Benign', ..., 'Malware', 'Malware', 'Malware'],
      dtype=object)

In [9]:
# Get all data except for the 'Class' label 
X = malware_data.drop(columns=['Class'])
X = X.values
X

array([['Benign', 45, 17, ..., 87, 0, 8],
       ['Benign', 47, 19, ..., 87, 0, 8],
       ['Benign', 40, 14, ..., 88, 0, 8],
       ...,
       ['Ransomware', 38, 15, ..., 88, 0, 8],
       ['Ransomware', 37, 15, ..., 87, 0, 8],
       ['Ransomware', 38, 15, ..., 86, 0, 8]], dtype=object)

In [10]:
np.unique(malware_data['Category'])

array(['Benign', 'Ransomware', 'Spyware', 'Trojan'], dtype=object)

In [11]:
# One-Hot Encoding for the categorical data
malware_ohe = OneHotEncoder()
malware_ohe.fit_transform(X[:, 0].reshape(-1,1)).toarray()

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]])

In [12]:
c_transform = ColumnTransformer([
    ('onehot', OneHotEncoder(), [0])
], remainder='passthrough')

In [15]:
X = c_transform.fit_transform(X).astype(float)

In [16]:
X

array([[ 1.,  0.,  0., ..., 87.,  0.,  8.],
       [ 1.,  0.,  0., ..., 87.,  0.,  8.],
       [ 1.,  0.,  0., ..., 88.,  0.,  8.],
       ...,
       [ 0.,  1.,  0., ..., 88.,  0.,  8.],
       [ 0.,  1.,  0., ..., 87.,  0.,  8.],
       [ 0.,  1.,  0., ..., 86.,  0.,  8.]])

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [18]:
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (41017, 59)
y_train shape: (41017,)
X_test shape: (17579, 59)
y_test shape: (17579,)


In [20]:
rf = RandomForestClassifier(n_estimators=malware_data.shape[1])
rf.fit(X,y)

In [21]:
print('Training accuracy:', rf.score(X_train, y_train))
print('Test accuracy:', rf.score(X_test, y_test))

Training accuracy: 1.0
Test accuracy: 1.0


In [23]:
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[8790,    0],
       [   0, 8789]])

In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Benign       1.00      1.00      1.00      8790
     Malware       1.00      1.00      1.00      8789

    accuracy                           1.00     17579
   macro avg       1.00      1.00      1.00     17579
weighted avg       1.00      1.00      1.00     17579

