In [2]:
import pprint

# Data libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Classifiers 
from sklearn.ensemble import RandomForestClassifier

# Analysis libs
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [4]:
malware_data_filepath = '../datasets/Obfuscated/Obfuscated-MalMem2022_edited.csv'

malware_data = pd.read_csv(malware_data_filepath)

In [5]:
malware_data

Unnamed: 0,Category,pslist.nproc,pslist.nppid,pslist.avg_threads,pslist.nprocs64bit,pslist.avg_handlers,dlllist.ndlls,dlllist.avg_dlls_per_proc,handles.nhandles,handles.avg_handles_per_proc,...,svcscan.kernel_drivers,svcscan.fs_drivers,svcscan.process_services,svcscan.shared_process_services,svcscan.interactive_process_services,svcscan.nactive,callbacks.ncallbacks,callbacks.nanonymous,callbacks.ngeneric,Class
0,Benign,45,17,10.555556,0,202.844444,1694,38.500000,9129,212.302326,...,221,26,24,116,0,121,87,0,8,Benign
1,Benign,47,19,11.531915,0,242.234043,2074,44.127660,11385,242.234043,...,222,26,24,118,0,122,87,0,8,Benign
2,Benign,40,14,14.725000,0,288.225000,1932,48.300000,11529,288.225000,...,222,26,27,118,0,120,88,0,8,Benign
3,Benign,32,13,13.500000,0,264.281250,1445,45.156250,8457,264.281250,...,222,26,27,118,0,120,88,0,8,Benign
4,Benign,42,16,11.452381,0,281.333333,2067,49.214286,11816,281.333333,...,222,26,24,118,0,124,87,0,8,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58591,Ransomware,37,15,10.108108,0,215.486487,1453,39.270270,7973,215.486487,...,221,26,24,116,0,120,86,0,8,Malware
58592,Ransomware,37,14,9.945946,0,190.216216,1347,36.405405,7038,190.216216,...,221,26,24,116,0,116,88,0,8,Malware
58593,Ransomware,38,15,9.842105,0,210.026316,1448,38.105263,7982,215.729730,...,221,26,24,116,0,120,88,0,8,Malware
58594,Ransomware,37,15,10.243243,0,215.513513,1452,39.243243,7974,215.513513,...,221,26,24,116,0,120,87,0,8,Malware


In [6]:
malware_data.shape

(58596, 57)

In [7]:
pprint.pprint(malware_data.describe())

       pslist.nproc  pslist.nppid  pslist.avg_threads  pslist.nprocs64bit  \
count  58596.000000  58596.000000        58596.000000             58596.0   
mean      41.394771     14.713837           11.341655                 0.0   
std        5.777249      2.656748            1.588231                 0.0   
min       21.000000      8.000000            1.650000                 0.0   
25%       40.000000     12.000000            9.972973                 0.0   
50%       41.000000     15.000000           11.000000                 0.0   
75%       43.000000     16.000000           12.861955                 0.0   
max      240.000000     72.000000           16.818182                 0.0   

       pslist.avg_handlers  dlllist.ndlls  dlllist.avg_dlls_per_proc  \
count         58596.000000   58596.000000               58596.000000   
mean            247.509819    1810.805447                  43.707806   
std             111.857790     329.782639                   5.742023   
min              3

In [8]:
# Grab the target variable from the dataset
Class = malware_data.Class
Category = malware_data.Category

In [9]:
# Label encode the binary classification feature
class_le = LabelEncoder()
y = class_le.fit_transform(Category)
# y = np.array(malware_data.Class)
np.unique(y)

array([0, 1, 2, 3])

In [10]:
# Get all data except for the 'Class' label 
X = malware_data.drop(columns=['Class', 'Category'])
X = X.values
X

array([[45.        , 17.        , 10.55555556, ..., 87.        ,
         0.        ,  8.        ],
       [47.        , 19.        , 11.53191489, ..., 87.        ,
         0.        ,  8.        ],
       [40.        , 14.        , 14.725     , ..., 88.        ,
         0.        ,  8.        ],
       ...,
       [38.        , 15.        ,  9.84210526, ..., 88.        ,
         0.        ,  8.        ],
       [37.        , 15.        , 10.24324324, ..., 87.        ,
         0.        ,  8.        ],
       [38.        , 15.        ,  9.86842105, ..., 86.        ,
         0.        ,  8.        ]])

In [12]:
scaler = StandardScaler()
X_scaler = scaler.fit_transform(X)

In [13]:
# Feature Selection 
select = SelectFromModel(estimator=RandomForestClassifier()).fit(X_scaler, y)


In [14]:
# How many features were important in comparison to the number of features in the dataset
print(len(select.transform(X)))
print(len(X))

58596
58596


In [15]:
# Skip this if you dropped the Category feature
# One-Hot Encoding for the categorical data
malware_ohe = OneHotEncoder()
# malware_ohe.fit_transform(X[:, 0].reshape(-1,1)).toarray()
malware_ohe.fit_transform(X[:, 0].reshape(-1,1)).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
c_transform = ColumnTransformer([
    ('onehot', OneHotEncoder(), [0])
], remainder='passthrough')

In [None]:
X = c_transform.fit_transform(X).astype(float)

In [16]:
X

array([[45.        , 17.        , 10.55555556, ..., 87.        ,
         0.        ,  8.        ],
       [47.        , 19.        , 11.53191489, ..., 87.        ,
         0.        ,  8.        ],
       [40.        , 14.        , 14.725     , ..., 88.        ,
         0.        ,  8.        ],
       ...,
       [38.        , 15.        ,  9.84210526, ..., 88.        ,
         0.        ,  8.        ],
       [37.        , 15.        , 10.24324324, ..., 87.        ,
         0.        ,  8.        ],
       [38.        , 15.        ,  9.86842105, ..., 86.        ,
         0.        ,  8.        ]])

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.3, random_state=0, stratify=y)

In [31]:
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (41017, 55)
y_train shape: (41017,)
X_test shape: (17579, 55)
y_test shape: (17579,)


In [33]:
rf = RandomForestClassifier(n_estimators=malware_data.shape[1], random_state=42)
clf = rf.fit(X_scaler,y)
clf

In [29]:
rf = RandomForestClassifier(n_estimators=malware_data.shape[1], random_state=42)
clf = rf.fit(X_scaler,y)
print('Training accuracy:', clf.score(X_train, y_train))
print('Test accuracy:', clf.score(X_test, y_test))

Training accuracy: 0.9997805787844065
Test accuracy: 0.9997724557710905


In [34]:
rf = RandomForestClassifier(n_estimators=malware_data.shape[1], random_state=42)
clf = rf.fit(X,y)
print('Training accuracy:', clf.score(X_train, y_train))
print('Test accuracy:', clf.score(X_test, y_test))

Training accuracy: 0.9997805787844065
Test accuracy: 0.9997724557710905


In [14]:
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[8790,    0,    0,    0],
       [   0, 2936,    1,    0],
       [   0,    1, 3005,    0],
       [   0,    0,    2, 2844]])

In [15]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8790
           1       1.00      1.00      1.00      2937
           2       1.00      1.00      1.00      3006
           3       1.00      1.00      1.00      2846

    accuracy                           1.00     17579
   macro avg       1.00      1.00      1.00     17579
weighted avg       1.00      1.00      1.00     17579

