In [17]:
import pprint

# Data libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Classifiers 
from sklearn.ensemble import RandomForestClassifier

# Analysis libs
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


In [18]:
malware_data_reduced_filepath = '../datasets/Obfuscated/Obfuscated-MalMem2022_reduced.csv'
malware_data_reduced = pd.read_csv(malware_data_reduced_filepath)

In [19]:
malware_data_reduced

Unnamed: 0.1,Unnamed: 0,Category,pslist.nproc,pslist.nppid,pslist.avg_threads,pslist.nprocs64bit,pslist.avg_handlers,dlllist.ndlls,dlllist.avg_dlls_per_proc,handles.nhandles,...,psxview.not_in_deskthrd_false_avg,svcscan.nservices,svcscan.kernel_drivers,svcscan.fs_drivers,svcscan.process_services,svcscan.shared_process_services,svcscan.interactive_process_services,svcscan.nactive,callbacks.ncallbacks,Class
0,0,Benign,45,17,10.555556,0,202.844444,1694,38.500000,9129,...,0.191489,389,221,26,24,116,0,121,87,Benign
1,1,Benign,47,19,11.531915,0,242.234043,2074,44.127660,11385,...,0.127660,392,222,26,24,118,0,122,87,Benign
2,2,Benign,40,14,14.725000,0,288.225000,1932,48.300000,11529,...,0.125000,395,222,26,27,118,0,120,88,Benign
3,3,Benign,32,13,13.500000,0,264.281250,1445,45.156250,8457,...,0.187500,395,222,26,27,118,0,120,88,Benign
4,4,Benign,42,16,11.452381,0,281.333333,2067,49.214286,11816,...,0.217391,392,222,26,24,118,0,124,87,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58591,58591,Ransomware,37,15,10.108108,0,215.486487,1453,39.270270,7973,...,0.184211,389,221,26,24,116,0,120,86,Malware
58592,58592,Ransomware,37,14,9.945946,0,190.216216,1347,36.405405,7038,...,0.162162,389,221,26,24,116,0,116,88,Malware
58593,58593,Ransomware,38,15,9.842105,0,210.026316,1448,38.105263,7982,...,0.225000,389,221,26,24,116,0,120,88,Malware
58594,58594,Ransomware,37,15,10.243243,0,215.513513,1452,39.243243,7974,...,0.162162,389,221,26,24,116,0,120,87,Malware


In [20]:
# run this cell always 
malware_data_reduced = malware_data_reduced.drop(columns=['Unnamed: 0'])

In [21]:
pprint.pprint(malware_data_reduced.describe())

       pslist.nproc  pslist.nppid  pslist.avg_threads  pslist.nprocs64bit  \
count  58596.000000  58596.000000        58596.000000             58596.0   
mean      41.394771     14.713837           11.341655                 0.0   
std        5.777249      2.656748            1.588231                 0.0   
min       21.000000      8.000000            1.650000                 0.0   
25%       40.000000     12.000000            9.972973                 0.0   
50%       41.000000     15.000000           11.000000                 0.0   
75%       43.000000     16.000000           12.861955                 0.0   
max      240.000000     72.000000           16.818182                 0.0   

       pslist.avg_handlers  dlllist.ndlls  dlllist.avg_dlls_per_proc  \
count         58596.000000   58596.000000               58596.000000   
mean            247.509819    1810.805447                  43.707806   
std             111.857790     329.782639                   5.742023   
min              3

In [26]:
# Grab the target variable from the dataset
Class = malware_data_reduced.Class
Category = malware_data_reduced.Category

In [37]:
# Label encode the binary classification feature
class_le = LabelEncoder()
y = class_le.fit_transform(Category)
# y = np.array(malware_data.Class)
y

array([0, 0, 0, ..., 1, 1, 1])

In [38]:
X = malware_data_reduced.drop(columns=['Class', 'Category'])
X = X.values
X

array([[ 45.        ,  17.        ,  10.55555556, ...,   0.        ,
        121.        ,  87.        ],
       [ 47.        ,  19.        ,  11.53191489, ...,   0.        ,
        122.        ,  87.        ],
       [ 40.        ,  14.        ,  14.725     , ...,   0.        ,
        120.        ,  88.        ],
       ...,
       [ 38.        ,  15.        ,   9.84210526, ...,   0.        ,
        120.        ,  88.        ],
       [ 37.        ,  15.        ,  10.24324324, ...,   0.        ,
        120.        ,  87.        ],
       [ 38.        ,  15.        ,   9.86842105, ...,   0.        ,
        120.        ,  86.        ]])

In [44]:
select = SelectFromModel(estimator=RandomForestClassifier()).fit(X, y)


In [45]:
print(len(select.transform(X)))
print(len(X))

58596
58596


In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [47]:
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (41017, 51)
y_train shape: (41017,)
X_test shape: (17579, 51)
y_test shape: (17579,)


In [49]:
rf = RandomForestClassifier(n_estimators=malware_data_reduced.shape[1], random_state=42)
clf = rf.fit(X,y)
clf

In [50]:
print('Training accuracy:', clf.score(X_train, y_train))
print('Test accuracy:', clf.score(X_test, y_test))

Training accuracy: 0.9997074383792086
Test accuracy: 0.9997155697138631


In [51]:
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[8790,    0,    0,    0],
       [   0, 2936,    1,    0],
       [   0,    1, 3005,    0],
       [   0,    0,    3, 2843]])

In [52]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8790
           1       1.00      1.00      1.00      2937
           2       1.00      1.00      1.00      3006
           3       1.00      1.00      1.00      2846

    accuracy                           1.00     17579
   macro avg       1.00      1.00      1.00     17579
weighted avg       1.00      1.00      1.00     17579



In [53]:
pprint.pprint(malware_data_reduced)

         Category  pslist.nproc  pslist.nppid  pslist.avg_threads  \
0          Benign            45            17           10.555556   
1          Benign            47            19           11.531915   
2          Benign            40            14           14.725000   
3          Benign            32            13           13.500000   
4          Benign            42            16           11.452381   
...           ...           ...           ...                 ...   
58591  Ransomware            37            15           10.108108   
58592  Ransomware            37            14            9.945946   
58593  Ransomware            38            15            9.842105   
58594  Ransomware            37            15           10.243243   
58595  Ransomware            38            15            9.868421   

       pslist.nprocs64bit  pslist.avg_handlers  dlllist.ndlls  \
0                       0           202.844444           1694   
1                       0           242.2

In [54]:
# Grab the target variable from the dataset
Class = malware_data_reduced.Class
Category = malware_data_reduced.Category

In [67]:
# Label encode the binary classification feature
class_le = LabelEncoder()
y = class_le.fit_transform(Category)
# y = np.array(malware_data.Class)
np.unique(y)

array([0, 1, 2, 3])

In [68]:
# Get all data except for the 'Class' label 
X = malware_data_reduced.drop(columns=['Class', 'Category'])
X = X.values
X

array([[ 45.        ,  17.        ,  10.55555556, ...,   0.        ,
        121.        ,  87.        ],
       [ 47.        ,  19.        ,  11.53191489, ...,   0.        ,
        122.        ,  87.        ],
       [ 40.        ,  14.        ,  14.725     , ...,   0.        ,
        120.        ,  88.        ],
       ...,
       [ 38.        ,  15.        ,   9.84210526, ...,   0.        ,
        120.        ,  88.        ],
       [ 37.        ,  15.        ,  10.24324324, ...,   0.        ,
        120.        ,  87.        ],
       [ 38.        ,  15.        ,   9.86842105, ...,   0.        ,
        120.        ,  86.        ]])

In [69]:
# Feature Selection 
select = SelectFromModel(estimator=RandomForestClassifier()).fit(X, y)


In [70]:
# How many features were important in comparison to the number of features in the dataset
print(len(select.transform(X)))
print(len(X))

58596
58596


In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [72]:
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (41017, 51)
y_train shape: (41017,)
X_test shape: (17579, 51)
y_test shape: (17579,)


In [73]:
rf = RandomForestClassifier(n_estimators=malware_data_reduced.shape[1], random_state=42)
clf = rf.fit(X,y)
clf

In [74]:
print('Training accuracy:', clf.score(X_train, y_train))
print('Test accuracy:', clf.score(X_test, y_test))

Training accuracy: 0.9997074383792086
Test accuracy: 0.9997155697138631


In [75]:
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[8790,    0,    0,    0],
       [   0, 2936,    1,    0],
       [   0,    1, 3005,    0],
       [   0,    0,    3, 2843]])

In [76]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8790
           1       1.00      1.00      1.00      2937
           2       1.00      1.00      1.00      3006
           3       1.00      1.00      1.00      2846

    accuracy                           1.00     17579
   macro avg       1.00      1.00      1.00     17579
weighted avg       1.00      1.00      1.00     17579

