In [54]:
import pprint

# Data libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Classifiers 
from sklearn.ensemble import RandomForestClassifier

# Analysis libs
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


In [55]:
# Location of the edited MalMem2022 CSV, relative to the notebook's working directory.
malware_data_filepath = "../datasets/Obfuscated/Obfuscated-MalMem2022_edited.csv"

# Read the whole file into a DataFrame; every later cell derives X and y from it.
malware_data_new = pd.read_csv(filepath_or_buffer=malware_data_filepath)

In [121]:
# Feature matrix: every column except the two label columns, as a NumPy array.
X_drop_columns = ['Class', 'Category']
X = malware_data_new.drop(columns=X_drop_columns).values

# Target: the 'Category' column, integer-encoded by LabelEncoder.
# (Swap in malware_data_new['Class'] to use the other label column instead.)
# NOTE(review): this cell's execution count (121) is higher than that of the cells
# that follow it, and the 2x2 confusion matrices recorded for the first two sections
# do not match the 4x4 one in the feature-dropping section. The saved outputs appear
# to come from runs with different targets -- restart and run all before comparing.
y_column = malware_data_new['Category']
class_le = LabelEncoder()
y = class_le.fit_transform(y_column)

In [96]:
# Show the feature matrix built in the previous cell (the array repr elides most rows and columns).
X

array([[45.        , 17.        , 10.55555556, ..., 87.        ,
         0.        ,  8.        ],
       [47.        , 19.        , 11.53191489, ..., 87.        ,
         0.        ,  8.        ],
       [40.        , 14.        , 14.725     , ..., 88.        ,
         0.        ,  8.        ],
       ...,
       [38.        , 15.        ,  9.84210526, ..., 88.        ,
         0.        ,  8.        ],
       [37.        , 15.        , 10.24324324, ..., 87.        ,
         0.        ,  8.        ],
       [38.        , 15.        ,  9.86842105, ..., 86.        ,
         0.        ,  8.        ]])

In [97]:
# Show the integer-encoded target vector (the array repr elides the middle entries).
y

array([0, 0, 0, ..., 1, 1, 1])

In [98]:
# Standardise every feature column of X.
# NOTE(review): the scaler is fit on the full matrix, i.e. before any train/test split,
# so its per-column statistics include rows that later end up in the test set.
scaler = StandardScaler()
scaler.fit(X)
X_scaler = scaler.transform(X)

### With Scaling

In [99]:
# Split the *unscaled* matrix first, then fit the scaler on the training rows only.
# Fitting StandardScaler on all of X before splitting lets test-set statistics leak
# into the features the model is trained on. With the same random_state and stratify
# argument, the rows assigned to each side do not depend on whether X is scaled.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
train_scaler = StandardScaler().fit(X_train)
X_train = train_scaler.transform(X_train)
X_test = train_scaler.transform(X_test)

In [100]:
# Report the dimensions of each piece of the split, one line per array.
for split_name, split_array in [("X_train", X_train), ("y_train", y_train), ("X_test", X_test), ("y_test", y_test)]:
    print(f"{split_name} shape: {split_array.shape}")

X_train shape: (41017, 55)
y_train shape: (41017,)
X_test shape: (17579, 55)
y_test shape: (17579,)


In [101]:
# Fit a random forest on the training split and report accuracy on both splits.
# NOTE(review): the number of trees is taken from the column count of the raw
# DataFrame (label columns included) -- confirm this coupling is intentional.
n_trees = malware_data_new.shape[1]
rf = RandomForestClassifier(n_estimators=n_trees, random_state=42)
clf = rf.fit(X_train, y_train)

train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Training accuracy:', train_accuracy)
print('Test accuracy:', test_accuracy)

Training accuracy: 1.0
Test accuracy: 0.9997155697138631


In [102]:
# Predict the held-out rows and tabulate true labels (rows) against predictions (columns).
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[8785,    5],
       [   0, 8789]])

In [103]:
# Per-class precision / recall / F1 for the predictions made in the previous cell.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8790
           1       1.00      1.00      1.00      8789

    accuracy                           1.00     17579
   macro avg       1.00      1.00      1.00     17579
weighted avg       1.00      1.00      1.00     17579



### Without Scaling

In [104]:
# 70/30 stratified split of the raw (unscaled) feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [105]:
# Print the shape of every array produced by the split.
split_arrays = {"X_train": X_train, "y_train": y_train, "X_test": X_test, "y_test": y_test}
for split_name, split_array in split_arrays.items():
    print(f"{split_name} shape: {split_array.shape}")

X_train shape: (41017, 55)
y_train shape: (41017,)
X_test shape: (17579, 55)
y_test shape: (17579,)


In [106]:
# Same random-forest configuration, trained this time on the unscaled split.
# NOTE(review): n_estimators is the raw DataFrame's column count -- confirm intentional.
n_trees = malware_data_new.shape[1]
rf = RandomForestClassifier(random_state=42, n_estimators=n_trees)
clf = rf.fit(X_train, y_train)

train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Training accuracy:', train_accuracy)
print('Test accuracy:', test_accuracy)

Training accuracy: 1.0
Test accuracy: 0.9997155697138631


In [109]:
# Confusion matrix for the model trained on unscaled features.
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[8785,    5],
       [   0, 8789]])

In [110]:
# Text summary of precision, recall and F1 for each class on the test split.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8790
           1       1.00      1.00      1.00      8789

    accuracy                           1.00     17579
   macro avg       1.00      1.00      1.00     17579
weighted avg       1.00      1.00      1.00     17579



### Dropping some useless features

In [122]:
# Rebuild X without the two label columns and without three additional feature columns.
# y is not reassigned in this section; it keeps whatever value the kernel last gave it.
X_drop_columns = [
    'svcscan.interactive_process_services',
    'handles.nport',
    'pslist.nprocs64bit',
    'Class',
    'Category',
]
X = malware_data_new.drop(columns=X_drop_columns).values

In [123]:
# Show the rebuilt feature matrix (the array repr elides most rows and columns).
X

array([[45.        , 17.        , 10.55555556, ..., 87.        ,
         0.        ,  8.        ],
       [47.        , 19.        , 11.53191489, ..., 87.        ,
         0.        ,  8.        ],
       [40.        , 14.        , 14.725     , ..., 88.        ,
         0.        ,  8.        ],
       ...,
       [38.        , 15.        ,  9.84210526, ..., 88.        ,
         0.        ,  8.        ],
       [37.        , 15.        , 10.24324324, ..., 87.        ,
         0.        ,  8.        ],
       [38.        , 15.        ,  9.86842105, ..., 86.        ,
         0.        ,  8.        ]])

In [124]:
# Show the target vector; this section does not reassign it.
y

array([0, 0, 0, ..., 1, 1, 1])

In [125]:
# 70/30 stratified split of the reduced feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [126]:
# Report the dimensions of each piece of the split, one line per array.
for split_name, split_array in (("X_train", X_train), ("y_train", y_train), ("X_test", X_test), ("y_test", y_test)):
    print(f"{split_name} shape: {split_array.shape}")

X_train shape: (41017, 52)
y_train shape: (41017,)
X_test shape: (17579, 52)
y_test shape: (17579,)


In [127]:
# Train the random forest on the reduced feature set and report accuracy on both splits.
# NOTE(review): n_estimators still comes from the raw DataFrame's column count, so it
# does not shrink with the dropped features -- confirm intentional.
n_trees = malware_data_new.shape[1]
rf = RandomForestClassifier(n_estimators=n_trees, random_state=42)
clf = rf.fit(X_train, y_train)

train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Training accuracy:', train_accuracy)
print('Test accuracy:', test_accuracy)

Training accuracy: 0.9998293390545384
Test accuracy: 0.8720632572956368


In [128]:
# Confusion matrix for the model trained on the reduced feature set.
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[8790,    0,    0,    0],
       [   0, 2123,  386,  428],
       [   0,  307, 2357,  342],
       [   0,  488,  298, 2060]])

In [None]:
# Per-class precision / recall / F1 for the reduced-feature model.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

### One-hot encoding

In [145]:
# Keep every column except 'Class' in X, so 'Category' stays in as a feature column.
# NOTE(review): 'Category' is used as a target elsewhere in this notebook; if 'Class'
# can be derived from it, leaving it in X leaks the label -- confirm this is intended.
X_drop_columns = ['Class']
X = malware_data_new.drop(columns=X_drop_columns).values

# Target for this section: the 'Class' column, integer-encoded by LabelEncoder.
y_column = malware_data_new['Class']
class_le = LabelEncoder()
y = class_le.fit_transform(y_column)

In [146]:
# Preview: one-hot encode column 0 of X on its own and show the dense result.
malware_ohe = OneHotEncoder()
first_column = X[:, 0].reshape(-1, 1)  # the encoder expects a 2-D input
malware_ohe.fit_transform(first_column).toarray()

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]])

In [147]:
# One-hot encode column 0 and pass every other column through untouched.
c_transform = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(), [0])],
    remainder='passthrough',
)

In [148]:
# Build the transformer's input from the DataFrame instead of from X itself.
# The original `X = c_transform.fit_transform(X)` was self-referential: running the
# cell a second time would one-hot encode column 0 of the already-encoded matrix.
# X_drop_columns is the list assigned at the top of this section.
X_unencoded = malware_data_new.drop(columns=X_drop_columns).values
# Apply the column transformer, then cast the combined result to float.
X = c_transform.fit_transform(X_unencoded).astype(float)

In [149]:
# Show the matrix produced by the column transformer (the array repr elides most entries).
X

array([[ 1.,  0.,  0., ..., 87.,  0.,  8.],
       [ 1.,  0.,  0., ..., 87.,  0.,  8.],
       [ 1.,  0.,  0., ..., 88.,  0.,  8.],
       ...,
       [ 0.,  1.,  0., ..., 88.,  0.,  8.],
       [ 0.,  1.,  0., ..., 87.,  0.,  8.],
       [ 0.,  1.,  0., ..., 86.,  0.,  8.]])

In [150]:
# 70/30 stratified split of the one-hot-encoded feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [151]:
# Print the shape of every array produced by the split.
split_arrays = {"X_train": X_train, "y_train": y_train, "X_test": X_test, "y_test": y_test}
for split_name, split_array in split_arrays.items():
    print(f"{split_name} shape: {split_array.shape}")

X_train shape: (41017, 59)
y_train shape: (41017,)
X_test shape: (17579, 59)
y_test shape: (17579,)


In [152]:
# Train the random forest on the one-hot-encoded split and report accuracy on both splits.
# NOTE(review): n_estimators is the raw DataFrame's column count -- confirm intentional.
n_trees = malware_data_new.shape[1]
rf = RandomForestClassifier(random_state=42, n_estimators=n_trees)
clf = rf.fit(X_train, y_train)

train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Training accuracy:', train_accuracy)
print('Test accuracy:', test_accuracy)

Training accuracy: 1.0
Test accuracy: 0.9999431139427726


In [153]:
# Confusion matrix for the model trained on the one-hot-encoded features.
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[8789,    1],
       [   0, 8789]])

In [154]:
# Per-class precision / recall / F1 for the one-hot-encoded model.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8790
           1       1.00      1.00      1.00      8789

    accuracy                           1.00     17579
   macro avg       1.00      1.00      1.00     17579
weighted avg       1.00      1.00      1.00     17579

