In [1]:
# Data libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Classifiers 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Analysis libs
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split

In [2]:
# Relative paths to the two CSV exports used in this notebook:
# the "edited" file and the "reduced" file.
malware_data_filepath, malware_data_reduced_filepath = (
    './datasets/Obfuscated/Obfuscated-MalMem2022_edited.csv',
    './datasets/Obfuscated/Obfuscated-MalMem2022_reduced.csv',
)

# Read each CSV into its own DataFrame with default parsing options.
malware_data = pd.read_csv(filepath_or_buffer=malware_data_filepath)
malware_data_reduced = pd.read_csv(filepath_or_buffer=malware_data_reduced_filepath)

In [3]:
# Display the frame read from the `_edited` CSV; the rendered table is
# truncated (elided rows/columns are shown as "...").
# NOTE(review): the `Unnamed: 0` column looks like a row index that was
# written into the CSV — confirm whether it should be dropped before modelling.
malware_data

Unnamed: 0,Category,pslist.nproc,pslist.nppid,pslist.avg_threads,pslist.nprocs64bit,pslist.avg_handlers,dlllist.ndlls,dlllist.avg_dlls_per_proc,handles.nhandles,handles.avg_handles_per_proc,...,svcscan.kernel_drivers,svcscan.fs_drivers,svcscan.process_services,svcscan.shared_process_services,svcscan.interactive_process_services,svcscan.nactive,callbacks.ncallbacks,callbacks.nanonymous,callbacks.ngeneric,Class
0,Benign,45,17,10.555556,0,202.844444,1694,38.500000,9129,212.302326,...,221,26,24,116,0,121,87,0,8,Benign
1,Benign,47,19,11.531915,0,242.234043,2074,44.127660,11385,242.234043,...,222,26,24,118,0,122,87,0,8,Benign
2,Benign,40,14,14.725000,0,288.225000,1932,48.300000,11529,288.225000,...,222,26,27,118,0,120,88,0,8,Benign
3,Benign,32,13,13.500000,0,264.281250,1445,45.156250,8457,264.281250,...,222,26,27,118,0,120,88,0,8,Benign
4,Benign,42,16,11.452381,0,281.333333,2067,49.214286,11816,281.333333,...,222,26,24,118,0,124,87,0,8,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58591,Ransomware,37,15,10.108108,0,215.486487,1453,39.270270,7973,215.486487,...,221,26,24,116,0,120,86,0,8,Malware
58592,Ransomware,37,14,9.945946,0,190.216216,1347,36.405405,7038,190.216216,...,221,26,24,116,0,116,88,0,8,Malware
58593,Ransomware,38,15,9.842105,0,210.026316,1448,38.105263,7982,215.729730,...,221,26,24,116,0,120,88,0,8,Malware
58594,Ransomware,37,15,10.243243,0,215.513513,1452,39.243243,7974,215.513513,...,221,26,24,116,0,120,87,0,8,Malware


In [4]:
# Display the frame read from the `_reduced` CSV; the rendered table is
# truncated (elided rows/columns are shown as "...").
# NOTE(review): `Unnamed: 0.1` and `Unnamed: 0` look like row indices that
# were written into the CSV on successive saves — confirm whether they should
# be dropped before modelling.
malware_data_reduced

Unnamed: 0.1,Unnamed: 0,Category,pslist.nproc,pslist.nppid,pslist.avg_threads,pslist.nprocs64bit,pslist.avg_handlers,dlllist.ndlls,dlllist.avg_dlls_per_proc,handles.nhandles,...,psxview.not_in_deskthrd_false_avg,svcscan.nservices,svcscan.kernel_drivers,svcscan.fs_drivers,svcscan.process_services,svcscan.shared_process_services,svcscan.interactive_process_services,svcscan.nactive,callbacks.ncallbacks,Class
0,0,Benign,45,17,10.555556,0,202.844444,1694,38.500000,9129,...,0.191489,389,221,26,24,116,0,121,87,Benign
1,1,Benign,47,19,11.531915,0,242.234043,2074,44.127660,11385,...,0.127660,392,222,26,24,118,0,122,87,Benign
2,2,Benign,40,14,14.725000,0,288.225000,1932,48.300000,11529,...,0.125000,395,222,26,27,118,0,120,88,Benign
3,3,Benign,32,13,13.500000,0,264.281250,1445,45.156250,8457,...,0.187500,395,222,26,27,118,0,120,88,Benign
4,4,Benign,42,16,11.452381,0,281.333333,2067,49.214286,11816,...,0.217391,392,222,26,24,118,0,124,87,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58591,58591,Ransomware,37,15,10.108108,0,215.486487,1453,39.270270,7973,...,0.184211,389,221,26,24,116,0,120,86,Malware
58592,58592,Ransomware,37,14,9.945946,0,190.216216,1347,36.405405,7038,...,0.162162,389,221,26,24,116,0,116,88,Malware
58593,58593,Ransomware,38,15,9.842105,0,210.026316,1448,38.105263,7982,...,0.225000,389,221,26,24,116,0,120,88,Malware
58594,58594,Ransomware,37,15,10.243243,0,215.513513,1452,39.243243,7974,...,0.162162,389,221,26,24,116,0,120,87,Malware


### Models
In this section, we will test each classification model (from scikit-learn) on both the binary and the multi-class classification tasks. We will use the following models for this experiment:

* Logistic Regression
* Random Forest
* AdaBoost
* Gaussian Process
* RBF (the kernel used with the Gaussian Process classifier, not a standalone model)
* Gaussian Naive Bayes
* K-nearest Neighbors (KNN)
* MLP
* SVM (SVC)
* Decision Tree
* Extra Trees
* Gradient Boost

It is important to mention that there is a plethora of classification models out in the wild; however, we limit ourselves to these because they are both well known and widely used in the field of Machine Learning. Additionally, these are lightweight models that might merit deployment if they perform exceptionally well.

Note: since these are lightweight models, we may swap some of them for alternatives that scale better if they do not perform well with the number of samples in the dataset. Here are the potential replacements:

* SVC -> LinearSVC or SGDClassifier
* Gradient Boost -> HistGradientBoostingClassifier

