In [46]:
import pprint

# Data libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Classifiers 
from sklearn.ensemble import RandomForestClassifier

# Analysis libs
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import StratifiedKFold

In [3]:
malware_data_filepath = '../datasets/Obfuscated/Obfuscated-MalMem2022_edited.csv'
malware_data = pd.read_csv(malware_data_filepath)

In [4]:
X_drop_columns = ['Class', 'Category']
X = malware_data.drop(columns=X_drop_columns)

y_column = malware_data.Class
class_le = LabelEncoder()
y = class_le.fit_transform(y_column)

In [5]:
def make_mi_scores(X, y):
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_classif(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores
mi_scores = make_mi_scores(X,y) 

In [7]:
mi_scores

svcscan.nservices                         0.681695
svcscan.shared_process_services           0.673322
dlllist.avg_dlls_per_proc                 0.672397
svcscan.kernel_drivers                    0.669221
handles.avg_handles_per_proc              0.658680
handles.nhandles                          0.656784
pslist.avg_handlers                       0.654955
handles.nevent                            0.653687
handles.nmutant                           0.652358
handles.nsection                          0.645991
dlllist.ndlls                             0.638625
handles.nkey                              0.637390
handles.nsemaphore                        0.625400
handles.ntimer                            0.606279
pslist.avg_threads                        0.605076
handles.nfile                             0.596608
handles.nthread                           0.591058
ldrmodules.not_in_load                    0.585584
ldrmodules.not_in_mem                     0.585277
ldrmodules.not_in_mem_avg      

In [42]:

y_temp = y
scale = StandardScaler()
remove_feature = []
for feature in reversed(dict(mi_scores)):
    remove_feature.append(feature)
    x_temp = X.drop(columns=remove_feature)
    if x_temp.shape[1] < 1:
        break
    x_temp = scale.fit_transform(x_temp)
    X_train, X_test, y_train, y_test = train_test_split(x_temp, y_temp, test_size=0.3, random_state=0, stratify=y)
    rf = RandomForestClassifier(n_estimators=x_temp.shape[1], random_state=42)
    clf = rf.fit(X_train,y_train)
    print("=======================================================")
    print(f"Number of features {x_temp.shape[1]}")
    print('Training accuracy:', clf.score(X_train, y_train))
    print('Test accuracy:', clf.score(X_test, y_test))

number of features 54
Training accuracy: 1.0
Test accuracy: 0.9997724557710905
number of features 53
Training accuracy: 1.0
Test accuracy: 0.9997724557710905
number of features 52
Training accuracy: 1.0
Test accuracy: 0.9997155697138631
number of features 51
Training accuracy: 1.0
Test accuracy: 0.9997724557710905
number of features 50
Training accuracy: 1.0
Test accuracy: 0.9997155697138631
number of features 49
Training accuracy: 1.0
Test accuracy: 0.9997155697138631
number of features 48
Training accuracy: 1.0
Test accuracy: 0.9997155697138631
number of features 47
Training accuracy: 1.0
Test accuracy: 0.9997155697138631
number of features 46
Training accuracy: 1.0
Test accuracy: 0.9997155697138631
number of features 45
Training accuracy: 1.0
Test accuracy: 0.9997724557710905
number of features 44
Training accuracy: 1.0
Test accuracy: 0.9996017975994084
number of features 43
Training accuracy: 1.0
Test accuracy: 0.9997155697138631
number of features 42
Training accuracy: 1.0
Test ac

In [45]:
y_temp = malware_data.Category
y_temp = class_le.fit_transform(y_temp)
# scale = StandardScaler()
remove_feature = []
for feature in reversed(dict(mi_scores)):
    remove_feature.append(feature)
    x_temp = X.drop(columns=remove_feature)
    if x_temp.shape[1] < 1:
        break
    # x_temp = scale.fit_transform(x_temp)
    X_train, X_test, y_train, y_test = train_test_split(x_temp, y_temp, test_size=0.3, random_state=0, stratify=y)
    rf = RandomForestClassifier(n_estimators=x_temp.shape[1], random_state=42)
    clf = rf.fit(X_train,y_train)
    print("=======================================================")
    print(f"Number of features {x_temp.shape[1]}")
    print('Training accuracy:', clf.score(X_train, y_train))
    print('Test accuracy:', clf.score(X_test, y_test))

Number of features 54
Training accuracy: 0.9998049589194724
Test accuracy: 0.8683087775186302
Number of features 53
Training accuracy: 0.9997805787844065
Test accuracy: 0.8685363217475397
Number of features 52
Training accuracy: 0.9997805787844065
Test accuracy: 0.8691620683770408
Number of features 51
Training accuracy: 0.9998537191896043
Test accuracy: 0.8698447010637693
Number of features 50
Training accuracy: 0.9998780993246703
Test accuracy: 0.8677399169463564
Number of features 49
Training accuracy: 0.9997805787844065
Test accuracy: 0.8669435121451732
Number of features 48
Training accuracy: 0.9997805787844065
Test accuracy: 0.8675692587746743
Number of features 47
Training accuracy: 0.9998537191896043
Test accuracy: 0.8666590818590364
Number of features 46
Training accuracy: 0.9998049589194724
Test accuracy: 0.8688776380909039
Number of features 45
Training accuracy: 0.9998049589194724
Test accuracy: 0.8672848284885375
Number of features 44
Training accuracy: 0.9997318185142746


In [50]:
y_temp = malware_data.Class
y_temp = class_le.fit_transform(y_temp)
kfold = StratifiedKFold(n_splits=10)
X_sc = scale.fit_transform(X)
scores = []
for k, (train, test) in enumerate(kfold.split(X_sc, y_temp)): # iterator
    rf_model = RandomForestClassifier(n_estimators=X_sc.shape[1], random_state=42)
    #print(train.shape, test.shape)
    rf_model.fit(X_sc[train], y_temp[train])
    score = rf_model.score(X_sc[test], y_temp[test])
    scores.append(score)
    print(f'Fold:{k+1:2d}, Class dist.:{np.bincount(y_temp[train])}, Acc: {score:.3f}')

Fold: 1, Class dist.:[26368 26368], Acc: 0.999
Fold: 2, Class dist.:[26368 26368], Acc: 1.000
Fold: 3, Class dist.:[26368 26368], Acc: 1.000
Fold: 4, Class dist.:[26368 26368], Acc: 1.000
Fold: 5, Class dist.:[26368 26368], Acc: 0.990
Fold: 6, Class dist.:[26368 26368], Acc: 1.000
Fold: 7, Class dist.:[26368 26369], Acc: 0.999
Fold: 8, Class dist.:[26368 26369], Acc: 1.000
Fold: 9, Class dist.:[26369 26368], Acc: 1.000
Fold:10, Class dist.:[26369 26368], Acc: 1.000


In [51]:
y_temp = malware_data.Category
y_temp = class_le.fit_transform(y_temp)
kfold = StratifiedKFold(n_splits=10)
X_sc = scale.fit_transform(X)
scores = []
for k, (train, test) in enumerate(kfold.split(X_sc, y_temp)): # iterator
    rf_model = RandomForestClassifier(n_estimators=X_sc.shape[1], random_state=42)
    #print(train.shape, test.shape)
    rf_model.fit(X_sc[train], y_temp[train])
    score = rf_model.score(X_sc[test], y_temp[test])
    scores.append(score)
    print(f'Fold:{k+1:2d}, Class dist.:{np.bincount(y_temp[train])}, Acc: {score:.3f}')

Fold: 1, Class dist.:[26368  8812  9018  8538], Acc: 0.759
Fold: 2, Class dist.:[26368  8812  9018  8538], Acc: 0.792
Fold: 3, Class dist.:[26368  8812  9018  8538], Acc: 0.844
Fold: 4, Class dist.:[26368  8812  9018  8538], Acc: 0.865
Fold: 5, Class dist.:[26368  8812  9018  8538], Acc: 0.874
Fold: 6, Class dist.:[26368  8812  9018  8538], Acc: 0.857
Fold: 7, Class dist.:[26368  8812  9018  8539], Acc: 0.851
Fold: 8, Class dist.:[26368  8812  9018  8539], Acc: 0.853
Fold: 9, Class dist.:[26369  8811  9018  8539], Acc: 0.806
Fold:10, Class dist.:[26369  8812  9018  8538], Acc: 0.812


In [56]:
y_temp = malware_data.Class
y_temp = class_le.fit_transform(y_temp)
kfold = StratifiedKFold(n_splits=10)
scores = []
for k, (train, test) in enumerate(kfold.split(X, y_temp)): # iterator
    rf_model = RandomForestClassifier(n_estimators=X.shape[1], random_state=42)
    #print(train.shape, test.shape)
    rf_model.fit(X.values[train], y_temp[train])
    score = rf_model.score(X.values[test], y_temp[test])
    scores.append(score)
    print(f'Fold:{k+1:2d}, Class dist.:{np.bincount(y_temp[train])}, Acc: {score:.3f}')

Fold: 1, Class dist.:[26368 26368], Acc: 1.000
Fold: 2, Class dist.:[26368 26368], Acc: 1.000
Fold: 3, Class dist.:[26368 26368], Acc: 1.000
Fold: 4, Class dist.:[26368 26368], Acc: 1.000
Fold: 5, Class dist.:[26368 26368], Acc: 0.990
Fold: 6, Class dist.:[26368 26368], Acc: 1.000
Fold: 7, Class dist.:[26368 26369], Acc: 0.999
Fold: 8, Class dist.:[26368 26369], Acc: 1.000
Fold: 9, Class dist.:[26369 26368], Acc: 1.000
Fold:10, Class dist.:[26369 26368], Acc: 1.000


In [57]:
y_temp = malware_data.Category
y_temp = class_le.fit_transform(y_temp)
kfold = StratifiedKFold(n_splits=10)
scores = []
for k, (train, test) in enumerate(kfold.split(X, y_temp)): # iterator
    rf_model = RandomForestClassifier(n_estimators=X.shape[1], random_state=42)
    #print(train.shape, test.shape)
    rf_model.fit(X.values[train], y_temp[train])
    score = rf_model.score(X.values[test], y_temp[test])
    scores.append(score)
    print(f'Fold:{k+1:2d}, Class dist.:{np.bincount(y_temp[train])}, Acc: {score:.3f}')

Fold: 1, Class dist.:[26368  8812  9018  8538], Acc: 0.758
Fold: 2, Class dist.:[26368  8812  9018  8538], Acc: 0.792
Fold: 3, Class dist.:[26368  8812  9018  8538], Acc: 0.844
Fold: 4, Class dist.:[26368  8812  9018  8538], Acc: 0.865
Fold: 5, Class dist.:[26368  8812  9018  8538], Acc: 0.875
Fold: 6, Class dist.:[26368  8812  9018  8538], Acc: 0.857
Fold: 7, Class dist.:[26368  8812  9018  8539], Acc: 0.851
Fold: 8, Class dist.:[26368  8812  9018  8539], Acc: 0.851
Fold: 9, Class dist.:[26369  8811  9018  8539], Acc: 0.806
Fold:10, Class dist.:[26369  8812  9018  8538], Acc: 0.811


In [60]:
rf = RandomForestClassifier(n_estimators=X.shape[1], random_state=42)
cvs = cross_val_score(estimator=rf, X=X, y=y, n_jobs=-1)
print(cvs)
print(f"Mean: {cvs.mean()} \n std: {cvs.std()}")

[0.99982935 0.99991467 0.99488011 0.99957334 1.        ]
Mean: 0.9988394943365211 
 std: 0.0019848351882600142
