# LOAD DATASET 

In [49]:
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report, f1_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np 


In [50]:
dataset = pd.read_csv("Obfuscated-MalMem2022.csv")

In [51]:
dataset.head()

Unnamed: 0,Category,pslist.nproc,pslist.nppid,pslist.avg_threads,pslist.nprocs64bit,pslist.avg_handlers,dlllist.ndlls,dlllist.avg_dlls_per_proc,handles.nhandles,handles.avg_handles_per_proc,...,svcscan.kernel_drivers,svcscan.fs_drivers,svcscan.process_services,svcscan.shared_process_services,svcscan.interactive_process_services,svcscan.nactive,callbacks.ncallbacks,callbacks.nanonymous,callbacks.ngeneric,Class
0,Benign,45,17,10.555556,0,202.844444,1694,38.5,9129,212.302326,...,221,26,24,116,0,121,87,0,8,Benign
1,Benign,47,19,11.531915,0,242.234043,2074,44.12766,11385,242.234043,...,222,26,24,118,0,122,87,0,8,Benign
2,Benign,40,14,14.725,0,288.225,1932,48.3,11529,288.225,...,222,26,27,118,0,120,88,0,8,Benign
3,Benign,32,13,13.5,0,264.28125,1445,45.15625,8457,264.28125,...,222,26,27,118,0,120,88,0,8,Benign
4,Benign,42,16,11.452381,0,281.333333,2067,49.214286,11816,281.333333,...,222,26,24,118,0,124,87,0,8,Benign


In [52]:
## Drop Category, not needed. 

In [53]:
df = dataset.drop(['Category'], axis=1)

In [54]:
## make class 0 and 1 instead of malware and benign 
## malware = 1
## Bennign = 0 


In [55]:
df['Class'] = df['Class'].replace(['Malware', 'Benign'], [1, 0]) 

In [56]:
data = pd.DataFrame()

In [57]:
data['psxview.not_in_ethread_pool_false_avg'] = df['psxview.not_in_ethread_pool_false_avg']
data['psxview.not_in_deskthrd'] = df['psxview.not_in_deskthrd']
data['psxview.not_in_session_false_avg'] = df['psxview.not_in_session_false_avg']
data['psxview.not_in_csrss_handles_false_avg'] = df['psxview.not_in_csrss_handles_false_avg']
data['malfind.protection'] = df['malfind.protection']
data['malfind.ninjections'] = df['malfind.ninjections']
data['malfind.uniqueInjections'] = df['malfind.uniqueInjections']
data['malfind.commitCharge'] = df['malfind.commitCharge']
data['psxview.not_in_deskthrd_false_avg'] = df['psxview.not_in_deskthrd_false_avg']
data['pslist.nppid'] = df['pslist.nppid']
data['Class'] = df['Class']

In [58]:
data.head()

Unnamed: 0,psxview.not_in_ethread_pool_false_avg,psxview.not_in_deskthrd,psxview.not_in_session_false_avg,psxview.not_in_csrss_handles_false_avg,malfind.protection,malfind.ninjections,malfind.uniqueInjections,malfind.commitCharge,psxview.not_in_deskthrd_false_avg,pslist.nppid,Class
0,0.06383,9,0.085106,0.148936,30,5,1.25,21,0.191489,17,0
1,0.0,6,0.042553,0.085106,72,12,1.714286,77,0.12766,19,0
2,0.0,5,0.05,0.1,30,5,1.25,6,0.125,14,0
3,0.0,6,0.0625,0.125,12,2,1.0,2,0.1875,13,0
4,0.086957,10,0.130435,0.173913,72,12,2.0,77,0.217391,16,0


In [59]:
##Split dataset into X and Y values
## Because these models are unsupervised, labels are not needed for training 
# only used for evaluation at the end. 

In [60]:
X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values

In [61]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.2, random_state=42)

# Training And Evaluating Isolation Forest

In [62]:
Isolation_Forest_model=IsolationForest(max_samples= 100, n_estimators= 200, contamination=0.5, bootstrap = True)

In [63]:
Isolation_Forest_model.fit(Xtrain)

In [64]:
predictions = Isolation_Forest_model.predict(Xtest)

In [65]:
print(predictions)

[ 1  1 -1 ...  1  1  1]


In [66]:
## printing the predictions
## model predicted 5904 anomalies and 5816 normal instances 

In [67]:
print(pd.value_counts(predictions))

-1    5886
 1    5834
dtype: int64


In [68]:
## map the predictions to 0 and 1 from -1 and 1
## map anomalies to 1 and normal instances to 0

In [69]:
predictions[predictions== 1] = 0
predictions[predictions== -1] = 1

In [70]:
print(pd.value_counts(predictions))

1    5886
0    5834
dtype: int64


In [71]:
## start evaluation. 
## Classification Report
## confusion Matrix 
## area under curve 

In [72]:
Isolation_forest_classification_report = classification_report(predictions, ytest)

In [73]:
print(Isolation_forest_classification_report)

              precision    recall  f1-score   support

           0       0.56      0.56      0.56      5834
           1       0.57      0.57      0.57      5886

    accuracy                           0.56     11720
   macro avg       0.56      0.56      0.56     11720
weighted avg       0.56      0.56      0.56     11720



In [74]:
Isolation_forest_confusion_matrix = confusion_matrix(predictions, ytest)
print(Isolation_forest_confusion_matrix)

[[3259 2575]
 [2531 3355]]


# Training And Evaluating Local Outlier Factor

In [75]:
Local_outlier_factor = LocalOutlierFactor(n_neighbors = 100, algorithm='auto', contamination=0.5, novelty=True)

In [76]:
Local_outlier_factor.fit(Xtrain)

In [77]:
LOF_predictions = Local_outlier_factor.predict(Xtest)

In [78]:
LOF_predictions[LOF_predictions== 1] = 0
LOF_predictions[LOF_predictions== -1] = 1

print(pd.value_counts(LOF_predictions))

0    7375
1    4345
dtype: int64


In [79]:
LOF_classification_report = classification_report(LOF_predictions, ytest)
print(LOF_classification_report)

              precision    recall  f1-score   support

           0       0.59      0.46      0.52      7375
           1       0.33      0.45      0.38      4345

    accuracy                           0.46     11720
   macro avg       0.46      0.46      0.45     11720
weighted avg       0.49      0.46      0.47     11720



In [80]:
LOF_confusion_matrix = confusion_matrix(LOF_predictions, ytest)
print(LOF_confusion_matrix)

[[3401 3974]
 [2389 1956]]


# Training And Evaluating ONE-CLASS SUPPORT VECTOR MACHINE (SVM)

In [81]:
one_class_svm = OneClassSVM()

In [82]:
one_class_svm.fit(Xtrain)

In [83]:
one_class_svm_predictions=one_class_svm.predict(Xtest)

In [84]:
one_class_svm_predictions[one_class_svm_predictions== 1] = 0
one_class_svm_predictions[one_class_svm_predictions== -1] = 1

print(pd.value_counts(one_class_svm_predictions))

1    5946
0    5774
dtype: int64


In [85]:
SVM_classification_report = classification_report(one_class_svm_predictions, ytest)
print(SVM_classification_report)

              precision    recall  f1-score   support

           0       0.86      0.87      0.87      5774
           1       0.87      0.87      0.87      5946

    accuracy                           0.87     11720
   macro avg       0.87      0.87      0.87     11720
weighted avg       0.87      0.87      0.87     11720



In [86]:
SVM_confusion_matrix = confusion_matrix(one_class_svm_predictions, ytest)
print(SVM_confusion_matrix)

[[5003  771]
 [ 787 5159]]


# Training And Evaluating ONE-CLASS SVM WITH STOCHASTIC GRADIENT DESCENT

In [87]:
# Initialize model parameters
w = np.zeros(Xtrain.shape[1])
b = 0

# Set SGD parameters
learning_rate = 0.01
epochs = 100 

In [88]:
for epoch in range(epochs):
    for i in range(len(Xtrain)):
        x = Xtrain[i]

        # Calculate the loss and gradient
        loss = np.maximum(0, 1 - np.dot(w, x) + b)
        gradient = -x if loss > 0 else 0

        # Update the model parameters
        w -= learning_rate * gradient
        b -= learning_rate

In [89]:
for epoch in range(epochs):
    for i in range(len(X_train)):
        x = X_train[i]

        # Calculate the loss and gradient
        loss = np.maximum(0, 1 - np.dot(w, x) + b)
        gradient = -x if loss > 0 else 0

        # Update the model parameters
        w -= learning_rate * gradient
        b -= learning_rate

NameError: name 'X_train' is not defined

In [None]:
def predict(x):
    return np.sign(np.dot(w, x) + b)


In [None]:
y_pred = np.array([predict(x) for x in Xtest])

In [None]:
precision = precision_score(ytest, y_pred)
recall = recall_score(ytest, y_pred)
f1 = f1_score(ytest, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

In [None]:
SVM_SGD_classification_report = classification_report(y_pred, ytest)
print(SVM_SGD_classification_report)