In [1]:
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
import pandas as pd
import numpy as np 


In [2]:
# Load the dataset CSV; the path is relative to the notebook's working directory.
dataset = pd.read_csv("Obfuscated-MalMem2022.csv")

In [3]:
# Preview the first rows to check the column names and the string-valued label columns.
dataset.head()

Unnamed: 0,Category,pslist.nproc,pslist.nppid,pslist.avg_threads,pslist.nprocs64bit,pslist.avg_handlers,dlllist.ndlls,dlllist.avg_dlls_per_proc,handles.nhandles,handles.avg_handles_per_proc,...,svcscan.kernel_drivers,svcscan.fs_drivers,svcscan.process_services,svcscan.shared_process_services,svcscan.interactive_process_services,svcscan.nactive,callbacks.ncallbacks,callbacks.nanonymous,callbacks.ngeneric,Class
0,Benign,45,17,10.555556,0,202.844444,1694,38.5,9129,212.302326,...,221,26,24,116,0,121,87,0,8,Benign
1,Benign,47,19,11.531915,0,242.234043,2074,44.12766,11385,242.234043,...,222,26,24,118,0,122,87,0,8,Benign
2,Benign,40,14,14.725,0,288.225,1932,48.3,11529,288.225,...,222,26,27,118,0,120,88,0,8,Benign
3,Benign,32,13,13.5,0,264.28125,1445,45.15625,8457,264.28125,...,222,26,27,118,0,120,88,0,8,Benign
4,Benign,42,16,11.452381,0,281.333333,2067,49.214286,11816,281.333333,...,222,26,24,118,0,124,87,0,8,Benign


In [4]:
# Unsupervised learning works a bit differently from supervised learning:
# it does not need a label, only features.
# The 'Category' column is therefore removed.

In [5]:
# Work on a copy without 'Category'; 'Class' stays as the last column.
df = dataset.drop(columns=['Category'])

In [6]:
# Encode the target as integers: Malware -> 1, Benign -> 0.
# The labels are read from the untouched `dataset` frame so the cell can be re-run safely.
# The cast is explicit because relying on `replace` to downcast an object column to int
# is deprecated in pandas >= 2.2, and a non-numeric label array can break the metric
# computations later on.
# An unexpected label maps to NaN and makes the integer cast raise instead of slipping through.
df['Class'] = dataset['Class'].map({'Malware': 1, 'Benign': 0}).astype('int64')

In [7]:
# Confirm that 'Category' is gone and that 'Class' now holds 0/1 integers.
df.head()

Unnamed: 0,pslist.nproc,pslist.nppid,pslist.avg_threads,pslist.nprocs64bit,pslist.avg_handlers,dlllist.ndlls,dlllist.avg_dlls_per_proc,handles.nhandles,handles.avg_handles_per_proc,handles.nport,...,svcscan.kernel_drivers,svcscan.fs_drivers,svcscan.process_services,svcscan.shared_process_services,svcscan.interactive_process_services,svcscan.nactive,callbacks.ncallbacks,callbacks.nanonymous,callbacks.ngeneric,Class
0,45,17,10.555556,0,202.844444,1694,38.5,9129,212.302326,0,...,221,26,24,116,0,121,87,0,8,0
1,47,19,11.531915,0,242.234043,2074,44.12766,11385,242.234043,0,...,222,26,24,118,0,122,87,0,8,0
2,40,14,14.725,0,288.225,1932,48.3,11529,288.225,0,...,222,26,27,118,0,120,88,0,8,0
3,32,13,13.5,0,264.28125,1445,45.15625,8457,264.28125,0,...,222,26,27,118,0,120,88,0,8,0
4,42,16,11.452381,0,281.333333,2067,49.214286,11816,281.333333,0,...,222,26,24,118,0,124,87,0,8,0


In [8]:
# Split the data into X and y,
# where X = features
# and y = label.
# NOTE: Do not use the y values for training, because unsupervised learning does not need labels.
# The labels are kept only to evaluate the models at the end.

In [9]:
# Feature matrix: every column except the last one.
X = df.iloc[:, :-1].to_numpy()
# Label vector: the last column ('Class'); used only for evaluation, never for fitting.
y = df.iloc[:, -1].to_numpy()

In [10]:
# Sanity check of the feature matrix (numpy abbreviates large arrays when printing).
print(X)

[[45.         17.         10.55555556 ... 87.          0.
   8.        ]
 [47.         19.         11.53191489 ... 87.          0.
   8.        ]
 [40.         14.         14.725      ... 88.          0.
   8.        ]
 ...
 [38.         15.          9.84210526 ... 88.          0.
   8.        ]
 [37.         15.         10.24324324 ... 87.          0.
   8.        ]
 [38.         15.          9.86842105 ... 86.          0.
   8.        ]]


In [11]:
# Sanity check of the encoded labels (0 = Benign, 1 = Malware).
print(y)

[0 0 0 ... 1 1 1]


In [None]:
#Splitting data into training and test set

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# The detectors are fit on Xtrain only; ytest is used for scoring.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
## Training and evaluating Isolation Forest Model

In [13]:
# contamination=0.5 sets the decision threshold so that about half of the samples are
# flagged as outliers (the test labels are roughly balanced between the two classes).
# random_state pins the randomly built trees so predictions and metrics are reproducible
# across kernel restarts; the seed matches the one used for the train/test split.
# Outputs recorded before the seed was added will differ slightly on re-run.
Isolation_Forest_model = IsolationForest(contamination=0.5, random_state=42)

In [14]:
# Fit on the training features only; no labels are passed to the model.
Isolation_Forest_model.fit(Xtrain)

In [15]:
# Score the held-out rows; predict() returns +1 for inliers and -1 for outliers.
Isolation_Forest_predictions = Isolation_Forest_model.predict(Xtest)

In [16]:
# Display the raw +1 / -1 predictions.
Isolation_Forest_predictions

array([ 1,  1, -1, ...,  1,  1,  1])

In [47]:
# True labels of the test rows that the Isolation Forest flagged as outliers (-1).
anomalies_1 = ytest[Isolation_Forest_predictions == -1]

In [48]:
# Number of test rows flagged as outliers.
print(anomalies_1.shape)

(5903,)


In [49]:
# True labels (0 = Benign, 1 = Malware) of the flagged rows.
print(anomalies_1)

[1 0 0 ... 0 0 0]


In [50]:
# Class breakdown of the flagged rows.
# Uses Series.value_counts because the top-level pd.value_counts is deprecated (pandas >= 2.1).
pd.Series(anomalies_1).value_counts()

0    3098
1    2805
dtype: int64

In [46]:
# `last_col_anomalies` is not defined anywhere in the visible notebook (it only existed as
# leftover kernel state), so this cell fails with NameError on a fresh kernel.
# Reconstructed from the recorded output as the last feature column of the test rows
# flagged as outliers — TODO confirm this matches the original intent.
last_col_anomalies = Xtest[Isolation_Forest_predictions == -1][:, -1]
print(last_col_anomalies)

[8. 8. 8. ... 8. 8. 8.]


In [17]:
# How many test rows were labelled outlier (-1) versus inlier (+1).
# Uses Series.value_counts because the top-level pd.value_counts is deprecated (pandas >= 2.1).
pd.Series(Isolation_Forest_predictions).value_counts()

-1    5903
 1    5817
dtype: int64

In [21]:
# Translate the detector's convention into the label encoding:
# outlier (-1) -> 1 (Malware), inlier (+1) -> 0 (Benign).
predictions_formatted = (Isolation_Forest_predictions == -1).astype(int)

In [22]:
# Predicted class distribution after conversion to 0/1.
# Uses Series.value_counts because the top-level pd.value_counts is deprecated (pandas >= 2.1).
pd.Series(predictions_formatted).value_counts()

1    5903
0    5817
dtype: int64

In [23]:
# True class distribution of the test set, for comparison with the predictions.
# Uses Series.value_counts because the top-level pd.value_counts is deprecated (pandas >= 2.1).
pd.Series(ytest).value_counts()

1    5930
0    5790
dtype: int64

In [24]:
# Accuracy: fraction of test rows whose predicted label equals the true label.
Isolation_forest_accuracy = (predictions_formatted == ytest).mean()
print(Isolation_forest_accuracy)

0.4690273037542662


In [25]:
# Precision computed per class and averaged with weights proportional to class support.
Isolation_forest_precision = precision_score(
    y_true=ytest,
    y_pred=predictions_formatted,
    average='weighted',
)

In [26]:
# Show the support-weighted precision of the Isolation Forest.
print(Isolation_forest_precision)

0.46905587170853735


In [27]:
# Recall computed per class and averaged with weights proportional to class support.
# NOTE(review): the value printed here is identical to the accuracy printed earlier,
# so this metric adds no new information — consider reporting the recall of the
# malware class alone (average='binary'); confirm intent.
Isolation_forest_recall = recall_score(
    y_true=ytest,
    y_pred=predictions_formatted,
    average='weighted',
)
print(Isolation_forest_recall)

0.4690273037542662


In [28]:
# Confusion matrix: rows are the true classes (0, 1), columns are the predicted classes.
Isolation_forest_confusion_matrix = confusion_matrix(
    y_true=ytest,
    y_pred=predictions_formatted,
)
Isolation_forest_confusion_matrix

array([[2692, 3098],
       [3125, 2805]], dtype=int64)

In [None]:
## Training and Evaluating Local Outlier Factor Model

In [48]:
# novelty=True is what allows predict() to be called on unseen data (Xtest) after fitting.
# contamination=0.5 mirrors the setting used for the Isolation Forest.
# NOTE(review): LOF is distance-based and the features shown in df.head() span very
# different magnitudes — consider standardizing them first; confirm.
LOF = LocalOutlierFactor(
    n_neighbors=20,
    contamination=0.5,
    novelty=True,
)

In [49]:
# Fit on the training features only; no labels are passed to the model.
LOF.fit(Xtrain)

In [50]:
# Score the held-out rows; predict() returns +1 for inliers and -1 for outliers.
preds = LOF.predict(Xtest)

In [51]:
# How many test rows LOF labelled outlier (-1) versus inlier (+1).
# Uses Series.value_counts because the top-level pd.value_counts is deprecated (pandas >= 2.1).
pd.Series(preds).value_counts()

-1    6031
 1    5689
dtype: int64

In [53]:
# Translate the detector's convention into the label encoding:
# outlier (-1) -> 1 (Malware), inlier (+1) -> 0 (Benign).
LOF_predictions_formatted = (preds == -1).astype(int)

In [54]:
# Predicted class distribution for LOF after conversion to 0/1.
# Uses Series.value_counts because the top-level pd.value_counts is deprecated (pandas >= 2.1).
pd.Series(LOF_predictions_formatted).value_counts()

1    6031
0    5689
dtype: int64

In [55]:
# LOF accuracy: fraction of test rows whose predicted label equals the true label.
acc = (LOF_predictions_formatted == ytest).mean()
print(acc)

0.5181740614334471
