# Import Libraries 

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np 


# Load Dataset

In [2]:
dataset = pd.read_csv("Malware_Dataset.csv")

In [3]:
dataset.head()


Unnamed: 0,Category,pslist.nproc,pslist.nppid,pslist.avg_threads,pslist.nprocs64bit,pslist.avg_handlers,dlllist.ndlls,dlllist.avg_dlls_per_proc,handles.nhandles,handles.avg_handles_per_proc,...,svcscan.kernel_drivers,svcscan.fs_drivers,svcscan.process_services,svcscan.shared_process_services,svcscan.interactive_process_services,svcscan.nactive,callbacks.ncallbacks,callbacks.nanonymous,callbacks.ngeneric,Class
0,Benign,45,17,10.555556,0,202.844444,1694,38.5,9129,212.302326,...,221,26,24,116,0,121,87,0,8,Benign
1,Benign,47,19,11.531915,0,242.234043,2074,44.12766,11385,242.234043,...,222,26,24,118,0,122,87,0,8,Benign
2,Benign,40,14,14.725,0,288.225,1932,48.3,11529,288.225,...,222,26,27,118,0,120,88,0,8,Benign
3,Benign,32,13,13.5,0,264.28125,1445,45.15625,8457,264.28125,...,222,26,27,118,0,120,88,0,8,Benign
4,Benign,42,16,11.452381,0,281.333333,2067,49.214286,11816,281.333333,...,222,26,24,118,0,124,87,0,8,Benign


# dataset Preparation

In [4]:
# Will Drop Category, not useful in the training of the models. 
dataset = dataset.drop(['Category'], axis=1)

In [5]:
dataset["Class"] = dataset['Class'].replace(['Malware', 'Benign'], [1, 0]) 

In [6]:
dataset.head()

Unnamed: 0,pslist.nproc,pslist.nppid,pslist.avg_threads,pslist.nprocs64bit,pslist.avg_handlers,dlllist.ndlls,dlllist.avg_dlls_per_proc,handles.nhandles,handles.avg_handles_per_proc,handles.nport,...,svcscan.kernel_drivers,svcscan.fs_drivers,svcscan.process_services,svcscan.shared_process_services,svcscan.interactive_process_services,svcscan.nactive,callbacks.ncallbacks,callbacks.nanonymous,callbacks.ngeneric,Class
0,45,17,10.555556,0,202.844444,1694,38.5,9129,212.302326,0,...,221,26,24,116,0,121,87,0,8,0
1,47,19,11.531915,0,242.234043,2074,44.12766,11385,242.234043,0,...,222,26,24,118,0,122,87,0,8,0
2,40,14,14.725,0,288.225,1932,48.3,11529,288.225,0,...,222,26,27,118,0,120,88,0,8,0
3,32,13,13.5,0,264.28125,1445,45.15625,8457,264.28125,0,...,222,26,27,118,0,120,88,0,8,0
4,42,16,11.452381,0,281.333333,2067,49.214286,11816,281.333333,0,...,222,26,24,118,0,124,87,0,8,0


# Split into training and test set

In [7]:
## Split data into X and Y
## X = Features that will be trained. 
## y = Labels of those features
## will further split into training and test set. 

In [8]:
X=dataset.iloc[:,:-1].values
y=dataset.iloc[:,-1].values

In [9]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.2,random_state=42)

In [27]:
accuracies = []
precisions = []
recalls = []
f1_scores = [] 


# Train and Evaluate Random Forest Classifier

In [10]:
Random_Forest_Classifier = RandomForestClassifier(n_estimators = 20, random_state=42)

In [11]:
Random_Forest_Classifier.fit(Xtrain, ytrain)

In [12]:
Random_Forest_Predictions = Random_Forest_Classifier.predict(Xtest)

In [13]:
Random_Forest_Classification_report = classification_report(Random_Forest_Predictions, ytest)
print(Random_Forest_Classification_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5790
           1       1.00      1.00      1.00      5930

    accuracy                           1.00     11720
   macro avg       1.00      1.00      1.00     11720
weighted avg       1.00      1.00      1.00     11720



In [14]:
Random_Forest_confusion_matrix = confusion_matrix(Random_Forest_Predictions, ytest)
print(Random_Forest_confusion_matrix)

[[5790    0]
 [   0 5930]]


In [32]:
Random_forest_accuracy = accuracy_score(Random_Forest_Predictions, ytest)
Random_forest_precision = precision_score(Random_Forest_Predictions, ytest)
Random_forest_recall = recall_score(Random_Forest_Predictions, ytest)
Random_forest_f1_score = f1_score(Random_Forest_Predictions, ytest)

print(Random_forest_accuracy)
print(Random_forest_precision)
print(Random_forest_recall)
print(Random_forest_f1_score)

1.0
1.0
1.0
1.0


In [30]:
accuracies.append(Random_forest_accuracy)

In [34]:
precisions.append(Random_forest_precision)

In [35]:
recalls.append(Random_forest_recall)

In [36]:
f1_scores.append(Random_forest_f1_score)

# Train and Evaluate Nearest Neighbor Classifier

In [15]:
KNN = KNeighborsClassifier()

In [16]:
KNN.fit(Xtrain, ytrain)

In [17]:
KNN_predictions = KNN.predict(Xtest)

In [18]:
KNN_Classification_report = classification_report(KNN_predictions, ytest)
print(KNN_Classification_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5798
           1       1.00      1.00      1.00      5922

    accuracy                           1.00     11720
   macro avg       1.00      1.00      1.00     11720
weighted avg       1.00      1.00      1.00     11720



In [19]:
KNN_confusion_matrix = confusion_matrix(KNN_predictions, ytest)
print(KNN_confusion_matrix)

[[5786   12]
 [   4 5918]]


In [37]:
KNN_accuracy = accuracy_score(KNN_predictions, ytest)
KNN_precision = precision_score(KNN_predictions, ytest)
KNN_recall = recall_score(KNN_predictions, ytest)
KNN_f1_score = f1_score(KNN_predictions, ytest)

print(KNN_accuracy)
print(KNN_precision)
print(KNN_recall)
print(KNN_f1_score)

0.9986348122866894
0.9979763912310287
0.9993245525160419
0.998650016874789


In [38]:
accuracies.append(KNN_accuracy)

In [39]:
precisions.append(KNN_precision)

In [40]:
recalls.append(KNN_recall)

In [41]:
f1_scores.append(KNN_f1_score)

# Train and Evaluate SVM Classifier

In [21]:
svc = SVC(C=1.0, kernel='rbf')

In [22]:
svc.fit(Xtrain, ytrain)

In [23]:
SVC_Predictions = svc.predict(Xtest)

In [24]:
SVC_classification_report = classification_report(SVC_Predictions, ytest)
print(SVC_classification_report)

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      5696
           1       0.99      0.98      0.99      6024

    accuracy                           0.99     11720
   macro avg       0.99      0.99      0.99     11720
weighted avg       0.99      0.99      0.99     11720



In [25]:
SVC_confusion_matrix = confusion_matrix(SVC_Predictions, ytest)
print(SVC_confusion_matrix)

[[5664   32]
 [ 126 5898]]


In [43]:
SVC_accuracy = accuracy_score(SVC_Predictions, ytest)
SVC_precision = precision_score(SVC_Predictions, ytest)
SVC_recall = recall_score(SVC_Predictions, ytest)
SVC_f1_score = f1_score(SVC_Predictions, ytest)

print(SVC_accuracy)
print(SVC_precision)
print(SVC_recall)
print(SVC_f1_score)

0.986518771331058
0.9946037099494098
0.9790836653386454
0.986782666889744


In [44]:
accuracies.append(SVC_accuracy)

In [45]:
precisions.append(SVC_precision)

In [46]:
recalls.append(SVC_recall)

In [47]:
f1_scores.append(SVC_f1_score)

In [49]:
print(accuracies)
print(precisions)
print(recalls)
print(f1_scores)

[1.0, 0.9986348122866894, 0.986518771331058]
[1.0, 0.9979763912310287, 0.9946037099494098]
[1.0, 0.9993245525160419, 0.9790836653386454]
[1.0, 0.998650016874789, 0.986782666889744]


In [83]:
Evaluation_Table = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1-Score'])


Evaluation_Table = pd.concat([Evaluation_Table, pd.DataFrame({
    "Accuracy": accuracies,
    "Precision": precisions,
    "Recall": recalls,
    "F1-Score": f1_scores
})], ignore_index=True)

Evaluation_Table.index = ['Random Forest Classifier', 'K-Nearest-Neighbour', 'Support Vector Classifier']
Evaluation_Table.index.names = ['Model']

# Final Evaluation Table

In [84]:
Evaluation_Table

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Random Forest Classifier,1.0,1.0,1.0,1.0
K-Nearest-Neighbour,0.998635,0.997976,0.999325,0.99865
Support Vector Classifier,0.986519,0.994604,0.979084,0.986783
