# AMAML

## General Imports
In this section, besides each machine learning model, every import used will be shown.

In [229]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score

## Data Load
We will construct a *dataframe* called **df**. This *dataframe* will be used all along in the models.
It will be built from the file *data.csv*. The *data.csv* file is generated by *datagen.py*

In [230]:
df = pd.read_csv('data.csv')

## Data Preview

Let's check how the data is formatted after being loaded using some functions.

In [231]:
df.head()             # To check the beginning

Unnamed: 0,Name,Machine,AddressOfEntryPoint,SizeOfCode,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode.1,SizeOfInitializedData,SizeOfUninitializedData,...,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,DllCharacteristics,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity,Malflag
0,7637ebd20f2b197995d6876d3cd4b01479614f5c0859bf...,34404,5328,9216,39,2,27,9216,25088,3072,...,4096,0,16,0,0,36864,49152,24576,30464,0
1,a9447372cc62144ca19196f8f18ffcb35f3907c10b8e4b...,34404,5328,13312,39,2,27,13312,31232,3072,...,4096,0,16,0,0,45056,61440,32768,37432,0
2,8bc53c486cba7fca5ffe4dd43976cbaac6bfb24acc95d2...,332,78862,73728,258,8,0,73728,16384,0,...,4096,0,16,34112,0,78780,81920,0,94208,0
3,ff6d6d846bb0ef538a95836a52e6187c855cbf93e2fce3...,34404,5328,32768,39,2,27,32768,62976,3072,...,4096,0,16,0,0,61440,86016,49152,79160,0
4,c2563bb38ff3a2f9109febfc012afa329401a15aea4a1a...,34404,43856,41472,34,12,10,41472,17408,0,...,4096,0,16,49504,0,56260,69632,65536,0,0


In [232]:
df.tail()            # To check the end

Unnamed: 0,Name,Machine,AddressOfEntryPoint,SizeOfCode,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode.1,SizeOfInitializedData,SizeOfUninitializedData,...,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,DllCharacteristics,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity,Malflag
9927,b437e136a5a2e16fce380833475f84d912175149f7a0e2...,332,5248,145408,270,2,50,145408,46592,0,...,4096,0,16,0,0,191768,0,0,193024,1
9928,0b626ff78872af9976e86b331c860d9105ffd9f1509391...,332,55440,105984,258,10,0,105984,0,0,...,4096,0,16,0,190239,190383,196046,0,0,1
9929,69a35069b7951f06a9ca499db9a84f045433e1c7dc8a42...,332,77738,143360,8450,8,0,143360,61440,0,...,4096,0,16,320,179248,176652,204800,0,0,1
9930,b3c0aa1724caf5795d6248f20965cfdc6dca6a7ad1e07f...,332,110272,102912,270,2,50,102912,5632,0,...,4096,0,16,0,0,4224,114688,0,109056,1
9931,f3887457c3279fcec344da10411688b0b161b5fbc852a9...,332,77738,143360,8450,8,0,143360,61440,0,...,4096,0,16,320,178800,176204,204800,0,0,1


In [233]:
df.shape

(9932, 34)

## Data Preparation

In [234]:
df.drop(['Name', 'Machine'], axis=1, inplace=True)         # These two columns are obsolete in our study.
df = df.dropna(how='all')
display(df)

Unnamed: 0,AddressOfEntryPoint,SizeOfCode,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode.1,SizeOfInitializedData,SizeOfUninitializedData,BaseOfCode,ImageBase,...,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,DllCharacteristics,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity,Malflag
0,5328,9216,39,2,27,9216,25088,3072,4096,4194304,...,4096,0,16,0,0,36864,49152,24576,30464,0
1,5328,13312,39,2,27,13312,31232,3072,4096,4194304,...,4096,0,16,0,0,45056,61440,32768,37432,0
2,78862,73728,258,8,0,73728,16384,0,8192,4194304,...,4096,0,16,34112,0,78780,81920,0,94208,0
3,5328,32768,39,2,27,32768,62976,3072,4096,4194304,...,4096,0,16,0,0,61440,86016,49152,79160,0
4,43856,41472,34,12,10,41472,17408,0,4096,5368709120,...,4096,0,16,49504,0,56260,69632,65536,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9927,5248,145408,270,2,50,145408,46592,0,4096,4194304,...,4096,0,16,0,0,191768,0,0,193024,1
9928,55440,105984,258,10,0,105984,0,0,4096,4194304,...,4096,0,16,0,190239,190383,196046,0,0,1
9929,77738,143360,8450,8,0,143360,61440,0,4096,1677721600,...,4096,0,16,320,179248,176652,204800,0,0,1
9930,110272,102912,270,2,50,102912,5632,0,8192,4194304,...,4096,0,16,0,0,4224,114688,0,109056,1


In [235]:
# train, valid, test = np.split(df.sample(frac=1), [int(0.7*len(df)), int(0.9*len(df))])  # We will divide the data in three

train, temp = train_test_split(df, test_size=0.3, random_state=42)
valid, test = train_test_split(temp, test_size=0.33, random_state=42)

print("Size of train:", train.shape)
print("Size of valid:", valid.shape)
print("Size of test :", test.shape)

Size of train: (6952, 32)
Size of valid: (1996, 32)
Size of test : (984, 32)


In [236]:
def data_preparator(dataframe, os=False, scaler=False):
    # We extract all the data and put it appart from the label.
    X = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    # In case we want to scale it.
    if scaler:
        scaler = MinMaxScaler()
        X = scaler.fit_transform(X)

    # In case we want to over sample it.
    if os:
        smote = SMOTE(random_state=42)
        X, y = smote.fit_resample(X, y)
    
    data = np.hstack((X, np.reshape(y, (-1, 1))))
    
    return data, X, y

In [237]:
OSS = False

if OSS == False:
    train, xT, yT = data_preparator(train, os=False, scaler=False)
    valid, xV, yV = data_preparator(valid, os=False, scaler=False)
    test, xTest, yTest = data_preparator(test, os=False, scaler=False)
    print("OSS [OVERSAMPLER & SCALER] Desactivated")
else:
    train, xT, yT = data_preparator(train, os=True, scaler=True)
    valid, xV, yV = data_preparator(valid, os=False, scaler=True)
    test, xTest, yTest = data_preparator(test, os=False, scaler=True)
    print("OSS [OVERSAMPLER & SCALER] Activated")


OSS [OVERSAMPLER & SCALER] Desactivated


In [238]:
sum(yT==1)

6303

In [239]:
sum(yT==0)

649

In [240]:
display(yT)

array([1, 1, 1, ..., 1, 0, 1])

## Models

In this section we are going to generate, train, validate and test the desired ML models.

### kNN k-Nearest Neighbors

In [241]:
from sklearn.neighbors import KNeighborsClassifier

In [242]:
mBox = []

In [243]:
knnM = KNeighborsClassifier(n_neighbors=5)
knnM.fit(xT, yT)

In [244]:
yPredict = knnM.predict(xTest)

In [245]:
print(classification_report(yTest, yPredict))

              precision    recall  f1-score   support

           0       0.97      0.94      0.95        93
           1       0.99      1.00      0.99       891

    accuracy                           0.99       984
   macro avg       0.98      0.97      0.97       984
weighted avg       0.99      0.99      0.99       984



In [246]:
mBox.append((knnM, 'KNN'))

### SVM Support Vector Machines

In [247]:
from sklearn.svm import SVC

In [248]:
svmM = SVC()
svmM = svmM.fit(xT, yT)

In [249]:
yPredict = svmM.predict(xTest)

In [250]:
print(classification_report(yTest, yPredict))

              precision    recall  f1-score   support

           0       1.00      0.20      0.34        93
           1       0.92      1.00      0.96       891

    accuracy                           0.92       984
   macro avg       0.96      0.60      0.65       984
weighted avg       0.93      0.92      0.90       984



In [251]:
mBox.append((svmM, 'SVM'))

### RF Random Forest

In [252]:
from sklearn.ensemble import RandomForestClassifier

In [253]:
rfM = RandomForestClassifier()
rfM = rfM.fit(xT, yT)

In [254]:
yPredict = rfM.predict(xTest)

In [255]:
print(classification_report(yTest, yPredict))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98        93
           1       1.00      1.00      1.00       891

    accuracy                           1.00       984
   macro avg       1.00      0.98      0.99       984
weighted avg       1.00      1.00      1.00       984



In [256]:
mBox.append((rfM, 'RF'))

## DT Decision Tree

In [257]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [258]:
dtM = tree.DecisionTreeClassifier()
dtM = dtM.fit(xT, yT)

In [259]:
yPredict = dtM.predict(xTest)

In [260]:
print(classification_report(yTest, yPredict))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        93
           1       1.00      1.00      1.00       891

    accuracy                           1.00       984
   macro avg       0.99      0.99      0.99       984
weighted avg       1.00      1.00      1.00       984



In [261]:
mBox.append((dtM, 'DT'))

In [262]:
## NN Neural Network

In [263]:
import tensorflow as tf

In [264]:
def nnCreator(xT, yT, bSize, epochs, nNodes, doProb, lRate):
  nnM = tf.keras.Sequential([
      tf.keras.layers.Input(shape=(31,)),
      tf.keras.layers.Dense(nNodes, activation='relu'),
      tf.keras.layers.Dropout(doProb),
      tf.keras.layers.Dense(nNodes, activation='relu'),
      tf.keras.layers.Dropout(doProb),
      tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  nnM.compile(optimizer=tf.keras.optimizers.Adam(lRate), loss='binary_crossentropy',
              metrics=['accuracy'])
    
  history = nnM.fit(
    xT, yT, epochs=epochs, batch_size=bSize, validation_split=0.2, verbose=0
  )

  return history, nnM

In [265]:
history, nnM = nnCreator(xT, yT, 128, 100, 64, 0.1, 0.001)

In [266]:
yPredict = nnM.predict(xTest)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [267]:
yPredict = (yPredict > 0.5).astype(int).reshape(-1,)
print(classification_report(yTest, yPredict))

              precision    recall  f1-score   support

           0       1.00      0.03      0.06        93
           1       0.91      1.00      0.95       891

    accuracy                           0.91       984
   macro avg       0.95      0.52      0.51       984
weighted avg       0.92      0.91      0.87       984



In [268]:
#mBox.append((nnM, 'NN'))

In [269]:
## K-Fold Validation

In [270]:
accuracies = []
kf = KFold(n_splits=5, shuffle=True, random_state=33)

for m, name in mBox:
    validation_score = cross_val_score(m, xV, yV, cv=kf, scoring='accuracy')
    print(f'{name} model. Accuracy: {validation_score.mean()}. Standard Deviation: {validation_score.std()}.')
    accuracies.append(validation_score.mean())
    




KNN model. Accuracy: 0.9819636591478696. Standard Deviation: 0.007830405434974142.
SVM model. Accuracy: 0.918329573934837. Standard Deviation: 0.017289950834496238.
RF model. Accuracy: 0.992983709273183. Standard Deviation: 0.0036849785229231737.
DT model. Accuracy: 0.9884749373433583. Standard Deviation: 0.0030125365410152735.


## Generation of .pkl

In [271]:
import joblib

In [272]:
joblib.dump(knnM, 'model_knnM.pkl')
joblib.dump(svmM, 'model_svmM.pkl')
joblib.dump(rfM, 'model_rfM.pkl')
joblib.dump(dtM, 'model_dtM.pkl')

['model_dtM.pkl']