# AMAML

## General Imports
In this section, besides each machine learning model, every import used will be shown.

In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

from itertools import product

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_curve, precision_score, auc
from sklearn.model_selection import KFold, cross_val_score

from sklearn.exceptions import ConvergenceWarning
import warnings

## Data Load
We will construct a *dataframe* called **df**. This *dataframe* will be used all along in the models.
It will be built from the file *data.csv*. The *data.csv* file is generated by *datagen.py*

In [2]:
df = pd.read_csv('data.csv')

## Data Preview

Let's check how the data is formatted after being loaded using some functions.

In [3]:
df.head()             # To check the beginning

Unnamed: 0,Name,Machine,AddressOfEntryPoint,SizeOfCode,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode.1,SizeOfInitializedData,SizeOfUninitializedData,...,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,DllCharacteristics,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity,Malflag
0,7637ebd20f2b197995d6876d3cd4b01479614f5c0859bf...,34404,5328,9216,39,2,27,9216,25088,3072,...,4096,0,16,0,0,36864,49152,24576,30464,0
1,a9447372cc62144ca19196f8f18ffcb35f3907c10b8e4b...,34404,5328,13312,39,2,27,13312,31232,3072,...,4096,0,16,0,0,45056,61440,32768,37432,0
2,8bc53c486cba7fca5ffe4dd43976cbaac6bfb24acc95d2...,332,78862,73728,258,8,0,73728,16384,0,...,4096,0,16,34112,0,78780,81920,0,94208,0
3,ff6d6d846bb0ef538a95836a52e6187c855cbf93e2fce3...,34404,5328,32768,39,2,27,32768,62976,3072,...,4096,0,16,0,0,61440,86016,49152,79160,0
4,c2563bb38ff3a2f9109febfc012afa329401a15aea4a1a...,34404,43856,41472,34,12,10,41472,17408,0,...,4096,0,16,49504,0,56260,69632,65536,0,0


In [4]:
df.tail()            # To check the end

Unnamed: 0,Name,Machine,AddressOfEntryPoint,SizeOfCode,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode.1,SizeOfInitializedData,SizeOfUninitializedData,...,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,DllCharacteristics,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity,Malflag
9927,b437e136a5a2e16fce380833475f84d912175149f7a0e2...,332,5248,145408,270,2,50,145408,46592,0,...,4096,0,16,0,0,191768,0,0,193024,1
9928,0b626ff78872af9976e86b331c860d9105ffd9f1509391...,332,55440,105984,258,10,0,105984,0,0,...,4096,0,16,0,190239,190383,196046,0,0,1
9929,69a35069b7951f06a9ca499db9a84f045433e1c7dc8a42...,332,77738,143360,8450,8,0,143360,61440,0,...,4096,0,16,320,179248,176652,204800,0,0,1
9930,b3c0aa1724caf5795d6248f20965cfdc6dca6a7ad1e07f...,332,110272,102912,270,2,50,102912,5632,0,...,4096,0,16,0,0,4224,114688,0,109056,1
9931,f3887457c3279fcec344da10411688b0b161b5fbc852a9...,332,77738,143360,8450,8,0,143360,61440,0,...,4096,0,16,320,178800,176204,204800,0,0,1


In [5]:
df.shape

(9932, 34)

## Data Preparation

In [6]:
df.drop(['Name', 'Machine'], axis=1, inplace=True)  # These two columns are obsolete in our study.

df = df.dropna(how='all')
display(df)

Unnamed: 0,AddressOfEntryPoint,SizeOfCode,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode.1,SizeOfInitializedData,SizeOfUninitializedData,BaseOfCode,ImageBase,...,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,DllCharacteristics,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity,Malflag
0,5328,9216,39,2,27,9216,25088,3072,4096,4194304,...,4096,0,16,0,0,36864,49152,24576,30464,0
1,5328,13312,39,2,27,13312,31232,3072,4096,4194304,...,4096,0,16,0,0,45056,61440,32768,37432,0
2,78862,73728,258,8,0,73728,16384,0,8192,4194304,...,4096,0,16,34112,0,78780,81920,0,94208,0
3,5328,32768,39,2,27,32768,62976,3072,4096,4194304,...,4096,0,16,0,0,61440,86016,49152,79160,0
4,43856,41472,34,12,10,41472,17408,0,4096,5368709120,...,4096,0,16,49504,0,56260,69632,65536,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9927,5248,145408,270,2,50,145408,46592,0,4096,4194304,...,4096,0,16,0,0,191768,0,0,193024,1
9928,55440,105984,258,10,0,105984,0,0,4096,4194304,...,4096,0,16,0,190239,190383,196046,0,0,1
9929,77738,143360,8450,8,0,143360,61440,0,4096,1677721600,...,4096,0,16,320,179248,176652,204800,0,0,1
9930,110272,102912,270,2,50,102912,5632,0,8192,4194304,...,4096,0,16,0,0,4224,114688,0,109056,1


In [7]:
train, temp = train_test_split(df, test_size=0.3, random_state=42)
valid, test = train_test_split(temp, test_size=0.33, random_state=42)

print("Size of train:", train.shape)
print("Size of valid:", valid.shape)
print("Size of test :", test.shape)

Size of train: (6952, 32)
Size of valid: (1996, 32)
Size of test : (984, 32)


In [8]:
def data_preparator(dataframe, os=False, scaler=False):
    # We extract all the data and put it appart from the label.
    X = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values
    
    # In case we want to scale it.
    if scaler:
        scaler = MinMaxScaler()
        X = scaler.fit_transform(X)

    # In case we want to over sample it.
    if os:
        smote = SMOTE(random_state=42)
        X, y = smote.fit_resample(X, y)
    
    data = np.hstack((X, np.reshape(y, (-1, 1))))
    
    return data, X, y

In [9]:
OSS = False

if OSS == False:
    train, xT, yT = data_preparator(train, os=False, scaler=False)
    valid, xV, yV = data_preparator(valid, os=False, scaler=False)
    test, xTest, yTest = data_preparator(test, os=False, scaler=False)
    print("OSS [OVERSAMPLER & SCALER] Desactivated")
else:
    train, xT, yT = data_preparator(train, os=True, scaler=True)
    valid, xV, yV = data_preparator(valid, os=False, scaler=True)
    test, xTest, yTest = data_preparator(test, os=False, scaler=True)
    print("OSS [OVERSAMPLER & SCALER] Activated")


OSS [OVERSAMPLER & SCALER] Desactivated


In [10]:
sum(yT==1)

6303

In [11]:
sum(yT==0)

649

In [12]:
display(yT)

array([1, 1, 1, ..., 1, 0, 1])

## Models

In this section we are going to generate, train, validate and test the desired ML models.

### kNN k-Nearest Neighbors

In [13]:
from sklearn.neighbors import KNeighborsClassifier

In [14]:
mBox = []

In [15]:
knnM = KNeighborsClassifier(n_neighbo--------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[72], line 1
----> 1 for name, (history, nnM) in NNBox:
      2     recall_values = history.history.get('recall', [])
      3     val_recall_values = history.history.get('val_recall', [])

ValueError: too many values to unpack (expected 2)rs=5)
knnM.fit(xT, yT)
yPredict = knnM.predict(xTest)

In [16]:
yPredict = knnM.predict(xTest)

In [17]:
print(classification_report(yTest, yPredict, digits=4))

              precision    recall  f1-score   support

           0     0.9667    0.9355    0.9508        93
           1     0.9933    0.9966    0.9950       891

    accuracy                         0.9909       984
   macro avg     0.9800    0.9661    0.9729       984
weighted avg     0.9908    0.9909    0.9908       984



In [18]:
mBox.append((knnM, 'knn5M'))

In [19]:
knn3M = KNeighborsClassifier(n_neighbors=3)
knn3M.fit(xT, yT)
yPredict = knn3M.predict(xTest)

In [20]:
print(classification_report(yTest, yPredict, digits=4))

              precision    recall  f1-score   support

           0     0.9886    0.9355    0.9613        93
           1     0.9933    0.9989    0.9961       891

    accuracy                         0.9929       984
   macro avg     0.9910    0.9672    0.9787       984
weighted avg     0.9929    0.9929    0.9928       984



In [21]:
mBox.append((knn3M, 'knn3M'))

In [22]:
knn7M = KNeighborsClassifier(n_neighbors=7)
knn7M.fit(xT, yT)
yPredict = knn7M.predict(xTest)

In [23]:
print(classification_report(yTest, yPredict, digits=4))

              precision    recall  f1-score   support

           0     0.9655    0.9032    0.9333        93
           1     0.9900    0.9966    0.9933       891

    accuracy                         0.9878       984
   macro avg     0.9777    0.9499    0.9633       984
weighted avg     0.9877    0.9878    0.9876       984



In [24]:
mBox.append((knn7M, 'knn7M'))

In [25]:
knn9M = KNeighborsClassifier(n_neighbors=9)
knn9M.fit(xT, yT)
yPredict = knn9M.predict(xTest)

In [26]:
print(classification_report(yTest, yPredict, digits=4))

              precision    recall  f1-score   support

           0     0.9770    0.9140    0.9444        93
           1     0.9911    0.9978    0.9944       891

    accuracy                         0.9898       984
   macro avg     0.9840    0.9559    0.9694       984
weighted avg     0.9898    0.9898    0.9897       984



In [27]:
mBox.append((knn9M, 'knn9M'))

In [28]:
knn11M = KNeighborsClassifier(n_neighbors=11)
knn11M.fit(xT, yT)
yPredict = knn11M.predict(xTest)

In [29]:
print(classification_report(yTest, yPredict, digits=4))

              precision    recall  f1-score   support

           0     0.9651    0.8925    0.9274        93
           1     0.9889    0.9966    0.9927       891

    accuracy                         0.9868       984
   macro avg     0.9770    0.9446    0.9601       984
weighted avg     0.9866    0.9868    0.9866       984



In [30]:
mBox.append((knn11M, 'knn11M'))

In [31]:
sqrtN = int(math.sqrt(df.shape[0]))
knnSqrNM = KNeighborsClassifier(n_neighbors=sqrtN)
knnSqrNM.fit(xT, yT)
yPredict = knnSqrNM.predict(xTest)

In [32]:
print(classification_report(yTest, yPredict, digits=4))

              precision    recall  f1-score   support

           0     0.9859    0.7527    0.8537        93
           1     0.9748    0.9989    0.9867       891

    accuracy                         0.9756       984
   macro avg     0.9804    0.8758    0.9202       984
weighted avg     0.9759    0.9756    0.9741       984



In [33]:
mBox.append((knnSqrNM , 'knnSqrtNM'))

### SVM Support Vector Machines

In [34]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC

In [35]:
from sklearn.svm import SVC

# Grid of hyperparameters
gammas = [1, 50, 100]
c_penalties = [1, 50, 100]
degrees = [1, 3, 5]
kernels = ['rbf', 'sigmoid']

# Array to sabe the best models
best_models = []


# Model training and saving
for gamma in gammas:
    for c in c_penalties:
        for degree in degrees:
            for kernel in kernels:
                model = SVC(gamma=gamma, C=c, degree=degree, kernel=kernel)
                model.fit(xT, yT)
                y_pred = model.predict(xV)
                
                rc = recall_score(yV, y_pred)

                name = "svc"+":G-"+str(gamma)+":C-"+str(c)+":dg-"+str(degree)+":kn-"+kernel
                # Saving data
                best_models.append((model, name, rc))



# Selection based on their recall and top three
best_models.sort(key=lambda x: x[2], reverse=True)
top_3_models = best_models[:3]

# Result printing
for i, (model, name, rc) in enumerate(top_3_models):
    print(f"Model {i}: {name}")
    print(f"Recall: {rc}")

    mBox.append((model, name))

Model 0: svc:G-1:C-1:dg-1:kn-rbf
Recall: 1.0
Model 1: svc:G-1:C-1:dg-1:kn-sigmoid
Recall: 1.0
Model 2: svc:G-1:C-1:dg-3:kn-rbf
Recall: 1.0


In [36]:
fit_intercepts = [True, False]
intercept_scaling = [0.1, 0.5, 1, 5.0, 10.0]
c_penalties = [0.1, 1, 10, 100, 1000]


# Array to sabe the best models
best_models = []


for fi in fit_intercepts:
    for int_s in intercept_scaling:
        for c in c_penalties:
            name = "svcL"+":INT_SCL-"+str(int_s)+":C-"+str(c)
            if (fi):
                name += ":FI-1:"
            else:
                name += ":FI-0:"
                
            model = LinearSVC(penalty='l2', C=c, loss='squared_hinge', fit_intercept=fi, intercept_scaling=int_s, dual='auto', max_iter=10000, tol=1e-5)
            try:
                with warnings.catch_warnings():
                    warnings.filterwarnings('error', category=ConvergenceWarning)
                    model.fit(xT, yT)
            except ConvergenceWarning:
                print(f"ConvergenceWarning /!\: Model {name} failed to converge.")
                continue
            
            y_pred = model.predict(xV)
            rc = recall_score(yV, y_pred)
                
            # Saving data
            best_models.append((model, name, rc))



# Selection based on their recall and top three
best_models.sort(key=lambda x: x[2], reverse=True)
top_3_models = best_models[:3]

# Result printing
for i, (model, name, rc) in enumerate(top_3_models):
    print(f"Model {i}: {name}")
    print(f"Recall: {rc}")

    mBox.append((model, name))

Model 0: svcL:INT_SCL-0.1:C-0.1:FI-1:
Recall: 0.9943693693693694
Model 1: svcL:INT_SCL-0.1:C-10:FI-1:
Recall: 0.9943693693693694
Model 2: svcL:INT_SCL-0.1:C-100:FI-1:
Recall: 0.9943693693693694


### RF Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
rfM = RandomForestClassifier()
rfM = rfM.fit(xT, yT)

In [39]:
yPredict = rfM.predict(xTest)

In [40]:
print(classification_report(yTest, yPredict))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98        93
           1       1.00      1.00      1.00       891

    accuracy                           1.00       984
   macro avg       1.00      0.98      0.99       984
weighted avg       1.00      1.00      1.00       984



In [41]:
mBox.append((rfM, 'RFOOB'))

In [42]:
stimators = [100, 1000, 5000]
criterions = ["gini", "entropy", "log_loss"]
class_weights = [None, 'balanced', 'balanced_subsample']


# Array to sabe the best models
best_models = []


for stimator in stimators:
    for criterion in criterions:
            for class_weight in class_weights:
                name = "rf"+":STM-"+str(stimator)+":CRT-"+str(criterion)+":CW-"+str(class_weight)
                model = RandomForestClassifier(n_estimators=stimator, criterion=criterion, class_weight=class_weight)
                model.fit(xT, yT)
                y_pred = model.predict(xV)
                rc = recall_score(yV, y_pred)
                    
                # Saving data
                best_models.append((model, name, rc))



# Selection based on their recall and top three
best_models.sort(key=lambda x: x[2], reverse=True)
top_3_models = best_models[:3]

# Result printing
for i, (model, name, rc) in enumerate(top_3_models):
    print(f"Model {i}: {name}")
    print(f"Recall: {rc}")

    mBox.append((model, name))

Model 0: rf:STM-100:CRT-gini:CW-balanced
Recall: 0.9994369369369369
Model 1: rf:STM-100:CRT-gini:CW-balanced_subsample
Recall: 0.9994369369369369
Model 2: rf:STM-100:CRT-entropy:CW-None
Recall: 0.9994369369369369


## DT Decision Tree

In [43]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [44]:
dtM = tree.DecisionTreeClassifier()
dtM = dtM.fit(xT, yT)

In [45]:
yPredict = dtM.predict(xTest)

In [46]:
print(classification_report(yTest, yPredict))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97        93
           1       1.00      1.00      1.00       891

    accuracy                           0.99       984
   macro avg       0.99      0.98      0.99       984
weighted avg       0.99      0.99      0.99       984



In [47]:
mBox.append((dtM, 'DTOOB'))

In [54]:
criterions = ["gini", "entropy", "log_loss"]
splitters = ["best", "random"]
max_features = ['sqrt', "log2", 0.3, 0.6, 0.9]


best_models = []

for criterion in criterions:
    for splitter in splitters:
            for max_feature in max_features:
                name = "dt"+":CRT-"+criterion+":SPLT-"+splitter+":CW-"+str(max_feature)
                model = tree.DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_features=max_feature)
                model.fit(xT, yT)
                
                y_pred = model.predict(xV)
                rc = recall_score(yV, y_pred)

                best_models.append((model, name, rc))



# Selection based on their recall and top three
best_models.sort(key=lambda x: x[2], reverse=True)
top_3_models = best_models[:3]

# Result printing
for i, (model, name, rc) in enumerate(top_3_models):
    print(f"Model {i}: {name}")
    print(f"Recall: {rc}")

    mBox.append((model, name))

Model 0: dt:CRT-log_loss:SPLT-random:CW-0.3
Recall: 0.9994369369369369
Model 1: dt:CRT-gini:SPLT-best:CW-0.6
Recall: 0.9988738738738738
Model 2: dt:CRT-entropy:SPLT-best:CW-log2
Recall: 0.9988738738738738


## NN Neural Network

In [16]:
import tensorflow as tf

In [20]:
def nnCreator(xT, yT, bSize, epochs, nNodes, doProb, lRate):
    scaler = StandardScaler()
    xT = scaler.fit_transform(xT)
    
    # Definición del modelo
    nnM = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(xT.shape[1],)),
        tf.keras.layers.Dense(nNodes, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(doProb),
        tf.keras.layers.Dense(nNodes, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(doProb),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # Compilación del modelo
    nnM.compile(optimizer=tf.keras.optimizers.Adam(lRate), loss='binary_crossentropy',
                metrics=['recall'])
    
    # Callbacks
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)
    
    # Entrenamiento del modelo
    history = nnM.fit(
        xT, yT, epochs=epochs, batch_size=bSize, validation_split=0.2, verbose=0,
        callbacks=[early_stopping, lr_scheduler]
    )

    return history, nnM

In [22]:
# Supongamos que ya tenemos xT, yT, xTest y yTest cargados y definidos

#bSizes = [32, 64, 128, 256]
#nNodes = [32, 64, 128]
#doProbs = [0.1, 0.3, 0.5]

bSizes = [128]
nNodes = [64]
doProbs = [0.2]


NNBox = [ ]

for bSize in bSizes:
    for nNode in nNodes:
        for doProb in doProbs:
            name = "nn"+":BS-"+str(bSize)+":NN-"+str(nNode)+":DOP-"+str(doProb)
            print(f'{name}')
            history, nnM = nnCreator(xT, yT, bSize, 100, nNode, doProb, 0.001)

            NNBox.append((name, (history, nnM)))

            

nn:BS-128:NN-64:DOP-0.2


In [23]:
for name, info in NNBox:
    recall_values = info[0].history.get('recall', [])
    val_recall_values = info[0].history.get('val_recall', [])

    if recall_values and val_recall_values:
        print(f"Modelo: {name}")
        print(f"Recall de Entrenamiento: {recall_values[-1]}")
        print(f"Recall de Validación: {val_recall_values[-1]}")
        print("=" * 50)
    else:
        print(f"Modelo: {name} no tiene recall registrado.")
        print("=" * 50)

Modelo: nn:BS-128:NN-64:DOP-0.2
Recall de Entrenamiento: 0.9984123706817627
Recall de Validación: 1.0


In [58]:
# Normalizar xTest utilizando el mismo escalador
xTest = scaler.transform(xTest)

In [61]:
# Predicción
yPredict = nnM.predict(xTest)
yPredict = (yPredict > 0.5).astype(int).reshape(-1,)
print(classification_report(yTest, yPredict))

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        93
           1       1.00      1.00      1.00       891

    accuracy                           1.00       984
   macro avg       1.00      0.98      0.99       984
weighted avg       1.00      1.00      1.00       984



In [None]:
#mBox.append((nnM, 'NN'))

## K-Fold Validation

In [None]:
accuracies = []
kf = KFold(n_splits=5, shuffle=True, random_state=33)

for m, name in mBox:
    validation_score = cross_val_score(m, xV, yV, cv=kf, scoring='recall')
    print(f'{name} model. Recall: {validation_score.mean()}. Standard Deviation: {validation_score.std()}.')
    accuracies.append(validation_score.mean())
    




## Generation of .pkl

In [None]:
import joblib