# AMAML

## General Imports
In this section, besides each machine learning model, every import used will be shown.

In [492]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report

## Data Load
We will construct a *dataframe* called **df**. This *dataframe* will be used all along in the models.
It will be built from the file *data.csv*. The *data.csv* file is generated by *datagen.py*

In [493]:
df = pd.read_csv('data.csv')

## Data Preview

Let's check how the data is formatted after being loaded using some functions.

In [494]:
df.head()             # To check the beginning

Unnamed: 0,Name,Machine,AddressOfEntryPoint,SizeOfCode,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode.1,SizeOfInitializedData,SizeOfUninitializedData,...,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,DllCharacteristics,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity,Malflag
0,PickerHost.exe,34404,22144,23552,34,12,10,23552,23552,512,...,4096,0,16,49632,0,44992,57344,53248,46592,0
1,WCA.exe,332,42590,34816,258,11,0,34816,2560,0,...,4096,0,16,34144,0,42500,49152,0,37888,0
2,iconv.exe,34404,5376,22528,559,2,25,22528,18944,3584,...,4096,0,16,0,0,57344,69632,45056,0,0
3,wmpconfig.exe,34404,5216,3072,34,12,10,3072,100864,0,...,4096,0,16,49504,0,9628,20480,16384,0,0
4,graywallsetup.exe,332,39508,37376,33167,2,25,37376,17408,0,...,4096,0,16,32768,0,53248,69632,0,0,0


In [495]:
df.tail()            # To check the end

Unnamed: 0,Name,Machine,AddressOfEntryPoint,SizeOfCode,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode.1,SizeOfInitializedData,SizeOfUninitializedData,...,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,DllCharacteristics,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity,Malflag
9945,VirusShare_6a00e1b5feebc83783f4dcf04fce5fc9.exe,332,8185,9728,259,10,0,9728,27648,0,...,4096,0,16,33088,0,17372,24576,0,0,1
9946,ac453efe6e94638db3dcf7c4e271e92e3a20052b.exe,332,5344,5632,271,10,0,5632,357376,0,...,4096,0,16,0,0,12464,466944,0,0,1
9947,VirusShare_408df88ad1ef7b6b83938adfdde69651.exe,332,5925,65536,271,7,0,65536,98304,0,...,4096,0,16,0,0,151292,159744,0,0,1
9948,599c02241340897e19d575b2b295090cbee0847d.exe,332,5776,36864,271,6,0,36864,393216,0,...,4096,0,16,0,58480,56868,434176,0,0,1
9949,VirusShare_8441d32f80ef2c7772b25db4b95aab4f.exe,332,160750,152576,258,11,0,152576,72704,0,...,4096,0,16,34112,0,160660,172032,0,0,1


In [496]:
df.shape

(9950, 34)

## Data Preparation

In [497]:
df.drop(['Name', 'Machine'], axis=1, inplace=True)         # These two columns are obsolete in our study.
display(df)

Unnamed: 0,AddressOfEntryPoint,SizeOfCode,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode.1,SizeOfInitializedData,SizeOfUninitializedData,BaseOfCode,ImageBase,...,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,DllCharacteristics,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity,Malflag
0,22144,23552,34,12,10,23552,23552,512,4096,5368709120,...,4096,0,16,49632,0,44992,57344,53248,46592,0
1,42590,34816,258,11,0,34816,2560,0,8192,4194304,...,4096,0,16,34144,0,42500,49152,0,37888,0
2,5376,22528,559,2,25,22528,18944,3584,4096,4194304,...,4096,0,16,0,0,57344,69632,45056,0,0
3,5216,3072,34,12,10,3072,100864,0,4096,5368709120,...,4096,0,16,49504,0,9628,20480,16384,0,0
4,39508,37376,33167,2,25,37376,17408,0,4096,4194304,...,4096,0,16,32768,0,53248,69632,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9945,8185,9728,259,10,0,9728,27648,0,4096,4194304,...,4096,0,16,33088,0,17372,24576,0,0,1
9946,5344,5632,271,10,0,5632,357376,0,4096,4194304,...,4096,0,16,0,0,12464,466944,0,0,1
9947,5925,65536,271,7,0,65536,98304,0,4096,4194304,...,4096,0,16,0,0,151292,159744,0,0,1
9948,5776,36864,271,6,0,36864,393216,0,4096,4194304,...,4096,0,16,0,58480,56868,434176,0,0,1


In [498]:
# train, valid, test = np.split(df.sample(frac=1), [int(0.7*len(df)), int(0.9*len(df))])  # We will divide the data in three

train, temp = train_test_split(df, test_size=0.3, random_state=42)
valid, test = train_test_split(temp, test_size=0.33, random_state=42)

print("Size of train:", train.shape)
print("Size of valid:", valid.shape)
print("Size of test :", test.shape)

Size of train: (6965, 32)
Size of valid: (1999, 32)
Size of test : (986, 32)


In [499]:
def data_preparator(dataframe, os=False, scaler=False):
    # We extract all the data and put it appart from the label.
    X = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    # In case we want to scale it.
    if scaler:
        scaler = MinMaxScaler()
        X = scaler.fit_transform(X)

    # In case we want to over sample it.
    if os:
        smote = SMOTE(random_state=33)
        X, y = smote.fit_resample(X, y)
    
    data = np.hstack((X, np.reshape(y, (-1, 1))))
    
    return data, X, y

In [500]:
OSS = False

if OSS == False:
    train, xT, yT = data_preparator(train, os=False, scaler=False)
    valid, xV, yV = data_preparator(valid, os=False, scaler=False)
    test, xTest, yTest = data_preparator(test, os=False, scaler=False)
    print("OSS [OVERSAMPLER & SCALER] Desactivated")
else:
    train, xT, yT = data_preparator(train, os=True, scaler=True)
    valid, xV, yV = data_preparator(valid, os=False, scaler=True)
    test, xTest, yTest = data_preparator(test, os=False, scaler=True)
    print("OSS [OVERSAMPLER & SCALER] Activated")


OSS [OVERSAMPLER & SCALER] Desactivated


In [501]:
sum(yT==1)

6301

In [502]:
sum(yT==0)

664

In [503]:
display(yT)

array([0, 1, 1, ..., 1, 0, 1])

## Models

In this section we are going to generate, train, validate and test the desired ML models.

### kNN k-Nearest Neighbors

In [504]:
from sklearn.neighbors import KNeighborsClassifier

In [505]:
knnM = KNeighborsClassifier(n_neighbors=5)
knnM.fit(xT, yT)

In [506]:
yPredict = knnM.predict(xTest)

In [507]:
print(classification_report(yTest, yPredict))

              precision    recall  f1-score   support

           0       0.97      0.91      0.94       112
           1       0.99      1.00      0.99       874

    accuracy                           0.99       986
   macro avg       0.98      0.95      0.97       986
weighted avg       0.99      0.99      0.99       986



### SVM Support Vector Machines

In [508]:
from sklearn.svm import SVC

In [509]:
svmM = SVC()
svmM = svmM.fit(xT, yT)

In [510]:
yPredict = svmM.predict(xTest)

In [511]:
print(classification_report(yTest, yPredict))

              precision    recall  f1-score   support

           0       1.00      0.18      0.30       112
           1       0.90      1.00      0.95       874

    accuracy                           0.91       986
   macro avg       0.95      0.59      0.63       986
weighted avg       0.92      0.91      0.88       986



### RF Random Forest

In [512]:
from sklearn.ensemble import RandomForestClassifier

In [513]:
rfM = RandomForestClassifier()
rfM = rfM.fit(xT, yT)

In [514]:
yPredict = rfM.predict(xTest)

In [515]:
print(classification_report(yTest, yPredict))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       112
           1       1.00      1.00      1.00       874

    accuracy                           1.00       986
   macro avg       0.99      0.99      0.99       986
weighted avg       1.00      1.00      1.00       986



## DT Decision Tree

In [516]:
from sklearn import tree

In [517]:
dtM = tree.DecisionTreeClassifier()
dtM = dtM.fit(xT, yT)

In [518]:
yPredict = dtM.predict(xTest)

In [519]:
print(classification_report(yTest, yPredict))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       112
           1       1.00      0.99      1.00       874

    accuracy                           0.99       986
   macro avg       0.98      0.99      0.98       986
weighted avg       0.99      0.99      0.99       986



In [520]:
## NN Neural Network

In [521]:
import tensorflow as tf

In [522]:
def nnCreator(xT, yT, bSize, epochs, nNodes, doProb, lRate):
  nnM = tf.keras.Sequential([
      tf.keras.layers.Input(shape=(31,)),
      tf.keras.layers.Dense(nNodes, activation='relu'),
      tf.keras.layers.Dropout(doProb),
      tf.keras.layers.Dense(nNodes, activation='relu'),
      tf.keras.layers.Dropout(doProb),
      tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  nnM.compile(optimizer=tf.keras.optimizers.Adam(lRate), loss='binary_crossentropy',
              metrics=['accuracy'])
    
  history = nnM.fit(
    xT, yT, epochs=epochs, batch_size=bSize, validation_split=0.2, verbose=0
  )

  return history, nnM

In [523]:
history, nnM = nnCreator(xT, yT, 128, 100, 64, 0.1, 0.001)

In [524]:
yPredict = nnM.predict(xTest)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [525]:
yPredict = (yPredict > 0.5).astype(int).reshape(-1,)
print(classification_report(yTest, yPredict))

              precision    recall  f1-score   support

           0       1.00      0.03      0.05       112
           1       0.89      1.00      0.94       874

    accuracy                           0.89       986
   macro avg       0.94      0.51      0.50       986
weighted avg       0.90      0.89      0.84       986

