# AMAML

## General Imports
In this section, besides the machine learning model, every import used will be shown.

In [101]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

from sklearn.metrics import classification_report

## Data Load
We will construct a *dataframe* called **df**. This *dataframe* will be used all along in the models.
It will be built from the file *data.csv*. The *data.csv* file is generated by *datagen.py*

In [102]:
df = pd.read_csv('data.csv')

## Data Preview

Let's check how the data is formatted after being loaded using some functions.

In [103]:
df.head()             # To check the beginning

Unnamed: 0,Name,Machine,AddressOfEntryPoint,SizeOfCode,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode.1,SizeOfInitializedData,SizeOfUninitializedData,...,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,DllCharacteristics,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity,Malflag
0,PickerHost.exe,34404,22144,23552,34,12,10,23552,23552,512,...,4096,0,16,49632,0,44992,57344,53248,46592,0
1,WCA.exe,332,42590,34816,258,11,0,34816,2560,0,...,4096,0,16,34144,0,42500,49152,0,37888,0
2,iconv.exe,34404,5376,22528,559,2,25,22528,18944,3584,...,4096,0,16,0,0,57344,69632,45056,0,0
3,wmpconfig.exe,34404,5216,3072,34,12,10,3072,100864,0,...,4096,0,16,49504,0,9628,20480,16384,0,0
4,graywallsetup.exe,332,39508,37376,33167,2,25,37376,17408,0,...,4096,0,16,32768,0,53248,69632,0,0,0


In [104]:
df.tail()            # To check the end

Unnamed: 0,Name,Machine,AddressOfEntryPoint,SizeOfCode,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode.1,SizeOfInitializedData,SizeOfUninitializedData,...,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,DllCharacteristics,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity,Malflag
9945,VirusShare_6a00e1b5feebc83783f4dcf04fce5fc9.exe,332,8185,9728,259,10,0,9728,27648,0,...,4096,0,16,33088,0,17372,24576,0,0,1
9946,ac453efe6e94638db3dcf7c4e271e92e3a20052b.exe,332,5344,5632,271,10,0,5632,357376,0,...,4096,0,16,0,0,12464,466944,0,0,1
9947,VirusShare_408df88ad1ef7b6b83938adfdde69651.exe,332,5925,65536,271,7,0,65536,98304,0,...,4096,0,16,0,0,151292,159744,0,0,1
9948,599c02241340897e19d575b2b295090cbee0847d.exe,332,5776,36864,271,6,0,36864,393216,0,...,4096,0,16,0,58480,56868,434176,0,0,1
9949,VirusShare_8441d32f80ef2c7772b25db4b95aab4f.exe,332,160750,152576,258,11,0,152576,72704,0,...,4096,0,16,34112,0,160660,172032,0,0,1


## Data Preparation

In [105]:
df.drop(['Name', 'Machine'], axis=1, inplace=True)         # These two columns are obsolete in our study.
display(df)

Unnamed: 0,AddressOfEntryPoint,SizeOfCode,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode.1,SizeOfInitializedData,SizeOfUninitializedData,BaseOfCode,ImageBase,...,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,DllCharacteristics,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity,Malflag
0,22144,23552,34,12,10,23552,23552,512,4096,5368709120,...,4096,0,16,49632,0,44992,57344,53248,46592,0
1,42590,34816,258,11,0,34816,2560,0,8192,4194304,...,4096,0,16,34144,0,42500,49152,0,37888,0
2,5376,22528,559,2,25,22528,18944,3584,4096,4194304,...,4096,0,16,0,0,57344,69632,45056,0,0
3,5216,3072,34,12,10,3072,100864,0,4096,5368709120,...,4096,0,16,49504,0,9628,20480,16384,0,0
4,39508,37376,33167,2,25,37376,17408,0,4096,4194304,...,4096,0,16,32768,0,53248,69632,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9945,8185,9728,259,10,0,9728,27648,0,4096,4194304,...,4096,0,16,33088,0,17372,24576,0,0,1
9946,5344,5632,271,10,0,5632,357376,0,4096,4194304,...,4096,0,16,0,0,12464,466944,0,0,1
9947,5925,65536,271,7,0,65536,98304,0,4096,4194304,...,4096,0,16,0,0,151292,159744,0,0,1
9948,5776,36864,271,6,0,36864,393216,0,4096,4194304,...,4096,0,16,0,58480,56868,434176,0,0,1


In [106]:
train, valid, test = np.split(df.sample(frac=1), [int(0.7*len(df)), int(0.9*len(df))])  # We will divide the data in three

  return bound(*args, **kwds)


In [107]:
def data_preparator(dataframe, os=False, scaler=False):
    # We extract all the data and put it appart from the label.
    X = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    # In case we want to scale it.
    if scaler:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)

    # In case we want to over sample it.
    if os:
        ros = RandomOverSampler()
        X, y = ros.fit_resample(X, y)
    
    data = np.hstack((X, np.reshape(y, (-1, 1))))
    
    return data, X, y

In [108]:
train, xT, yT = data_preparator(train)
valid, xV, yV = data_preparator(valid)
test, xTest, yTest = data_preparator(test)

In [109]:
display(xT)

array([[  5540,   5632,    271, ..., 438272,      0,      0],
       [427070,  60928,    258, ..., 428032,      0,      0],
       [  5492,   5632,    271, ..., 393216, 389120,      0],
       ...,
       [  7018,  28672,    271, ..., 819200,      0,      0],
       [  5328,  19456,     39, ...,  69632,  36864,  55848],
       [ 97393, 147456,    271, ..., 212992,      0, 200704]])

In [110]:
display(yT)

array([1, 1, 1, ..., 1, 0, 0])

## Models

In this section we are going to generate, train, validate and test the desired ML models.

### kNN k-Nearest Neighbors

In [111]:
from sklearn.neighbors import KNeighborsClassifier

In [112]:
knnM = KNeighborsClassifier(n_neighbors=5)
knnM.fit(xT, yT)

In [113]:
yPredict = knnM.predict(xTest)

In [114]:
print(classification_report(yTest, yPredict))

              precision    recall  f1-score   support

           0       0.94      0.87      0.91        95
           1       0.99      0.99      0.99       900

    accuracy                           0.98       995
   macro avg       0.96      0.93      0.95       995
weighted avg       0.98      0.98      0.98       995



Accuracy:

Recall: 

In [115]:
### SVM Support Vector Machines

In [116]:
from sklearn.svm import SVC

In [117]:
svmM = SVC()
svmM = svcM.fit(xT, yT)

In [118]:
yPredict = svmM.predict(xTest)

In [119]:
print(classification_report(yTest, yPredict))

              precision    recall  f1-score   support

           0       1.00      0.22      0.36        95
           1       0.92      1.00      0.96       900

    accuracy                           0.93       995
   macro avg       0.96      0.61      0.66       995
weighted avg       0.93      0.93      0.90       995



### NB Naive Bayes

In [120]:
from sklearn.naive_bayes import GaussianNB

In [121]:
nbM = GaussianNB()
nbM = nbM.fit(xT, yT)

In [122]:
yPredict = nbM.predict(xTest)

In [123]:
print(classification_report(yTest, yPredict))

              precision    recall  f1-score   support

           0       0.35      0.75      0.47        95
           1       0.97      0.85      0.91       900

    accuracy                           0.84       995
   macro avg       0.66      0.80      0.69       995
weighted avg       0.91      0.84      0.87       995



### RF Random Forest

In [124]:
from sklearn.ensemble import RandomForestClassifier

In [125]:
rfM = RandomForestClassifier()
rfM = rfM.fit(xT, yT)

In [126]:
yPredict = rfM.predict(xTest)

In [127]:
print(classification_report(yTest, yPredict))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        95
           1       1.00      1.00      1.00       900

    accuracy                           1.00       995
   macro avg       0.99      0.99      0.99       995
weighted avg       1.00      1.00      1.00       995

