# AMAML

## General Imports
This section lists every import used in the notebook, except for the machine learning models, which are imported in their own sections.

In [155]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

from sklearn.metrics import classification_report

## Data Load
We load the file *data.csv* into a *dataframe* named **df**.
This *dataframe* is the one used by every model throughout the notebook.

In [156]:
# Read the dataset; the relative path is resolved against the notebook's working directory.
df = pd.read_csv('data.csv')

## Data Preview

In [157]:
df.head()  # Preview the first rows of the dataframe

Unnamed: 0,Name,Machine,AddressOfEntryPoint,SizeOfCode,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode.1,SizeOfInitializedData,SizeOfUninitializedData,BaseOfCode,ImageBase,FileAlignment,SizeOfImage,SizeOfHeaders,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,DllCharacteristics,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity,Malflag
0,PickerHost.exe,34404,22144,23552,34,12,10,23552,23552,512,4096,5368709120,512,65536,1024,10,0,10,0,10,0,524288,8192,1048576,4096,0,16,49632,0,44992,57344,53248,46592,0
1,WCA.exe,332,42590,34816,258,11,0,34816,2560,0,8192,4194304,512,65536,512,4,0,0,0,6,0,1048576,4096,1048576,4096,0,16,34144,0,42500,49152,0,37888,0
2,wmpconfig.exe,34404,5216,3072,34,12,10,3072,100864,0,4096,5368709120,512,122880,1024,10,0,10,0,10,0,524288,8192,1048576,4096,0,16,49504,0,9628,20480,16384,0,0
3,graywallsetup.exe,332,39508,37376,33167,2,25,37376,17408,0,4096,4194304,512,81920,1024,1,0,0,0,4,0,1048576,16384,1048576,4096,0,16,32768,0,53248,69632,0,0,0
4,qhelpconverter.exe,34404,77104,75264,34,9,0,75264,96256,0,4096,5368709120,512,184320,1024,5,2,0,0,5,2,1048576,4096,1048576,4096,0,16,33088,0,126644,176128,172032,0,0


In [158]:
df.tail()  # Preview the last rows of the dataframe

Unnamed: 0,Name,Machine,AddressOfEntryPoint,SizeOfCode,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode.1,SizeOfInitializedData,SizeOfUninitializedData,BaseOfCode,ImageBase,FileAlignment,SizeOfImage,SizeOfHeaders,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,DllCharacteristics,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity,Malflag
6969,VirusShare_cb9053a96092b96636a131416baced8d.exe,332,79418,147456,8450,8,0,147456,61440,0,4096,1677721600,4096,229376,4096,4,0,0,0,4,0,1048576,4096,1048576,4096,0,16,320,184000,181400,208896,0,0,1
6970,ac453efe6e94638db3dcf7c4e271e92e3a20052b.exe,332,5344,5632,271,10,0,5632,357376,0,4096,4194304,512,823296,1024,4,0,0,0,4,0,1048576,4096,1048576,4096,0,16,0,0,12464,466944,0,0,1
6971,VirusShare_408df88ad1ef7b6b83938adfdde69651.exe,332,5925,65536,271,7,0,65536,98304,0,4096,4194304,4096,167936,4096,4,0,0,0,4,0,1048576,4096,1048576,4096,0,16,0,0,151292,159744,0,0,1
6972,599c02241340897e19d575b2b295090cbee0847d.exe,332,5776,36864,271,6,0,36864,393216,0,4096,4194304,4096,823296,4096,4,0,0,0,4,0,1048576,4096,1048576,4096,0,16,0,58480,56868,434176,0,0,1
6973,VirusShare_8441d32f80ef2c7772b25db4b95aab4f.exe,332,160750,152576,258,11,0,152576,72704,0,8192,4194304,512,253952,1024,4,0,0,0,4,0,1048576,4096,1048576,4096,0,16,34112,0,160660,172032,0,0,1


## Data Preparation

In [159]:
# 'Name' and 'Machine' are not used as features in this study, so we remove them.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError on the
# already-removed columns.
df = df.drop(columns=['Name', 'Machine'], errors='ignore')
display(df)

Unnamed: 0,AddressOfEntryPoint,SizeOfCode,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode.1,SizeOfInitializedData,SizeOfUninitializedData,BaseOfCode,ImageBase,FileAlignment,SizeOfImage,SizeOfHeaders,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,DllCharacteristics,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity,Malflag
0,22144,23552,34,12,10,23552,23552,512,4096,5368709120,512,65536,1024,10,0,10,0,10,0,524288,8192,1048576,4096,0,16,49632,0,44992,57344,53248,46592,0
1,42590,34816,258,11,0,34816,2560,0,8192,4194304,512,65536,512,4,0,0,0,6,0,1048576,4096,1048576,4096,0,16,34144,0,42500,49152,0,37888,0
2,5216,3072,34,12,10,3072,100864,0,4096,5368709120,512,122880,1024,10,0,10,0,10,0,524288,8192,1048576,4096,0,16,49504,0,9628,20480,16384,0,0
3,39508,37376,33167,2,25,37376,17408,0,4096,4194304,512,81920,1024,1,0,0,0,4,0,1048576,16384,1048576,4096,0,16,32768,0,53248,69632,0,0,0
4,77104,75264,34,9,0,75264,96256,0,4096,5368709120,512,184320,1024,5,2,0,0,5,2,1048576,4096,1048576,4096,0,16,33088,0,126644,176128,172032,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6969,79418,147456,8450,8,0,147456,61440,0,4096,1677721600,4096,229376,4096,4,0,0,0,4,0,1048576,4096,1048576,4096,0,16,320,184000,181400,208896,0,0,1
6970,5344,5632,271,10,0,5632,357376,0,4096,4194304,512,823296,1024,4,0,0,0,4,0,1048576,4096,1048576,4096,0,16,0,0,12464,466944,0,0,1
6971,5925,65536,271,7,0,65536,98304,0,4096,4194304,4096,167936,4096,4,0,0,0,4,0,1048576,4096,1048576,4096,0,16,0,0,151292,159744,0,0,1
6972,5776,36864,271,6,0,36864,393216,0,4096,4194304,4096,823296,4096,4,0,0,0,4,0,1048576,4096,1048576,4096,0,16,0,58480,56868,434176,0,0,1


In [160]:
# Shuffle once with a fixed seed so the split (and every metric computed from it)
# is reproducible, then cut the rows into 70% train / 20% validation / 10% test.
# Plain .iloc slicing replaces np.split, which emits a warning when given a DataFrame.
SPLIT_SEED = 42
shuffled = df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(0.7 * len(shuffled))
valid_end = int(0.9 * len(shuffled))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

  return bound(*args, **kwds)


In [161]:
def scale_dataset(dataframe, oversample=False):
    """Split a dataframe into NumPy feature/label arrays.

    The last column of ``dataframe`` is taken as the label; every other
    column is taken as a feature.

    Note: despite the name, no scaling is applied, and ``oversample`` is
    accepted but currently has no effect (standardisation and random
    oversampling are both disabled).

    Returns a tuple ``(data, X, y)`` where ``X`` is the 2-D feature array,
    ``y`` is the 1-D label array and ``data`` is ``X`` with ``y`` appended
    as a final column.
    """
    feature_columns = dataframe.columns[:-1]
    label_column = dataframe.columns[-1]

    features = dataframe[feature_columns].values
    labels = dataframe[label_column].values

    # Turn the labels into a single column so they can be stacked next to the features.
    label_as_column = labels.reshape(-1, 1)
    combined = np.hstack((features, label_as_column))

    return combined, features, labels

In [162]:
# Convert each split into NumPy arrays: (features + label, features only, label only).
# Note: `train`, `valid` and `test` are rebound here from DataFrames to arrays, so this
# cell cannot be re-run without first re-running the cell that creates the splits.
# Note: `oversample` is currently ignored by scale_dataset (its resampling step is disabled).
train, X_train, y_train = scale_dataset(train, oversample=True)
valid, X_valid, y_valid = scale_dataset(valid, oversample=False)
test, X_test, y_test = scale_dataset(test, oversample=False)

In [163]:
display(X_train)  # Feature matrix of the training split

array([[  5744,  36864,    271, ..., 434176,      0,      0],
       [110256, 102912,    270, ..., 114688,      0, 109056],
       [  6976,  28672,    271, ..., 819200,      0,      0],
       ...,
       [ 94052,  92672,    290, ..., 110592,      0, 109056],
       [  5088, 117760,    270, ...,      0,      0, 162304],
       [  5328,  17920,     39, ...,  65536,  36864,  44328]])

In [164]:
display(y_train)  # Label vector of the training split

array([1, 1, 1, ..., 0, 1, 0])

## Models

### kNN

In [165]:
from sklearn.neighbors import KNeighborsClassifier

In [166]:
# Fit a k-nearest-neighbours classifier (k = 5) on the training features.
# NOTE(review): the features are passed in unscaled (scaling is disabled in scale_dataset);
# kNN is distance-based, so confirm that this is intended.
knnM = KNeighborsClassifier(n_neighbors=5)
knnM.fit(X_train, y_train)

In [167]:
y_pred = knnM.predict(X_test)  # Predicted labels for the test split

In [168]:
# Per-class precision, recall and F1-score of the kNN predictions on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95        58
           1       1.00      0.99      1.00       640

    accuracy                           0.99       698
   macro avg       0.96      0.99      0.97       698
weighted avg       0.99      0.99      0.99       698

