In [48]:
import pandas as pd

malData = pd.read_csv("MalwareData.csv", sep = '|')

legit = malData[0: 41323].drop(["legitimate"], axis = 1)
mal = malData[41323::].drop(["legitimate"], axis = 1)

print("The shape of the legit dataset is: %s samples, %s features"%(legit.shape[0],legit.shape[1]))
print("The shape of the malware dataset is: %s samples, %s features"%(mal.shape[0],mal.shape[1]))

The shape of the legit dataset is: 41323 samples, 56 features
The shape of the malware dataset is: 96724 samples, 56 features


In [49]:
print(malData.columns)

Index(['Name', 'md5', 'Machine', 'SizeOfOptionalHeader', 'Characteristics',
       'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',
       'SizeOfInitializedData', 'SizeOfUninitializedData',
       'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase',
       'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion',
       'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',
       'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',
       'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics',
       'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve',
       'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'SectionsNb',
       'SectionsMeanEntropy', 'SectionsMinEntropy', 'SectionsMaxEntropy',
       'SectionsMeanRawsize', 'SectionsMinRawsize', 'SectionMaxRawsize',
       'SectionsMeanVirtualsize', 'SectionsMinVirtualsize',
       'SectionMaxVirtualsize', 'ImportsNbDLL', 'ImportsNb',
       'Impor

In [16]:
print(malData.head(5))

           Name                               md5  Machine  \
0   memtest.exe  631ea355665f28d4707448e442fbf5b8      332   
1       ose.exe  9d10f99a6712e28f8acd5641e3a7ea6b      332   
2     setup.exe  4d92f518527353c0db88a70fddcfd390      332   
3      DW20.EXE  a41e524f8d45f0074fd07805ff0c9b12      332   
4  dwtrig20.exe  c87e561258f2f8650cef999bf643a731      332   

   SizeOfOptionalHeader  Characteristics  MajorLinkerVersion  \
0                   224              258                   9   
1                   224             3330                   9   
2                   224             3330                   9   
3                   224              258                   9   
4                   224              258                   9   

   MinorLinkerVersion  SizeOfCode  SizeOfInitializedData  \
0                   0      361984                 115712   
1                   0      130560                  19968   
2                   0      517120                 621568   
3 

In [50]:
print(legit.take([1]))

      Name                               md5  Machine  SizeOfOptionalHeader  \
1  ose.exe  9d10f99a6712e28f8acd5641e3a7ea6b      332                   224   

   Characteristics  MajorLinkerVersion  MinorLinkerVersion  SizeOfCode  \
1             3330                   9                   0      130560   

   SizeOfInitializedData  SizeOfUninitializedData  ...  ExportNb  ResourcesNb  \
1                  19968                        0  ...         0            2   

   ResourcesMeanEntropy  ResourcesMinEntropy  ResourcesMaxEntropy  \
1              4.250461             3.420744             5.080177   

   ResourcesMeanSize  ResourcesMinSize  ResourcesMaxSize  \
1              837.0               518              1156   

   LoadConfigurationSize  VersionInformationSize  
1                     72                      18  

[1 rows x 56 columns]


In [51]:
print(mal.take([1]))

                                              Name  \
41324  VirusShare_9bd57c8252948bd2fa651ad372bd4f13   

                                    md5  Machine  SizeOfOptionalHeader  \
41324  9bd57c8252948bd2fa651ad372bd4f13      332                   224   

       Characteristics  MajorLinkerVersion  MinorLinkerVersion  SizeOfCode  \
41324              271                   6                   0       24064   

       SizeOfInitializedData  SizeOfUninitializedData  ...  ExportNb  \
41324                 164864                     1024  ...         0   

       ResourcesNb  ResourcesMeanEntropy  ResourcesMinEntropy  \
41324            6              3.199107             1.971335   

       ResourcesMaxEntropy  ResourcesMeanSize  ResourcesMinSize  \
41324             5.214816              452.0                34   

       ResourcesMaxSize  LoadConfigurationSize  VersionInformationSize  
41324               958                      0                      15  

[1 rows x 56 columns]


In [52]:
from sklearn.ensemble import ExtraTreesClassifier  #for optimizing the dataset
from sklearn.feature_selection import SelectFromModel  #to improve accuracy
from sklearn.model_selection import train_test_split  #to split the data
from sklearn.model_selection import cross_validate

In [40]:
data_in = malData.drop(['Name', 'md5', 'legitimate'], axis=1)
labels = malData['legitimate'].values
extratrees = ExtraTreesClassifier().fit(data_in,labels)
select = SelectFromModel(extratrees,prefit=True)
data_in_new = select.transform(data_in)
print(data_in.shape,data_in_new.shape)

(138047, 54) (138047, 13)




In [53]:
import numpy as np
features = data_in_new.shape[1]
importances = extratrees.feature_importances_
indices = np.argsort(importances)[::-1]

for f in range(features):
    print("%d"%(f+1),malData.columns[2+indices[f]],importances[indices[f]])

1 DllCharacteristics 0.14722983963814051
2 Machine 0.10473132530857367
3 Characteristics 0.08486199197901811
4 Subsystem 0.07347288909125349
5 SectionsMaxEntropy 0.06346623619906935
6 VersionInformationSize 0.0595969054996743
7 ResourcesMinEntropy 0.054438347702828904
8 ImageBase 0.05286861761130687
9 MajorSubsystemVersion 0.05122258703574621
10 ResourcesMaxEntropy 0.043179161738146864
11 SizeOfOptionalHeader 0.030081481890446307
12 MajorOperatingSystemVersion 0.02750674374766495
13 SizeOfStackReserve 0.02646452260790842


In [54]:
from sklearn.ensemble import RandomForestClassifier
legit_train, legit_test, mal_train, mal_test = train_test_split(data_in_new, labels, test_size=0.2)
classif = RandomForestClassifier(n_estimators=50)

classif.fit(legit_train,mal_train)

In [56]:
print("The score of the algorithm: ",classif.score(legit_test,mal_test)*100)

The score of the algorithm:  99.34806229626946


In [60]:
from sklearn.metrics import confusion_matrix

result = classif.predict(legit_test)
conf_mat = confusion_matrix(mal_test,result)

In [61]:
conf_mat.shape

(2, 2)

In [62]:
type(conf_mat)

numpy.ndarray

In [63]:
conf_mat

array([[19213,    95],
       [   85,  8217]], dtype=int64)

In [67]:
print("false_positives: ",conf_mat[0][1]/sum(conf_mat[0])*100)
print("false_negatives: ",conf_mat[1][0]/sum(conf_mat[1])*100)

false_positives:  0.492024031489538
false_negatives:  1.0238496747771622


In [69]:
from sklearn.ensemble import GradientBoostingClassifier

grad_boost =GradientBoostingClassifier(n_estimators=50)
grad_boost.fit(legit_train,mal_train)

In [70]:
print("The score of the Gradient Boosting Classifier is: ",grad_boost.score(legit_test,mal_test)*100)

The score of the Gradient Boosting Classifier is:  98.79029337196667
