<h1>ANALYSING THE DATASET</h1>

In [4]:
import pandas as pd
data = pd.read_csv("MalwareData.csv", sep="|")

ModuleNotFoundError: No module named 'pandas'

In [None]:
data.head()

Unnamed: 0,Name,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,...,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize,legitimate
0,memtest.exe,631ea355665f28d4707448e442fbf5b8,332,224,258,9,0,361984,115712,0,...,4,3.262823,2.568844,3.537939,8797.0,216,18032,0,16,1
1,ose.exe,9d10f99a6712e28f8acd5641e3a7ea6b,332,224,3330,9,0,130560,19968,0,...,2,4.250461,3.420744,5.080177,837.0,518,1156,72,18,1
2,setup.exe,4d92f518527353c0db88a70fddcfd390,332,224,3330,9,0,517120,621568,0,...,11,4.426324,2.846449,5.271813,31102.272727,104,270376,72,18,1
3,DW20.EXE,a41e524f8d45f0074fd07805ff0c9b12,332,224,258,9,0,585728,369152,0,...,10,4.364291,2.669314,6.40072,1457.0,90,4264,72,18,1
4,dwtrig20.exe,c87e561258f2f8650cef999bf643a731,332,224,258,9,0,294912,247296,0,...,2,4.3061,3.421598,5.190603,1074.5,849,1300,72,18,1


In [None]:
legitimate = data[0:41323].drop(["legitimate"], axis=1)
malware = data[41323:].drop(["legitimate"], axis=1)

In [None]:
print("Shape of legitimate dataset is %s samples with %s features" %(legitimate.shape[0], legitimate.shape[1]))
print("Shape of malware dataset is %s samples with %s features" %(malware.shape[0], malware.shape[1]))

Shape of legitimate dataset is 41323 samples with 56 features
Shape of malware dataset is 96724 samples with 56 features


<h1>CLASSIFIER</h1>


In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [None]:
data_input = data.drop(['Name', 'md5', 'legitimate', 'ResourcesMinSize', 'ResourcesMinEntropy', 'ResourcesMeanSize', 'ResourcesMaxSize', 'ResourcesMinSize', 'ResourcesMaxEntropy', 'ResourcesMinEntropy'], axis=1).values
labels = data["legitimate"].values
extratrees = ExtraTreesClassifier().fit(data_input, labels)
select = SelectFromModel(extratrees, prefit=True)
data_input_new = select.transform(data_input)

In [None]:
data_input_new

array([[3.32000000e+02, 2.24000000e+02, 2.58000000e+02, ...,
        5.76680655e+00, 7.22105073e+00, 1.60000000e+01],
       [3.32000000e+02, 2.24000000e+02, 3.33000000e+03, ...,
        4.83968794e+00, 6.56690933e+00, 1.80000000e+01],
       [3.32000000e+02, 2.24000000e+02, 3.33000000e+03, ...,
        6.40955753e+00, 7.60095678e+00, 1.80000000e+01],
       ...,
       [3.32000000e+02, 2.24000000e+02, 2.58000000e+02, ...,
        5.65942634e+00, 7.97742342e+00, 1.40000000e+01],
       [3.32000000e+02, 2.24000000e+02, 3.31660000e+04, ...,
        3.01268192e+00, 6.43118768e+00, 0.00000000e+00],
       [3.32000000e+02, 2.24000000e+02, 2.58000000e+02, ...,
        5.24600058e+00, 6.56274547e+00, 0.00000000e+00]])

In [None]:
print(data_input.shape, data_input_new.shape)

(138047, 49) (138047, 11)


In [None]:
import numpy as np
features = data_input_new.shape[1]
importances = extratrees.feature_importances_
indices = np.argsort(importances)[::-1]

<h1>SELECTING THE MOST IMPORTANT FEATURES</h1>

In [None]:
for x in range(features):
    print("%d"%(x+1), data.columns[2+indices[x]], importances[indices[x]])

1 DllCharacteristics 0.15957226282291456
2 Machine 0.10294936670858645
3 Characteristics 0.10002110824166825
4 ResourcesMaxEntropy 0.07790531034994834
5 SectionsMaxEntropy 0.07650436744506255
6 MajorSubsystemVersion 0.0681976493822681
7 ImageBase 0.058449684791516215
8 Subsystem 0.05576108224155063
9 SizeOfOptionalHeader 0.051118315723415936
10 SizeOfStackReserve 0.0246548373016773
11 SectionsMeanEntropy 0.02130917062076368


<h1>RANDOM FOREST CLASSIFIER</h1>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
legitimate_train, legitimate_test, malware_train, malware_test = train_test_split(data_input_new, labels, test_size=0.2)

In [None]:
classifier = RandomForestClassifier(n_estimators=100)

In [None]:
classifier.fit(legitimate_train, malware_train)

RandomForestClassifier()

In [None]:
print("Algorithm's score : ", classifier.score(legitimate_test, malware_test)*100)

Algorithm's score :  99.41325606664252


In [None]:
from sklearn.metrics import confusion_matrix
result = classifier.predict(legitimate_test)
matrix = confusion_matrix(malware_test, result)

In [None]:
matrix

array([[19279,    95],
       [   67,  8169]])

In [None]:
print("False Positives : ", matrix[0][1]/sum(matrix[0])*100)
print("False Negatives : ", matrix[1][0]/sum(matrix[1])*100)

False Positives :  0.4903478889232993
False Negatives :  0.8135016998542983


In [None]:
import pickle

In [None]:
with open('model','wb') as file:
    pickle.dump(classifier, file)

In [None]:
import pickle
model_load = pickle.load(open('model','rb'))

In [None]:
import os
import pefile
import pandas as pd

def extract(path):
    pe = pefile.PE(path, fast_load=True)
    data = []
    entropy = list(map(lambda x:x.get_entropy(), pe.sections))
    SectionsMinEntropy = min(entropy)
    SectionsMaxEntropy = max(entropy)
    data.append(pe.OPTIONAL_HEADER.DllCharacteristics)
    data.append(pe.FILE_HEADER.Characteristics)
    data.append(pe.FILE_HEADER.Machine)
    data.append(pe.OPTIONAL_HEADER.ImageBase)
    data.append('6.9')
    data.append(SectionsMaxEntropy) 
    data.append(pe.OPTIONAL_HEADER.Subsystem)
    data.append(pe.FILE_HEADER.SizeOfOptionalHeader)
    data.append(pe.OPTIONAL_HEADER.MajorSubsystemVersion)
    data.append(SectionsMinEntropy)    
    data.append(pe.OPTIONAL_HEADER.MajorOperatingSystemVersion)
    data2 = []
    data2.append(data)
    return data2

attributes = extract('')
df = pd.DataFrame(attributes)
print(df.shape)

(1, 11)


In [None]:
df1 = pd.DataFrame(data_input_new[4])
df1 = np.transpose(df1)
#print(df1.shape)
prediction = model_load.predict(df1)
print(prediction)

[1]


In [None]:
x=np.argmax(prediction.round(), axis=0)
if(x==0):
    print("LEGITIMATE")
else:
    print("MALWARE")

LEGITIMATE


<h1>GRADIENT BOOST CLASSIFIER</h1>

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
grad_boost = GradientBoostingClassifier(n_estimators=50)
grad_boost.fit(legitimate_train, malware_train)

GradientBoostingClassifier(n_estimators=50)

In [None]:
print("Score of Gradient Boost Classifier : ", grad_boost.score(legitimate_test, malware_test)*100)

Score of Gradient Boost Classifier :  98.89532777978994


In [None]:
from sklearn.metrics import confusion_matrix
result = grad_boost.predict(legitimate_test)
matrix = confusion_matrix(malware_test, result)

In [None]:
print("False Positives : ", matrix[0][1]/sum(matrix[0])*100)
print("False Negatives : ", matrix[1][0]/sum(matrix[1])*100)

False Positives :  0.7948797357282957
False Negatives :  1.8334142787761047


In [None]:
import pickle
with open('Grad_Boost_model','wb') as file:
    pickle.dump(grad_boost, file)

In [None]:
import os
import pefile
import pandas as pd

def extract(path):
    pe = pefile.PE(path, fast_load=True)
    data = []
    entropy = list(map(lambda x:x.get_entropy(), pe.sections))
    SectionsMinEntropy = min(entropy)
    SectionsMaxEntropy = max(entropy)
    data.append(pe.OPTIONAL_HEADER.DllCharacteristics)
    data.append(pe.FILE_HEADER.Characteristics)
    data.append(pe.FILE_HEADER.Machine)
    data.append(pe.OPTIONAL_HEADER.ImageBase)
    data.append('6.9')
    data.append(SectionsMaxEntropy) 
    data.append(pe.OPTIONAL_HEADER.Subsystem)
    data.append(pe.FILE_HEADER.SizeOfOptionalHeader)
    data.append(pe.OPTIONAL_HEADER.MajorSubsystemVersion)
    data.append(SectionsMinEntropy)    
    data.append(pe.OPTIONAL_HEADER.MajorOperatingSystemVersion)
    data2 = []
    data2.append(data)
    return data2

attributes = extract('')
df = pd.DataFrame(attributes)
print(df.shape)

(1, 11)


In [None]:
df1 = pd.DataFrame(data_input_new[4])
df1 = np.transpose(df1)
#print(df1.shape)
prediction = grad_boost.predict(df1)
print(prediction)
x=np.argmax(prediction.round(), axis=0)
if(x==0):
    print("LEGITIMATE")
else:
    print("MALWARE")

[1]
LEGITIMATE


<h1>TESTING OTHER CLASSIFIERS</h1>

In [None]:
from sklearn import tree, linear_model
import sklearn.ensemble as ek
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
model = { "DecisionTree":tree.DecisionTreeClassifier(max_depth=10),
         "RandomForest":ek.RandomForestClassifier(n_estimators=50),
         "Adaboost":ek.AdaBoostClassifier(n_estimators=50),
         "GradientBoosting":ek.GradientBoostingClassifier(n_estimators=50),
         "GNB":GaussianNB(),
         "LinearRegression":LinearRegression()   
        }

In [None]:
for x in model:
    classifier = model[x]
    classifier.fit(legitimate_train, malware_train)
    score = classifier.score(legitimate_test, malware_test)
    print(x+ ' '+ str(score))

DecisionTree 0.9901847156827237
RandomForest 0.9939876856211518
Adaboost 0.9851865266207895
GradientBoosting 0.9889532777978993
GNB 0.7017385005432815
LinearRegression 0.5929045749430526
