In [3]:
import pandas as pd

In [4]:
import sklearn
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Loading Dataset

In [5]:
MalwareDataset = pd.read_csv('MalwareData.csv.gz', compression='gzip', sep='|')

In [9]:
print(MalwareDataset.info())
print(MalwareDataset.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138047 entries, 0 to 138046
Data columns (total 57 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Name                         138047 non-null  object 
 1   md5                          138047 non-null  object 
 2   Machine                      138047 non-null  int64  
 3   SizeOfOptionalHeader         138047 non-null  int64  
 4   Characteristics              138047 non-null  int64  
 5   MajorLinkerVersion           138047 non-null  int64  
 6   MinorLinkerVersion           138047 non-null  int64  
 7   SizeOfCode                   138047 non-null  int64  
 8   SizeOfInitializedData        138047 non-null  int64  
 9   SizeOfUninitializedData      138047 non-null  int64  
 10  AddressOfEntryPoint          138047 non-null  int64  
 11  BaseOfCode                   138047 non-null  int64  
 12  BaseOfData                   138047 non-null  int64  
 13 

# Separating legitimate and malicious samples

In [6]:
Legit = MalwareDataset[:41323].drop(['legitimate'], axis=1)
Malware = MalwareDataset[41323:].drop(['legitimate'], axis=1)

In [7]:
print('Number of legitimate samples:', Legit.shape[0])
print('Number of malicious samples:', Malware.shape[0])

Number of legitimate samples: 41323
Number of malicious samples: 96724


# combining the data for training

In [10]:
Data = MalwareDataset.drop(['Name', 'md5', 'legitimate'], axis=1).values
Target = MalwareDataset['legitimate'].values

# Performing feature selection

In [14]:
FeatSelect = ExtraTreesClassifier().fit(Data, Target)
Model = SelectFromModel(FeatSelect, prefit=True)
Data_new = Model.transform(Data)

# Splitting data into training and testing sets

In [16]:
Data_Train, Data_Test, Target_Train, Target_Test = train_test_split(
    Data_new, Target, test_size=0.2, shuffle=True, stratify=Target
)

# training Random Forest classifier

In [18]:
clf = RandomForestClassifier(n_estimators=50)
clf.fit(Data_Train, Target_Train)

# Evaluate model

In [19]:
score = clf.score(Data_Test, Target_Test)
print("The score of Random Forest Algorithm is: %.2f%%" % (score * 100))

The score of Random Forest Algorithm is: 99.36%


# Make predictions and compute confusion matrix

In [20]:
Result = clf.predict(Data_Test)
CM = confusion_matrix(Target_Test, Result)

 Assume '0' is legitimate and '1' is malicious

FPR = Legitimate samples misclassified as malicious

FNR = Malicious samples misclassified as legitimate

In [22]:
false_positive_rate = (CM[0][1] / float(sum(CM[0]))) * 100
false_negative_rate = (CM[1][0] / float(sum(CM[1]))) * 100

In [23]:
print("False positive rate: %.2f%%" % false_positive_rate)
print("False negative rate: %.2f%%" % false_negative_rate)

False positive rate: 0.54%
False negative rate: 0.88%
