In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Step 0: Goal Definition

The goal of this project is to develop a machine learning model capable of classifying Windows Portable Executable (PE) files as malicious or benign, based on static features extracted from the files themselves.

To achieve this objective, we use a dataset of PE file characteristics extracted from a collection of Windows executable and DLL files. Each entry represents a unique file, described by various attributes extracted from its PE header and structure. The dataset includes both benign software samples and known malware samples.

Dataset available at the following link:
https://www.kaggle.com/datasets/amdj3dax/ransomware-detection-data-set


### Column descriptions obtained from the source

In [26]:
# Human-readable description of every column, as published by the dataset source.
# The (name, description) pairs are listed in the same order as the dataset columns
# and are expanded into one record per column.
column_descriptions = [
    {"Column Name": name, "Description": text}
    for name, text in (
        ("FileName", "Name or identifier of the PE file"),
        ("md5Hash", "MD5 hash of the file for unique identification"),
        ("Machine", "Target machine architecture identifier"),
        ("DebugSize", "Size of debug information"),
        ("DebugRVA", "Relative Virtual Address of debug information"),
        ("MajorImageVersion", "Major version number of the image"),
        ("MajorOSVersion", "Major version number of required operating system"),
        ("ExportRVA", "Relative Virtual Address of export table"),
        ("ExportSize", "Size of export table"),
        ("IatVRA", "Relative Virtual Address of Import Address Table"),
        ("MajorLinkerVersion", "Major version number of linker"),
        ("MinorLinkerVersion", "Minor version number of linker"),
        ("NumberOfSections", "Number of sections in the PE file"),
        ("SizeOfStackReserve", "Size of stack to reserve"),
        ("DllCharacteristics", "DLL characteristics flags"),
        ("ResourceSize", "Size of resource section"),
        ("BitcoinAddresses", "Number of potential Bitcoin addresses found"),
        ("Benign", "Binary label (1 for benign, 0 for malicious)"),
    )
]

# Two-column lookup table ("Column Name", "Description") rendered as the cell output.
df_description = pd.DataFrame(column_descriptions)

df_description


Unnamed: 0,Column Name,Description
0,FileName,Name or identifier of the PE file
1,md5Hash,MD5 hash of the file for unique identification
2,Machine,Target machine architecture identifier
3,DebugSize,Size of debug information
4,DebugRVA,Relative Virtual Address of debug information
5,MajorImageVersion,Major version number of the image
6,MajorOSVersion,Major version number of required operating system
7,ExportRVA,Relative Virtual Address of export table
8,ExportSize,Size of export table
9,IatVRA,Relative Virtual Address of Import Address Table


### Target variable
According to the dataset source, the 'Benign' column can be used as a label for supervised learning tasks. In fact, it is our target variable, and it can assume two possible values:
- 1 for benign
- 0 for malicious

## Step 1: Data Acquisition

In [None]:
# Read the CSV once and keep the untouched result as `df_raw`;
# `df` is an independent working copy, so later changes to it leave `df_raw` intact.
df_raw = pd.read_csv('./dataset/data_file.csv', sep=',')
df = df_raw.copy()
df

Unnamed: 0,FileName,md5Hash,Machine,DebugSize,DebugRVA,MajorImageVersion,MajorOSVersion,ExportRVA,ExportSize,IatVRA,MajorLinkerVersion,MinorLinkerVersion,NumberOfSections,SizeOfStackReserve,DllCharacteristics,ResourceSize,BitcoinAddresses,Benign
0,0124e21d-018c-4ce0-92a3-b9e205a76bc0.dll,79755c51e413ed3c6be4635fd729a6e1,332,0,0,0,4,0,0,8192,8,0,3,1048576,34112,672,0,1
1,05c8318f98a5d301d80000009c316005.vertdll.dll,95e19f3657d34a432eada93221b0ea16,34404,84,121728,10,10,126576,4930,0,14,10,8,262144,16864,1024,0,1
2,06054fba-5619-4a86-a861-ffb0464bef5d.dll,85c32641d77a54e19ba8ea4ab305c791,332,0,0,0,4,0,0,8192,8,0,3,1048576,34112,672,0,1
3,075822ac99a5d301660400009c316005.adhapi.dll,62e3b959d982ef534b66f819fe15f085,34404,84,19904,10,10,21312,252,18160,14,10,6,262144,16736,1040,0,1
4,090607dd9ba5d301ca0900009c316005.SensorsNative...,ae38c5f7d313ad0ff3bfb8826476767f,34404,84,97728,10,10,105792,1852,70592,14,10,7,262144,16736,1096,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62480,VirusShare_a43ceb5e5fffc793e0205d15a0606cb0,a43ceb5e5fffc793e0205d15a0606cb0,332,0,0,1,4,0,0,4096,6,0,3,1048576,0,23504,0,0
62481,VirusShare_0190dafc8304111a00fccf57340ea6a4,0190dafc8304111a00fccf57340ea6a4,332,0,0,7,10,0,0,0,7,0,7,1048576,0,15704,0,0
62482,VirusShare_0f3ca55979aaf59158d6b01140696e44,0f3ca55979aaf59158d6b01140696e44,332,0,0,0,4,0,0,404908,2,50,11,1048576,0,2364,0,0
62483,VirusShare_fca5ce35f1690db6babca5aa5d559535,fca5ce35f1690db6babca5aa5d559535,332,0,0,0,4,14448,70,4096,8,0,4,1048576,0,130296,0,0


## Step 2: Data Exploration

In [None]:
df.head()   # preview the first five rows (the default for head()) of the dataset

Unnamed: 0,FileName,md5Hash,Machine,DebugSize,DebugRVA,MajorImageVersion,MajorOSVersion,ExportRVA,ExportSize,IatVRA,MajorLinkerVersion,MinorLinkerVersion,NumberOfSections,SizeOfStackReserve,DllCharacteristics,ResourceSize,BitcoinAddresses,Benign
0,0124e21d-018c-4ce0-92a3-b9e205a76bc0.dll,79755c51e413ed3c6be4635fd729a6e1,332,0,0,0,4,0,0,8192,8,0,3,1048576,34112,672,0,1
1,05c8318f98a5d301d80000009c316005.vertdll.dll,95e19f3657d34a432eada93221b0ea16,34404,84,121728,10,10,126576,4930,0,14,10,8,262144,16864,1024,0,1
2,06054fba-5619-4a86-a861-ffb0464bef5d.dll,85c32641d77a54e19ba8ea4ab305c791,332,0,0,0,4,0,0,8192,8,0,3,1048576,34112,672,0,1
3,075822ac99a5d301660400009c316005.adhapi.dll,62e3b959d982ef534b66f819fe15f085,34404,84,19904,10,10,21312,252,18160,14,10,6,262144,16736,1040,0,1
4,090607dd9ba5d301ca0900009c316005.SensorsNative...,ae38c5f7d313ad0ff3bfb8826476767f,34404,84,97728,10,10,105792,1852,70592,14,10,7,262144,16736,1096,0,1


In [None]:
df.shape    # tuple of (number of rows, number of columns)

(62485, 18)

In [None]:
df.info()   # per-column non-null counts and dtypes, plus the index range

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62485 entries, 0 to 62484
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   FileName            62485 non-null  object
 1   md5Hash             62485 non-null  object
 2   Machine             62485 non-null  int64 
 3   DebugSize           62485 non-null  int64 
 4   DebugRVA            62485 non-null  int64 
 5   MajorImageVersion   62485 non-null  int64 
 6   MajorOSVersion      62485 non-null  int64 
 7   ExportRVA           62485 non-null  int64 
 8   ExportSize          62485 non-null  int64 
 9   IatVRA              62485 non-null  int64 
 10  MajorLinkerVersion  62485 non-null  int64 
 11  MinorLinkerVersion  62485 non-null  int64 
 12  NumberOfSections    62485 non-null  int64 
 13  SizeOfStackReserve  62485 non-null  int64 
 14  DllCharacteristics  62485 non-null  int64 
 15  ResourceSize        62485 non-null  int64 
 16  BitcoinAddresses    62

In [14]:
df.isna().sum()     # number of missing (NaN) values in each column

FileName              0
md5Hash               0
Machine               0
DebugSize             0
DebugRVA              0
MajorImageVersion     0
MajorOSVersion        0
ExportRVA             0
ExportSize            0
IatVRA                0
MajorLinkerVersion    0
MinorLinkerVersion    0
NumberOfSections      0
SizeOfStackReserve    0
DllCharacteristics    0
ResourceSize          0
BitcoinAddresses      0
Benign                0
dtype: int64

There are no missing values in any column, so no handling of missing data is required.

In [28]:
df.describe()   # summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

Unnamed: 0,Machine,DebugSize,DebugRVA,MajorImageVersion,MajorOSVersion,ExportRVA,ExportSize,IatVRA,MajorLinkerVersion,MinorLinkerVersion,NumberOfSections,SizeOfStackReserve,DllCharacteristics,ResourceSize,BitcoinAddresses,Benign
count,62485.0,62485.0,62485.0,62485.0,62485.0,62485.0,62485.0,62485.0,62485.0,62485.0,62485.0,62485.0,62485.0,62485.0,62485.0,62485.0
mean,6756.472657,25870.48,154161.1,58.785997,5.417524,895318.6,409462.3,146631.1,9.782604,6.613171,4.751172,875983.0,12966.0314,184466.4,0.018004,0.433992
std,13345.499919,6461396.0,1903142.0,1114.068244,2.543697,37795270.0,28518200.0,1124630.0,9.047157,16.976465,2.138584,628818.9,15762.725511,17326250.0,0.132968,0.495628
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,332.0,0.0,0.0,0.0,4.0,0.0,0.0,4096.0,6.0,0.0,3.0,262144.0,0.0,1080.0,0.0,0.0
50%,332.0,0.0,0.0,0.0,5.0,0.0,0.0,8520.0,9.0,0.0,5.0,1048576.0,320.0,2496.0,0.0,0.0
75%,332.0,28.0,12832.0,6.0,6.0,28752.0,104.0,65536.0,11.0,10.0,6.0,1048576.0,32768.0,23504.0,0.0,1.0
max,43620.0,1615155000.0,285212700.0,63325.0,260.0,2147484000.0,2415919000.0,66154500.0,255.0,255.0,33.0,16777220.0,58632.0,4294942000.0,1.0,1.0


In [None]:
df.describe(include="object")   # count / unique / top / freq for the string (object) columns

Unnamed: 0,FileName,md5Hash
count,62485,62485
unique,62485,62485
top,0124e21d-018c-4ce0-92a3-b9e205a76bc0.dll,79755c51e413ed3c6be4635fd729a6e1
freq,1,1


In [None]:
# Every column of the dataset, in the same order in which it appears in the data.
header_names = [
    # file identifiers
    'FileName',
    'md5Hash',
    # extracted features
    'Machine',
    'DebugSize',
    'DebugRVA',
    'MajorImageVersion',
    'MajorOSVersion',
    'ExportRVA',
    'ExportSize',
    'IatVRA',
    'MajorLinkerVersion',
    'MinorLinkerVersion',
    'NumberOfSections',
    'SizeOfStackReserve',
    'DllCharacteristics',
    'ResourceSize',
    'BitcoinAddresses',
    # target label
    'Benign',
]

# Column groups by feature type; only the binary group is filled in here,
# the other two start out empty.
numeric_cols = []
nominal_cols = []
binary_cols = ['Benign']

'Benign'