In [1]:
#!pip install pefile
#!pip install openpyxl

In [2]:
# Developed with Python version 3.10.5
!python --version

Python 3.10.5


In [3]:
import pefile
from pprint import pprint
import os
import glob
import pandas as pd

# Functions

In [4]:
def extract_features(clean_files_path, class_value):
    data = []
    
    for path in clean_files_path:
        try:
            pe = pefile.PE(path)
            
            ## feature extractions
            features = []
            # 1. Sections
            features.extend(dir(pe))
            
            # 2. Imported dlls
            if pe.DIRECTORY_ENTRY_IMPORT:
                features.extend([entry.dll.decode('utf-8') for entry in pe.DIRECTORY_ENTRY_IMPORT])
                
            ## cleaning
            features = [entry.replace('.','_') for entry in features]
            
            ## stringify
            features_str = ' '.join(features)
            
            data.append({'File': path.split('\\')[-1], 'PeStructure': features_str, 'Class': class_value})

        except Exception as e:
            print("PEFormatError: %s" % e, path.split('\\')[-1])
            
    return data

# Extract features from the clean files

In [5]:
clean_path = os.path.join(os.getcwd(), '0')


In [6]:
clean_files = os.listdir(clean_path)
clean_files[:5]

['00ELuByj9iSRf5Rx11Ypl15N6kS2FXmW.dll',
 '00Tree.html',
 '01nCLd7AG7XAlI0JH9G2E3rFbuahjIaD.dll',
 '025bvHOe4jNeyFreKPOzrL42kZf9igIi.dll',
 '02sRriSJXuQGFgx2CpAWSWMlRZHUMemA.dll']

In [7]:
clean_files_path = []
for file in clean_files:
    if file.split('.')[-1] in ['exe', 'dll']:
        clean_files_path.append(os.path.join(clean_path, file))

clean_files_path[:5]

['C:\\Users\\User2\\Desktop\\NordSecurity\\0\\00ELuByj9iSRf5Rx11Ypl15N6kS2FXmW.dll',
 'C:\\Users\\User2\\Desktop\\NordSecurity\\0\\01nCLd7AG7XAlI0JH9G2E3rFbuahjIaD.dll',
 'C:\\Users\\User2\\Desktop\\NordSecurity\\0\\025bvHOe4jNeyFreKPOzrL42kZf9igIi.dll',
 'C:\\Users\\User2\\Desktop\\NordSecurity\\0\\02sRriSJXuQGFgx2CpAWSWMlRZHUMemA.dll',
 'C:\\Users\\User2\\Desktop\\NordSecurity\\0\\03qLFrBnvZyyCMls133h0I1n6chSuU0N.dll']

In [8]:
clean_extraction = extract_features(clean_files_path, 0)

PEFormatError: 'DOS Header magic not found.' 00ELuByj9iSRf5Rx11Ypl15N6kS2FXmW.dll
PEFormatError: 'DOS Header magic not found.' 03xFH66ZwZHBFU4BehtgYdPnLDmsVRbt.dll
PEFormatError: 'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT' 04OedY8viaNBFEDv26n08fU2GnfxxmFv.dll
PEFormatError: 'DOS Header magic not found.' 07wFclZXJNUWH2xD2wEEzqoS3y5IMZ4P.dll
PEFormatError: 'DOS Header magic not found.' 0A8phAkPI7mn6GvZaaqwozxu2cGo4ueg.exe
PEFormatError: 'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT' 0AE3mQJnCYWWOaRS16TJ5LAENzpavhUj.dll
PEFormatError: 'DOS Header magic not found.' 0B38xQpWO8UJl0V0hCCMzSxCbtjnrpfx.dll
PEFormatError: 'DOS Header magic not found.' 0cwj8FURlEx0Zq20GfxV2lKgpwXoQ6vr.dll
PEFormatError: 'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT' 0DtRUfsg5PzSVSE6o63QUwhf2Ikhi5yo.dll
PEFormatError: 'DOS Header magic not found.' 0H7c9CDig77XOAcrjXbuvjMB5fwDxp5H.dll
PEFormatError: 'DOS Header magic not found.' 0hC1XeWeZPOZvH4PCemgBScQgI7SYDDY.dll
PEFormatError: 'DOS Header

Note - many exceptions while reading structure of the executables. A separate investigatoin is needed.

In [9]:
len(clean_extraction)

770

In [10]:
clean_extraction[5:6]

[{'File': '0484nLtEv6qs9TXB0IAAIA0EgktHKeIX.dll',
  'Class': 0}]

In [11]:
clean_df = pd.DataFrame(clean_extraction)
clean_df[:2]

Unnamed: 0,File,PeStructure,Class
0,01nCLd7AG7XAlI0JH9G2E3rFbuahjIaD.dll,DIRECTORY_ENTRY_BASERELOC DIRECTORY_ENTRY_DEBU...,0
1,025bvHOe4jNeyFreKPOzrL42kZf9igIi.dll,DIRECTORY_ENTRY_BASERELOC DIRECTORY_ENTRY_DEBU...,0


# Extract features from the malicious files

In [12]:
malicious_path = os.path.join(os.getcwd(), '1')

In [13]:
malicious_files = os.listdir(malicious_path)
malicious_files[:5]

['00G8W57Cz6gRc0Xl41jSEMIFLR9iLDOx.exe',
 '00gYviABxsPRDDr8JrYKT6LUuWqpzHe0.exe',
 '00Nb1Q3mxXNb6fvAp3SrscnVWACdUwpM.exe',
 '00Tree.html',
 '017xEB9ZOAAG3VdYpTc0kBPzlh40cPab.exe']

In [14]:
malicious_files_path = []
for file in malicious_files:
    if file.split('.')[-1] in ['exe', 'dll']:
        malicious_files_path.append(os.path.join(malicious_path, file))

malicious_files_path[:5]

['C:\\Users\\User2\\Desktop\\NordSecurity\\1\\00G8W57Cz6gRc0Xl41jSEMIFLR9iLDOx.exe',
 'C:\\Users\\User2\\Desktop\\NordSecurity\\1\\00gYviABxsPRDDr8JrYKT6LUuWqpzHe0.exe',
 'C:\\Users\\User2\\Desktop\\NordSecurity\\1\\00Nb1Q3mxXNb6fvAp3SrscnVWACdUwpM.exe',
 'C:\\Users\\User2\\Desktop\\NordSecurity\\1\\017xEB9ZOAAG3VdYpTc0kBPzlh40cPab.exe',
 'C:\\Users\\User2\\Desktop\\NordSecurity\\1\\01KX3RaIsPgUolmbUpt1FCYGTUv8CkEW.exe']

In [15]:
malicious_extraction = extract_features(malicious_files_path, 1)

PEFormatError: 'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT' 0flxKcHpElilqWPICPbJlmZK8omspaNA.exe
PEFormatError: 'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT' 0Rc77H3hd0Xk7ICTjKLSjyHII12HP4RV.exe
PEFormatError: 'Invalid NT Headers signature.' 0SHJTuofoDcjnda4eAwx1TSRa2JJj0Ru.exe
PEFormatError: 'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT' 1MLGsReVndRFW5w2xarpKjKXC7RmioHm.exe
PEFormatError: 'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT' 1Pb8c9ITK0F1TTThFRKJQL8DsuK9SIec.exe
PEFormatError: 'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT' 1TWIIFKtei8psAzcr1Ikx1xnbuIoU7jT.exe
PEFormatError: 'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT' 1vy6RrtdIPEqSne7Czi5GuNBJ8vWkhCk.exe
PEFormatError: 'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT' 1X3iRzF2XQgKSsudGxrHxdrh1lE3FBk1.exe
PEFormatError: 'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT' 20o13UEllyKt1JAgKlQFiBioBVCqxfr9.exe
PEFormatError: 'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'

In [16]:
len(malicious_extraction)

980

In [17]:
malicious_extraction[:1]

[{'File': '00G8W57Cz6gRc0Xl41jSEMIFLR9iLDOx.exe',
  'Class': 1}]

In [18]:
malicious_df = pd.DataFrame(malicious_extraction)
malicious_df[:2]

Unnamed: 0,File,PeStructure,Class
0,00G8W57Cz6gRc0Xl41jSEMIFLR9iLDOx.exe,DIRECTORY_ENTRY_IMPORT DIRECTORY_ENTRY_RESOURC...,1
1,00gYviABxsPRDDr8JrYKT6LUuWqpzHe0.exe,DIRECTORY_ENTRY_BASERELOC DIRECTORY_ENTRY_IMPO...,1


# Join datasets

In [19]:
clean_and_malicious_df = pd.concat([clean_df, malicious_df])
len(clean_and_malicious_df)

1750

# Save to Excel

In [20]:
clean_and_malicious_df.to_excel('data.xlsx', index=False)