In [2]:
# Cell 1: Install dependencies
!pip install pefile lief --quiet


In [3]:
# Cell 2: Import libraries
import pefile
import lief
import os
import math
import re
import string
from collections import Counter
from datetime import datetime


In [4]:
# Cell 3: Helper functions

def get_entropy(data):
    """Compute the Shannon entropy of a byte sequence, in bits per byte.

    Args:
        data: Bytes-like object to measure.

    Returns:
        0.0 for empty/falsy input; otherwise the base-2 entropy of the byte
        value distribution (between 0 and 8 for bytes input).
    """
    if not data:
        return 0.0
    total = len(data)
    # Relative frequency of every possible byte value, in ascending order.
    frequencies = (data.count(bytes((value,))) / total for value in range(256))
    result = 0
    for freq in frequencies:
        # Byte values that never occur contribute nothing (and log2(0) is undefined).
        if freq > 0:
            result -= freq * math.log2(freq)
    return result

def extract_strings(data, min_len=4):
    """Find runs of printable ASCII bytes (0x20-0x7E) in a byte buffer.

    Args:
        data: Bytes-like object to scan.
        min_len: Minimum run length for a match to be reported.

    Returns:
        List of non-overlapping matching runs as ``bytes``, in order of appearance.
    """
    run_regex = re.compile(rb'[\x20-\x7E]{%d,}' % min_len)
    return run_regex.findall(data)

def is_printable(s):
    """Tell whether every byte value in ``s`` maps to a character in ``string.printable``.

    Args:
        s: Iterable of integer code points (e.g. a ``bytes`` object).

    Returns:
        True when all values are printable (vacuously True for empty input),
        False as soon as one is not.
    """
    for code in s:
        if chr(code) not in string.printable:
            return False
    return True

def average_string_length(strings):
    """Return the mean length of the items in ``strings``.

    Args:
        strings: Sized collection of sized items (e.g. a list of ``bytes``).

    Returns:
        0 when the collection is empty/falsy, otherwise total length divided
        by the number of items (a float).
    """
    if strings:
        total_chars = 0
        for item in strings:
            total_chars += len(item)
        return total_chars / len(strings)
    return 0


In [5]:
# Cell 4: Main feature extractor

def _has_security_directory(pe):
    """Return True when the security (certificate table) data directory is populated.

    pefile does not expose a ``DIRECTORY_ENTRY_SECURITY`` attribute on the PE
    object, so the raw data-directory entry of the optional header is checked.
    """
    index = pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_SECURITY']
    directories = pe.OPTIONAL_HEADER.DATA_DIRECTORY
    # A header may declare fewer data directories than the index we need.
    if index >= len(directories):
        return False
    entry = directories[index]
    return entry.VirtualAddress != 0 and entry.Size != 0


def extract_pe_features(file_path):
    """Extract a flat dictionary of static features from a PE file.

    The file is parsed with pefile (header fields, directory flags, import and
    export counts) and lief (symbol count), and its raw bytes are used for the
    size, entropy, string and ``MZ`` features.

    Args:
        file_path: Path of the file to analyse.

    Returns:
        dict mapping feature name to a numeric value. Keys are inserted in a
        fixed order, so a DataFrame built from the dict has stable columns.

    Raises:
        Nothing is caught here: exceptions from ``pefile.PE``, ``lief.parse``
        or ``open`` propagate to the caller.
    """
    pe = pefile.PE(file_path)
    try:
        lief_pe = lief.parse(file_path)
        # Context manager so the handle is released even if reading fails.
        with open(file_path, 'rb') as handle:
            data = handle.read()
        strings = extract_strings(data)

        features = {}

        # Static metadata
        features['size'] = os.path.getsize(file_path)
        features['entropy'] = get_entropy(data)
        features['numstrings'] = len(strings)
        features['avlength'] = average_string_length(strings)
        # NOTE(review): extract_strings only returns runs of bytes 0x20-0x7E,
        # all of which are in string.printable, so this count equals
        # 'numstrings'. Kept unchanged so existing consumers of the feature
        # see the same values; confirm the intended definition.
        features['printables'] = sum(1 for s in strings if is_printable(s))

        # Flags
        features['has_debug'] = int(hasattr(pe, 'DIRECTORY_ENTRY_DEBUG'))
        # Checked through the data directory: an attribute test on the PE
        # object would always report 0 for the security directory.
        features['has_signature'] = int(_has_security_directory(pe))
        features['has_tls'] = int(hasattr(pe, 'DIRECTORY_ENTRY_TLS'))
        features['has_resources'] = int(hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'))
        features['has_relocations'] = int(hasattr(pe, 'DIRECTORY_ENTRY_BASERELOC'))

        # Counts
        features['exports_counts'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols) if hasattr(pe, 'DIRECTORY_ENTRY_EXPORT') else 0
        # One entry per import descriptor in DIRECTORY_ENTRY_IMPORT.
        features['imports_counts'] = len(pe.DIRECTORY_ENTRY_IMPORT) if hasattr(pe, 'DIRECTORY_ENTRY_IMPORT') else 0
        # lief.parse may yield a falsy result; treat that as "no symbols".
        features['symbols'] = len(lief_pe.symbols) if lief_pe and lief_pe.symbols else 0

        # COFF + optional header fields
        features['coff.timestamp'] = pe.FILE_HEADER.TimeDateStamp
        opt = pe.OPTIONAL_HEADER
        features['optional.major_image_version'] = opt.MajorImageVersion
        features['optional.minor_image_version'] = opt.MinorImageVersion
        features['optional.major_linker_version'] = opt.MajorLinkerVersion
        features['optional.minor_linker_version'] = opt.MinorLinkerVersion
        features['optional.major_operating_system_version'] = opt.MajorOperatingSystemVersion
        features['optional.minor_operating_system_version'] = opt.MinorOperatingSystemVersion
        features['optional.major_subsystem_version'] = opt.MajorSubsystemVersion
        features['optional.minor_subsystem_version'] = opt.MinorSubsystemVersion
        features['optional.sizeof_code'] = opt.SizeOfCode
        features['optional.sizeof_headers'] = opt.SizeOfHeaders
        features['optional.sizeof_heap_commit'] = opt.SizeOfHeapCommit

        features['MZ'] = int(data[:2] == b'MZ')  # check for MZ header
        features['vsize'] = opt.SizeOfImage
    finally:
        # Release the file mapping pefile holds for path-based parsing.
        pe.close()

    return features


In [6]:
# Sample to analyse. The path is relative, so it is resolved against the
# notebook's current working directory.
file_path = 'malware.exe'

In [7]:
# Run the extractor on the sample; the result is a flat dict of
# feature name -> numeric value.
features = extract_pe_features(file_path)

In [8]:
import pandas as pd

# Wrapping the dict in a list yields a single-row frame (one column per
# feature); transposing it shows each feature on its own line.
df = pd.DataFrame([features])
df.transpose()

Unnamed: 0,0
size,3514368.0
entropy,7.995471
numstrings,42255.0
avlength,4.683706
printables,42255.0
has_debug,0.0
has_signature,0.0
has_tls,0.0
has_resources,1.0
has_relocations,0.0


In [9]:
# Column-by-column summary (non-null count and dtype) of the feature frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 27 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   size                                     1 non-null      int64  
 1   entropy                                  1 non-null      float64
 2   numstrings                               1 non-null      int64  
 3   avlength                                 1 non-null      float64
 4   printables                               1 non-null      int64  
 5   has_debug                                1 non-null      int64  
 6   has_signature                            1 non-null      int64  
 7   has_tls                                  1 non-null      int64  
 8   has_resources                            1 non-null      int64  
 9   has_relocations                          1 non-null      int64  
 10  exports_counts                           1 non-null   