In [None]:
import os
import pefile
import math
import csv
import hashlib
from collections import Counter

In [None]:
# ====== Utility ======
def get_entropy(data):
    """Return the Shannon entropy of ``data`` in bits per symbol.

    Args:
        data: A sized collection of hashable symbols (for example ``bytes``).

    Returns:
        ``0.0`` when ``data`` is empty/falsy, otherwise the entropy computed
        from the relative frequency of each distinct symbol.
    """
    if not data:
        return 0.0

    frequencies = Counter(data)
    total = len(data)

    # Accumulate term by term (rather than with sum()) so the floating-point
    # result does not depend on the interpreter's summation algorithm.
    bits = 0.0
    for occurrences in frequencies.values():
        probability = occurrences / total
        bits -= probability * math.log2(probability)
    return bits

def get_md5(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 8192-byte blocks, so
    the whole file is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        # iter() keeps calling read() until it returns the empty-bytes sentinel.
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()

In [None]:
# ====== Feature Extractor ======
def extract_features(file_path, label):
    """Extract a flat dictionary of static features from one PE file.

    The file is parsed with ``pefile`` and its raw bytes are read once; the
    same bytes feed the size, entropy, MD5 and string-count features.

    Args:
        file_path: Path of the PE file to analyse.
        label: Value stored unchanged under the ``"Label"`` key.

    Returns:
        A dict mapping feature names to values (key order is fixed, which
        matters to callers that derive CSV columns from it), or ``None`` when
        the file cannot be read or parsed. In the failure case the error is
        printed instead of raised.
    """
    pe = None
    try:
        pe = pefile.PE(file_path)
        features = {}

        # --- File-level ---
        with open(file_path, "rb") as f:
            data = f.read()
        features["FileSize"] = len(data)
        features["FileEntropy"] = round(get_entropy(data), 3)
        # Hash the bytes already in memory instead of re-reading the file.
        features["MD5"] = hashlib.md5(data).hexdigest()

        # --- Header ---
        file_header = pe.FILE_HEADER
        optional_header = pe.OPTIONAL_HEADER
        features["TimeDateStamp"] = file_header.TimeDateStamp
        features["Machine"] = file_header.Machine
        features["Characteristics"] = file_header.Characteristics
        features["NumberOfSections"] = file_header.NumberOfSections
        features["AddressOfEntryPoint"] = optional_header.AddressOfEntryPoint
        features["SizeOfImage"] = optional_header.SizeOfImage
        features["SizeOfHeaders"] = optional_header.SizeOfHeaders
        features["Subsystem"] = optional_header.Subsystem
        features["DllCharacteristics"] = optional_header.DllCharacteristics
        # NOTE(review): "major.minor" parsed as a float cannot tell minor
        # version 1 from 10 (14.1 == 14.10). Kept as-is so values stay
        # comparable with previously generated datasets.
        features["LinkerVersion"] = float(
            f"{optional_header.MajorLinkerVersion}.{optional_header.MinorLinkerVersion}"
        )
        features["Checksum"] = optional_header.CheckSum

        # --- TLS ---
        # True when pefile exposes a TLS directory; individual callbacks are
        # not inspected.
        features["TLS_Callbacks"] = hasattr(pe, "DIRECTORY_ENTRY_TLS")

        # --- Sections ---
        section_entropy = []
        section_ratio = []
        for section in pe.sections:
            vsize = section.Misc_VirtualSize
            raw_size = section.SizeOfRawData
            section_entropy.append(get_entropy(section.get_data()))
            # Sections without raw data are skipped to avoid dividing by zero.
            if raw_size != 0:
                section_ratio.append(vsize / raw_size)
        features["MeanSectionEntropy"] = round(sum(section_entropy) / len(section_entropy), 3) if section_entropy else 0
        features["MeanSectionRatio"] = round(sum(section_ratio) / len(section_ratio), 3) if section_ratio else 0

        # --- Imports ---
        imports_count = 0
        suspicious_apis = ["VirtualAlloc", "WriteProcessMemory", "CreateRemoteThread", "WinExec", "ShellExecute"]
        # Lower-case the needles once instead of once per imported symbol.
        suspicious_lower = [api.lower() for api in suspicious_apis]
        suspicious_found = 0
        for entry in getattr(pe, "DIRECTORY_ENTRY_IMPORT", []):
            for imp in entry.imports:
                imports_count += 1
                # Imports by ordinal have no name and are only counted.
                if imp.name:
                    imported = imp.name.decode(errors="ignore").lower()
                    # Substring match, so e.g. "VirtualAllocEx" also counts.
                    if any(api in imported for api in suspicious_lower):
                        suspicious_found += 1
        features["ImportCount"] = imports_count
        features["SuspiciousAPIs"] = suspicious_found

        # --- Exports ---
        exports_count = 0
        if hasattr(pe, "DIRECTORY_ENTRY_EXPORT"):
            exports_count = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
        features["ExportCount"] = exports_count

        # --- Resources ---
        res_count = 0
        res_size = 0
        if hasattr(pe, "DIRECTORY_ENTRY_RESOURCE"):
            for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
                if hasattr(resource_type, "directory"):
                    for entry in resource_type.directory.entries:
                        if hasattr(entry, "directory"):
                            for subentry in entry.directory.entries:
                                # Only leaf entries carry a data struct; an
                                # entry without one must not abort the whole
                                # sample with an AttributeError.
                                if hasattr(subentry, "data"):
                                    res_size += subentry.data.struct.Size
                                    res_count += 1
        features["ResourceCount"] = res_count
        features["ResourceSize"] = res_size

        # --- Strings (basic count only) ---
        # NUL-separated byte runs longer than 4 bytes; only the count is kept,
        # so the runs are not decoded.
        features["StringsCount"] = sum(1 for s in data.split(b"\x00") if len(s) > 4)

        # --- Label ---
        features["Label"] = label

        return features

    except Exception as e:
        print(f"[!] Error parsing {file_path}: {e}")
        return None

    finally:
        # Release the resources pefile holds for the parsed file; without this
        # every scanned sample stays open for the lifetime of the kernel.
        if pe is not None:
            pe.close()

In [None]:
# ====== Dataset Scanner ======
def scan_dataset(root_dir, output_csv):
    """Walk ``root_dir``, extract features from every PE file and save a CSV.

    Files whose name ends in ``.exe`` or ``.dll`` (case-insensitive) are
    passed to ``extract_features``; samples for which it returns a falsy
    value are skipped.

    Labelling: a sample is ``"Benign"`` when the text ``Benign`` occurs in
    its directory path, counted from ``root_dir``'s own name downwards;
    otherwise it is ``"Malware"``. Directories above ``root_dir`` are ignored
    so that the location of the dataset on disk cannot change the labels.

    Args:
        root_dir: Root directory of the dataset.
        output_csv: Path of the CSV file to write. The file is only written
            when at least one sample was extracted.
    """
    # Paths are made relative to the parent of root_dir so that ancestors
    # (e.g. "/home/Benign_lab/Dataset/Malware") never leak into the label.
    label_base = os.path.dirname(os.path.abspath(root_dir))

    rows = []
    for root, dirs, files in os.walk(root_dir):
        # Sorting in place makes os.walk descend in a deterministic order, so
        # repeated runs produce the rows in the same order.
        dirs.sort()

        # Label: based on the directory name
        label_path = os.path.relpath(root, label_base)
        label = "Benign" if "Benign" in label_path else "Malware"

        for file in sorted(files):
            if file.lower().endswith((".exe", ".dll")):
                file_path = os.path.join(root, file)

                feats = extract_features(file_path, label)
                if feats:
                    rows.append(feats)

    # Save to CSV
    if rows:
        with open(output_csv, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=rows[0].keys())
            writer.writeheader()
            writer.writerows(rows)
        print(f"[+] Done! Extracted {len(rows)} samples -> {output_csv}")
    else:
        # Nothing was written, so do not claim that the CSV exists.
        print(f"[!] No samples extracted from {root_dir}; {output_csv} was not written")

In [None]:
# Run the extraction: scan the .exe/.dll files under dataset_dir and write the
# collected feature rows to output_csv.
dataset_dir = "Dataset"   # root directory of the dataset
output_csv = "pe_features_dataset.csv"
scan_dataset(dataset_dir, output_csv)
