In [2]:
import yara
import os
import re

# Smoke test: compile the example rule file and run it against one sample.
# Paths are built with os.path.join so the cell works on any OS; the previous
# hard-coded backslashes only resolved on Windows (on POSIX a backslash is an
# ordinary filename character, not a separator).
rules = yara.compile(os.path.join('rules', 'sekurak_yara_example.yar'))
matches = rules.match(os.path.join('samples', 'sample_1.exe'))

# "Dopasowania" is Polish for "Matches"; the label is kept as-is because it is
# part of the recorded cell output.
print("Dopasowania:", matches)

Dopasowania: [ExampleRule]


Phishing mail data collected from: https://monkey.org/~jose/phishing/ (years 2023 and 2024)


Regular mail data collected from a selection of: https://spamassassin.apache.org/old/publiccorpus/ (easy and hard sets, without spam mails)

## Data preparation

In [3]:
def split_emails_to_files(input_dir: str, output_dir: str):
    """Split mailbox-style files into one text file per message.

    Every regular file directly inside ``input_dir`` is read as UTF-8
    (undecodable bytes are replaced) and cut in front of each line that
    starts with ``From <something>@<something> `` followed by at least 24
    more characters. Each non-empty, whitespace-stripped piece is written to
    ``output_dir`` as ``message_NNNNNN.txt``; the numbering runs continuously
    across all input files, which are processed in sorted name order.

    Args:
        input_dir: Directory holding the raw mailbox files.
        output_dir: Directory for the per-message files (created if missing).

    Returns:
        None. Progress and the final total are printed to stdout.
    """
    os.makedirs(output_dir, exist_ok=True)
    written = 0

    for entry in sorted(os.listdir(input_dir)):
        source_path = os.path.join(input_dir, entry)
        if not os.path.isfile(source_path):
            # Sub-directories and other non-file entries are ignored.
            continue

        try:
            with open(source_path, 'r', encoding='utf-8', errors='replace') as handle:
                raw_text = handle.read()
        except Exception as e:
            # An unreadable file is reported and skipped; the run continues.
            print(f"Error with file {entry}: {e}")
            continue

        # Zero-width split: the separator line stays at the top of its message.
        pieces = re.split(r'(?=^From \S+@\S+ .{24,})', raw_text, flags=re.MULTILINE)
        trimmed = (piece.strip() for piece in pieces)
        emails = [text for text in trimmed if text]

        for number, email_text in enumerate(emails, start=written + 1):
            target = os.path.join(output_dir, f"message_{number:06d}.txt")
            with open(target, 'w', encoding='utf-8') as sink:
                sink.write(email_text)
        written += len(emails)

        print(f"Saved {len(emails)} messages from file: {entry}")

    print(f"\n Saved {written} messages to directory: {output_dir}")


In [None]:
#split_emails_to_files('samples/raw_phishing', 'samples/phishing_mails')

Saved 419 messages from file: phishing-2023.mbox
Saved 403 messages from file: phishing-2024.mbox

 Saved 822 messages to directory: samples/phishing_mails


In [4]:
def convert_emails_to_txt(input_dir: str, output_dir: str):
    """Copy each mail file into ``output_dir`` under a uniform ``.txt`` name.

    Regular files directly inside ``input_dir`` whose names do not start with
    ``cmds`` are taken in sorted name order and rewritten as
    ``regular_message_NNN.txt`` (numbered from 1). Content is decoded as UTF-8
    with undecodable bytes silently dropped, then written back as UTF-8.

    Args:
        input_dir: Directory holding the raw mail files.
        output_dir: Destination directory (created if missing).

    Returns:
        None. A one-line summary is printed to stdout.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Collect candidate names first so the numbering follows the sorted order.
    selected = []
    for name in os.listdir(input_dir):
        if name.startswith("cmds"):
            continue
        if os.path.isfile(os.path.join(input_dir, name)):
            selected.append(name)
    selected.sort()

    for position, name in enumerate(selected, start=1):
        source = os.path.join(input_dir, name)
        destination = os.path.join(output_dir, f"regular_message_{position:03d}.txt")

        with open(source, 'r', encoding='utf-8', errors='ignore') as reader:
            with open(destination, 'w', encoding='utf-8') as writer:
                writer.write(reader.read())

    print(f" Converted {len(selected)} files from {input_dir} → {output_dir}")

In [None]:
#convert_emails_to_txt("samples/raw_regular", "samples/regular_mails")

 Converted 887 files from samples/raw_regular → samples/regular_mails


## Scanning files

In [5]:
def scan_with_yara(rules_path: str, target_dir: str, output_path: str):
    """Scan the files directly inside ``target_dir`` with a YARA rule file.

    The rules are compiled once, then every regular file in ``target_dir``
    (sorted by name, sub-directories are not descended into) is read as bytes
    and matched. One line per file is written to the report at
    ``output_path``: `` MATCH: <name> -> <matches>``, `` NO MATCH: <name>`` or
    ``ERROR <name>: <exception>``. Files that raise during read/match are
    reported but not counted as scanned.

    Args:
        rules_path: Path to the YARA rule source file.
        target_dir: Directory whose files are scanned.
        output_path: Path of the text report. Missing parent directories are
            created; the report itself is never scanned, even if it lives
            inside ``target_dir``.

    Returns:
        None. If the rules cannot be compiled an error is printed and nothing
        is written; otherwise a summary is printed to stdout.
    """
    try:
        rules = yara.compile(filepath=rules_path)
    except Exception as e:
        print(f"Error during loading rules: {e}")
        return

    # Make sure the report can be created on a fresh checkout (e.g. when the
    # 'results/' directory does not exist yet). dirname is '' for a bare
    # filename, in which case there is nothing to create.
    report_dir = os.path.dirname(output_path)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)

    matches_count = 0
    scanned_count = 0

    with open(output_path, 'w', encoding='utf-8') as report:
        for filename in sorted(os.listdir(target_dir)):
            filepath = os.path.join(target_dir, filename)
            if not os.path.isfile(filepath):
                continue

            # The report is already open (and therefore listed) at this point;
            # skip it so a half-written report is not scanned and counted.
            if os.path.samefile(filepath, output_path):
                continue

            try:
                with open(filepath, 'rb') as f:
                    data = f.read()

                matches = rules.match(data=data)

                if matches:
                    matches_count += 1
                    report.write(f" MATCH: {filename} -> {matches}\n")
                else:
                    report.write(f" NO MATCH: {filename}\n")

                scanned_count += 1
            except Exception as e:
                # Best-effort scan: record the failure and move on.
                report.write(f"ERROR {filename}: {e}\n")

    print(f"\nScaning finished. {scanned_count} files was scanned, {matches_count} matches was detected.")
    print(f"Results saved in: {output_path}")

In [6]:
# Run the example rule over the files sitting directly in samples/
# (sub-directories are skipped by scan_with_yara) and write a text report.
scan_with_yara(
    'rules/sekurak_yara_example.yar',   # rules_path
    'samples/',                         # target_dir
    'results/yara_scan_results.txt',    # output_path
)


Scaning finished. 1 files was scanned, 1 matches was detected.
Results saved in: results/yara_scan_results.txt


## Confusion matrix