In [58]:
import yara
import os
import re

# Smoke test: compile the example rule file and run it against one sample.
# Forward slashes are used because backslash-separated paths only resolve on
# Windows, while '/' works on every platform and matches the later scan cells.
rules = yara.compile(filepath='rules/sekurak_yara_example.yar')
matches = rules.match('samples/sample_1.exe')

print("Dopasowania:", matches)

Dopasowania: [ExampleRule]


Phishing mail data collected from: https://monkey.org/~jose/phishing/ (years 2023 and 2024)


Regular mail data collected from selected archives at: https://spamassassin.apache.org/old/publiccorpus/ (the easy and hard sets, without spam mails)

## Data preparation

In [59]:
def split_emails_to_files(input_dir: str, output_dir: str):
    """Split mailbox files into one text file per message.

    Every regular file in ``input_dir`` (visited in sorted name order) is read
    as UTF-8 with undecodable bytes replaced, then cut in front of each line
    that looks like a ``From <address> ...`` separator. Non-empty pieces are
    stripped and written to ``output_dir`` as ``message_NNNNNN.txt``; the
    numbering is shared across all input files.

    Args:
        input_dir: Directory holding the mailbox files.
        output_dir: Destination directory; created if it does not exist.

    Returns:
        None. Per-file progress and the grand total are printed.
    """
    os.makedirs(output_dir, exist_ok=True)
    # Zero-width lookahead: the separator line stays at the head of its message.
    separator = re.compile(r'(?=^From \S+@\S+ .{24,})', flags=re.MULTILINE)
    saved_total = 0

    for filename in sorted(os.listdir(input_dir)):
        mailbox_path = os.path.join(input_dir, filename)
        if os.path.isfile(mailbox_path):
            try:
                with open(mailbox_path, 'r', encoding='utf-8', errors='replace') as mailbox:
                    raw_text = mailbox.read()
            except Exception as e:
                # An unreadable mailbox is reported and skipped, not fatal.
                print(f"Error with file {filename}: {e}")
                continue

            stripped_pieces = (piece.strip() for piece in separator.split(raw_text))
            messages = [piece for piece in stripped_pieces if piece]

            for body in messages:
                saved_total += 1
                message_path = os.path.join(output_dir, f"message_{saved_total:06d}.txt")
                with open(message_path, 'w', encoding='utf-8') as out:
                    out.write(body)

            print(f"Saved {len(messages)} messages from file: {filename}")

    print(f"\n Saved {saved_total} messages to directory: {output_dir}")


In [60]:
# Split each mailbox file in samples/raw_phishing into one .txt file per message.
split_emails_to_files('samples/raw_phishing', 'samples/phishing_mails')

Saved 419 messages from file: phishing-2023.mbox
Saved 403 messages from file: phishing-2024.mbox

 Saved 822 messages to directory: samples/phishing_mails


In [61]:
def convert_emails_to_txt(input_dir: str, output_dir: str):
    """Re-save each mail file as a sequentially numbered UTF-8 text file.

    Regular files in ``input_dir`` whose names do not start with ``cmds`` are
    processed in sorted name order. Each is decoded as UTF-8 (undecodable
    bytes are dropped) and written to ``output_dir`` as
    ``regular_message_NNN.txt``.

    Args:
        input_dir: Directory holding the source mail files.
        output_dir: Destination directory; created if it does not exist.

    Returns:
        None. A one-line summary is printed.
    """
    os.makedirs(output_dir, exist_ok=True)

    def _is_mail_file(name):
        # Skip names starting with "cmds" and anything that is not a regular file.
        if name.startswith("cmds"):
            return False
        return os.path.isfile(os.path.join(input_dir, name))

    mail_names = sorted(filter(_is_mail_file, os.listdir(input_dir)))

    for number, name in enumerate(mail_names, start=1):
        source_path = os.path.join(input_dir, name)
        target_path = os.path.join(output_dir, f"regular_message_{number:03d}.txt")

        with open(source_path, 'r', encoding='utf-8', errors='ignore') as source, \
             open(target_path, 'w', encoding='utf-8') as target:
            target.write(source.read())

    print(f" Converted {len(mail_names)} files from {input_dir} → {output_dir}")

In [62]:
# Re-save every regular mail from samples/raw_regular as a numbered UTF-8 .txt file.
convert_emails_to_txt("samples/raw_regular", "samples/regular_mails")

 Converted 887 files from samples/raw_regular → samples/regular_mails


## Scanning files

In [63]:
def scan_with_yara(rules_path: str, target_dir: str, output_path: str):
    """Scan every file under ``target_dir`` with the rules in ``rules_path``.

    One line per file is written to ``output_path``:

    * `` MATCH: <relative path> -> <matches>`` when at least one rule matched,
    * `` NO MATCH: <relative path>`` when none did,
    * ``ERROR <relative path>: <exception>`` when the file could not be
      read or scanned.

    Subdirectories named ``raw_phishing`` and ``raw_regular`` are not
    descended into.

    Args:
        rules_path: Path of the YARA rule file to compile.
        target_dir: Root directory to walk; paths in the report are relative
            to it.
        output_path: Report file to (over)write. Its parent directory is
            created if it does not exist.

    Returns:
        None. A summary is printed. If the rules cannot be compiled, the
        error is printed and nothing is scanned or written.
    """
    try:
        rules = yara.compile(filepath=rules_path)
    except Exception as e:
        print(f"Error during loading rules: {e}")
        return

    matches_count = 0
    scanned_count = 0

    excluded_dirs = {"raw_phishing", "raw_regular"}

    # Create the report directory up front so a fresh checkout (no results/
    # folder yet) does not fail when the report is opened.
    report_dir = os.path.dirname(output_path)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as report:
        for root, dirs, files in os.walk(target_dir):
            # Assigning to dirs[:] prunes the walk in place; sorting makes the
            # traversal (and therefore the report order) deterministic.
            dirs[:] = sorted(d for d in dirs if d not in excluded_dirs)

            for filename in sorted(files):
                filepath = os.path.join(root, filename)
                rel_path = os.path.relpath(filepath, target_dir)

                try:
                    with open(filepath, 'rb') as f:
                        data = f.read()

                    matches = rules.match(data=data)

                    if matches:
                        matches_count += 1
                        report.write(f" MATCH: {rel_path} -> {matches}\n")
                    else:
                        report.write(f" NO MATCH: {rel_path}\n")

                    scanned_count += 1
                except Exception as e:
                    # A failing file is logged and does not count as scanned.
                    report.write(f"ERROR {rel_path}: {e}\n")

    print(f"\nScanning finished. {scanned_count} files were scanned, {matches_count} matches were detected.")
    print(f"Results saved in: {output_path}")

In [64]:
def analyze_yara_results_from_file(result_file_path: str):
    """Print per-category match counts from a scan report.

    The report is expected to contain lines of the form
    ``MATCH: <path> -> <matches>`` and ``NO MATCH: <path>`` (leading
    whitespace is ignored). Each file is assigned to a category by its path
    only: paths containing ``phishing_mails``, paths containing
    ``regular_mails``, or everything else.

    Blank lines and lines that are neither ``MATCH`` nor ``NO MATCH`` (for
    example ``ERROR ...`` entries) are not counted as scanned files; when any
    non-blank lines are skipped, their number is printed separately.

    Args:
        result_file_path: Path of the report file to analyse.

    Returns:
        None. The summary is printed; any exception raised while reading or
        analysing is caught and printed as well.
    """
    # category -> [matched files, scanned files]
    stats = {"phishing": [0, 0], "regular": [0, 0], "other": [0, 0]}
    skipped_lines = 0

    try:
        with open(result_file_path, 'r', encoding='utf-8') as file:
            for line in file:
                entry = line.strip()

                if entry.startswith("NO MATCH"):
                    is_match = False
                    path = entry[len("NO MATCH"):]
                elif entry.startswith("MATCH"):
                    is_match = True
                    path = entry[len("MATCH"):]
                    # Cut off the trailing " -> <matches>" so that rule names
                    # cannot influence the category lookup below.
                    head, arrow, _ = path.rpartition(" -> ")
                    if arrow:
                        path = head
                else:
                    # Not a scan result (blank or ERROR line): never a file total.
                    if entry:
                        skipped_lines += 1
                    continue

                if "phishing_mails" in path:
                    bucket = stats["phishing"]
                elif "regular_mails" in path:
                    bucket = stats["regular"]
                else:
                    bucket = stats["other"]

                bucket[1] += 1
                if is_match:
                    bucket[0] += 1

        phishing_matches, phishing_total = stats["phishing"]
        regular_matches, regular_total = stats["regular"]
        other_matches, other_total = stats["other"]

        total_matches = phishing_matches + regular_matches + other_matches
        total_files = phishing_total + regular_total + other_total

        print("Analiza wyników z pliku:", result_file_path)
        print(f"Phishing mails: {phishing_matches} / {phishing_total} dopasowań")
        print(f"Regular mails:  {regular_matches} / {regular_total} dopasowań")
        print(f"Inne pliki:      {other_matches} / {other_total} dopasowań")
        if skipped_lines:
            print(f"Pominięte wiersze (błędy skanowania): {skipped_lines}")
        print(f"SUMA:            {total_matches} / {total_files} plików dopasowanych\n")

    except Exception as e:
        print(f"Błąd podczas analizy pliku wynikowego: {e}")

In [65]:
# Baseline run with the example rule file. target_dir is the whole samples/
# tree; scan_with_yara skips the raw_phishing and raw_regular subdirectories.
scan_with_yara(
    rules_path='rules/sekurak_yara_example.yar',
    target_dir='samples/',
    output_path='results/yara_scan_results.txt'
)


Scanning finished. 1710 files were scanned, 146 matches were detected.
Results saved in: results/yara_scan_results.txt


## Confusion matrix

In [66]:
# Tally matches per category (phishing_mails / regular_mails / everything else).
analyze_yara_results_from_file('results/yara_scan_results.txt')

Analiza wyników z pliku: results/yara_scan_results.txt
Phishing mails: 118 / 822 dopasowań
Regular mails:  27 / 887 dopasowań
Inne pliki:      1 / 1 dopasowań
SUMA:            146 / 1710 plików dopasowanych



In [67]:
# Repeat the scan of samples/ with phrases_rule.yar; it writes its own report file.
scan_with_yara(
    rules_path='rules/phrases_rule.yar',
    target_dir='samples/',
    output_path='results/yara_phrases_scan.txt'
)


Scanning finished. 1710 files were scanned, 91 matches were detected.
Results saved in: results/yara_phrases_scan.txt


In [68]:
# Per-category match counts for the phrases_rule.yar report.
analyze_yara_results_from_file('results/yara_phrases_scan.txt')

Analiza wyników z pliku: results/yara_phrases_scan.txt
Phishing mails: 91 / 822 dopasowań
Regular mails:  0 / 887 dopasowań
Inne pliki:      0 / 1 dopasowań
SUMA:            91 / 1710 plików dopasowanych



In [69]:
# Repeat the scan of samples/ with domains_rule.yar; it writes its own report file.
scan_with_yara(
    rules_path='rules/domains_rule.yar',
    target_dir='samples/',
    output_path='results/yara_domains_scan.txt'
)


Scanning finished. 1710 files were scanned, 1 matches were detected.
Results saved in: results/yara_domains_scan.txt


In [70]:
# Per-category match counts for the domains_rule.yar report.
analyze_yara_results_from_file('results/yara_domains_scan.txt')

Analiza wyników z pliku: results/yara_domains_scan.txt
Phishing mails: 1 / 822 dopasowań
Regular mails:  0 / 887 dopasowań
Inne pliki:      0 / 1 dopasowań
SUMA:            1 / 1710 plików dopasowanych



In [71]:
# Repeat the scan of samples/ with suspicious_tld_rule.yar; it writes its own report file.
scan_with_yara(
    rules_path='rules/suspicious_tld_rule.yar',
    target_dir='samples/',
    output_path='results/yara_tld_scan.txt'
)


Scanning finished. 1710 files were scanned, 243 matches were detected.
Results saved in: results/yara_tld_scan.txt


In [72]:
# Per-category match counts for the suspicious_tld_rule.yar report.
analyze_yara_results_from_file('results/yara_tld_scan.txt')

Analiza wyników z pliku: results/yara_tld_scan.txt
Phishing mails: 84 / 822 dopasowań
Regular mails:  159 / 887 dopasowań
Inne pliki:      0 / 1 dopasowań
SUMA:            243 / 1710 plików dopasowanych



In [73]:
# Repeat the scan of samples/ with encoded_reply_rule.yar; it writes its own report file.
scan_with_yara(
    rules_path='rules/encoded_reply_rule.yar',
    target_dir='samples/',
    output_path='results/yara_encoded_scan.txt'
)


Scanning finished. 1710 files were scanned, 19 matches were detected.
Results saved in: results/yara_encoded_scan.txt


In [74]:
# Per-category match counts for the encoded_reply_rule.yar report.
analyze_yara_results_from_file('results/yara_encoded_scan.txt')

Analiza wyników z pliku: results/yara_encoded_scan.txt
Phishing mails: 19 / 822 dopasowań
Regular mails:  0 / 887 dopasowań
Inne pliki:      0 / 1 dopasowań
SUMA:            19 / 1710 plików dopasowanych



In [75]:
# Repeat the scan of samples/ with suspicious_links_rule.yar; it writes its own report file.
scan_with_yara(
    rules_path='rules/suspicious_links_rule.yar',
    target_dir='samples/',
    output_path='results/suspicious_links_scan.txt'
)


Scanning finished. 1710 files were scanned, 520 matches were detected.
Results saved in: results/suspicious_links_scan.txt


In [76]:
# Per-category match counts for the suspicious_links_rule.yar report.
analyze_yara_results_from_file('results/suspicious_links_scan.txt')

Analiza wyników z pliku: results/suspicious_links_scan.txt
Phishing mails: 421 / 822 dopasowań
Regular mails:  99 / 887 dopasowań
Inne pliki:      0 / 1 dopasowań
SUMA:            520 / 1710 plików dopasowanych



In [77]:
# Repeat the scan of samples/ with complex_html_rule.yar; it writes its own report file.
scan_with_yara(
    rules_path='rules/complex_html_rule.yar',
    target_dir='samples/',
    output_path='results/complex_html_scan.txt'
)


Scanning finished. 1710 files were scanned, 143 matches were detected.
Results saved in: results/complex_html_scan.txt


In [78]:
# Per-category match counts for the complex_html_rule.yar report.
analyze_yara_results_from_file('results/complex_html_scan.txt')

Analiza wyników z pliku: results/complex_html_scan.txt
Phishing mails: 82 / 822 dopasowań
Regular mails:  61 / 887 dopasowań
Inne pliki:      0 / 1 dopasowań
SUMA:            143 / 1710 plików dopasowanych

