In [1]:
!pip install tenseal
!pip install transformers accelerate torch
!pip install -q --upgrade openai

Collecting transformers
  Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting accelerate
  Using cached accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting torch
  Using cached torch-2.7.1-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Using cached huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.17 (from transformers)
  Using cached numpy-2.3.0-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2024.11.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from tra

In [3]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.15.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.5/12.5 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading joblib-1.5.1-py3-none-any.whl (307 kB)
Downloading scipy-1.15.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37.3 MB[0m [31m6.2 MB/s[0m eta [36m

In [4]:
from typing import List, Dict, Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import json
import openai
from openai import OpenAI
import re
import hashlib
import os
import numpy as np
from pathlib import Path
import tenseal as ts
import hashlib
from multiprocessing import Pool
from functools import partial

# --- Configuration ---
# Length of every hashed feature vector; default `buckets` of hash_ngram()/vectorize().
BUCKETS = 1024
# Default character n-gram width used by vectorize().
NGRAM_SIZE = 4
# Assigned to the CKKS context's global_scale in create_ckks_context(); 2**40
# lines up with the 40-bit middle primes requested there.
SCALE = 2 ** 40

In [None]:
def hash_ngram(ngram, buckets=1024):
    """Map an n-gram string to a bucket index in ``[0, buckets)``.

    The SHA-256 digest of the UTF-8 encoded n-gram is read as one big-endian
    integer and reduced modulo ``buckets``.

    NOTE(review): this cell defines ``hash_ngram`` a second time further down
    (with ``buckets=BUCKETS``); once the cell has run, that later definition
    is the one in effect.
    """
    digest = hashlib.sha256(ngram.encode()).digest()
    return int.from_bytes(digest, "big") % buckets

def vectorize(text, n=4, buckets=1024):
    """Hash the character n-grams of ``text`` into a list of ``buckets`` counts.

    The text is lower-cased, every window of ``n`` consecutive characters is
    mapped to a bucket with ``hash_ngram`` and that bucket's counter is
    incremented. Text shorter than ``n`` yields an all-zero list.

    NOTE(review): this cell defines ``vectorize`` again further down (with
    defaults taken from NGRAM_SIZE / BUCKETS); the later definition wins once
    the cell has run.
    """
    counts = [0] * buckets
    lowered = text.lower()
    last_start = len(lowered) - n
    start = 0
    while start <= last_start:
        bucket = hash_ngram(lowered[start:start + n], buckets)
        counts[bucket] += 1
        start += 1
    return counts

# Connect to OpenRouter through the OpenAI-compatible client.
# The key is read from the OPENROUTER_API_KEY environment variable so it never
# has to be pasted into (and later scrubbed from) the notebook. The empty-string
# fallback keeps the cell runnable without a key, exactly like the previous
# hard-coded "" did; requests will presumably be rejected by the API in that case.
client = OpenAI(
    api_key=os.environ.get("OPENROUTER_API_KEY", ""),
    base_url="https://openrouter.ai/api/v1"
)

# 1. Helper: LLM phrase generation from regex
def generate_phrases_from_regex(regex: str, max_phrases=10) -> List[str]:
    """Ask the LLM for phishing-style phrases that match the intent of ``regex``.

    Args:
        regex: A regex taken from (or inferred for) a phishing YARA rule.
        max_phrases: How many phrases the prompt asks the model to list.

    Returns:
        The non-empty lines of the model's reply with list markers (bullets,
        dashes, "1." / "2)" / "3-" numbering) and surrounding double quotes
        removed. The list is empty when the model returns no text.
    """
    prompt = f"""You are a cybersecurity expert. Given the following regex from a phishing YARA rule:

{regex}

List {max_phrases} natural phrases or sentences that could appear in a phishing email and match the intent of this regex.
Just list each phrase on a new line without explanations."""

    response = client.chat.completions.create(
        model="mistralai/mistral-7b-instruct",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        max_tokens=300
    )

    # The SDK types `content` as optional; treat a missing reply as empty text.
    raw_text = response.choices[0].message.content or ""

    phrases = []
    # splitlines() also handles "\r\n", which split("\n") left as a trailing "\r".
    for line in raw_text.splitlines():
        # Require punctuation after the digits so that only list numbering is
        # removed; a phrase such as "3 days left to verify" keeps its number.
        cleaned = re.sub(r"^\s*\d+[\.\)\-]+\s*", "", line)
        # Strip bullets/dashes, then quotes on BOTH sides (previously only the
        # leading quote was removed).
        cleaned = cleaned.strip(" -•\n").strip("\"").strip()
        # A line that was only a marker ("1.", "-") cleans down to nothing.
        if cleaned:
            phrases.append(cleaned)
    return phrases

# 2. Helper: extract n-grams (3-5 words) from phrases
def extract_ngrams(text: str, min_n=3, max_n=5) -> List[str]:
    """Return every word n-gram of ``text`` with ``min_n`` to ``max_n`` words.

    The text is lower-cased and split on whitespace. N-grams are emitted
    grouped by size (all ``min_n``-grams first), left to right within a size,
    each joined with single spaces. Sizes longer than the text contribute
    nothing.
    """
    tokens = text.lower().split()
    grams = []
    for size in range(min_n, max_n + 1):
        for start in range(len(tokens) - size + 1):
            grams.append(" ".join(tokens[start:start + size]))
    return grams

# 3. Convert YARA regexes to a phrase vocabulary and its hashed vectors
def vectorize_yara_phrases(rules: List[Dict[str, object]]) -> Tuple[List[List[int]], List[str]]:
    """Build a hashed phrase vocabulary for a set of YARA-derived rules.

    For every rule the LLM is asked (one request per rule) for phrases matching
    ``rule["pattern"]``; the phrases are broken into 3-5 word n-grams, the
    n-grams are de-duplicated and sorted, and each one is hashed with
    ``vectorize``.

    Args:
        rules: Dicts carrying at least a ``"pattern"`` key.

    Returns:
        ``(vectorized_vocab, sorted_vocab)`` where ``sorted_vocab`` is the
        sorted list of unique n-gram strings and ``vectorized_vocab[i]`` is the
        hashed count vector of ``sorted_vocab[i]``. (The previous annotation
        promised a ``TfidfVectorizer``, which this function never creates.)
    """
    vocabulary = set()
    for rule in rules:
        for phrase in generate_phrases_from_regex(rule["pattern"]):
            # Convert to 3-5 word n-grams
            vocabulary.update(extract_ngrams(phrase, 3, 5))

    sorted_vocab = sorted(vocabulary)
    vectorized_vocab = [vectorize(entry) for entry in sorted_vocab]

    return vectorized_vocab, sorted_vocab



def generate_regexs_from_rule_text(rule_text: str, max_regex=3) -> List[Dict[str, object]]:
    """Ask the LLM to extract weighted regexes from the text of one YARA rule.

    Args:
        rule_text: Full text of a YARA rule file.
        max_regex: Upper bound on the number of regexes requested and returned.

    Returns:
        A list of ``{"pattern": str, "weight": float}`` dicts, at most
        ``max_regex`` long. An empty list is returned (after printing the
        problem and the raw reply) when the reply cannot be parsed.
    """
    import ast  # local import: only needed to parse the model's reply

    prompt = f"""You are a cybersecurity expert. Here is a YARA rule:
    {rule_text}

    Your task is to extract up to {max_regex} regular expressions that are explicitly written or clearly implied by the rule.

    For each regex, estimate the "weight" field (as a float) to reflect how strongly the pattern indicates phishing (the higher the weight, the more suspicious/phishy the pattern is). Assign higher weights to patterns that are more typical for phishing, and lower weights to more generic or less suspicious patterns.

    Return the result as a valid Python list of dictionaries, using the following exact format:
    [
        {{ "pattern": r"/regex1/i", "weight": 1.0 }},
        {{ "pattern": r"/regex2/i", "weight": 0.7 }}
    ]

    Do not include any explanations, markdown, or comments!

    - Each pattern must be a Python raw string (starting with r") and contain a YARA-style regex: between slashes (e.g., /something/i).
    - Do NOT use extra quotes inside the pattern (e.g., no "r\"...\"" or escapes).
    - Do NOT include any markdown, explanations, comments, or extra text — just the list.
    - If the rule contains no regex, infer up to 3 regex-style patterns that match the intent of the rule (e.g., suspicious links, phishing phrases, etc.).
    - The pattern must be a valid Python raw string.
    - Avoid unescaped quotes. Prefer single quotes `'` around strings.
    - Do not include YARA flags like `nocase` as raw identifiers — use `/.../i` inside the string instead.

    """

    response = client.chat.completions.create(
        model="mistralai/mistral-7b-instruct",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.5,
        max_tokens=300
    )

    # The SDK types `content` as optional; treat a missing reply as empty text.
    raw_output = response.choices[0].message.content or ""
    candidate = raw_output.strip()

    # Models frequently wrap the list in a markdown fence despite the prompt.
    fenced = re.fullmatch(r"```[\w-]*[ \t]*\n(.*?)\n?```", candidate, re.DOTALL)
    if fenced:
        candidate = fenced.group(1).strip()

    try:
        # SECURITY: the reply is untrusted text (it is derived from rule files
        # and a remote model). This used to be eval(), which executes whatever
        # the model returns. literal_eval only accepts Python literals, which
        # still covers the requested list of dicts with raw-string patterns.
        parsed = ast.literal_eval(candidate)
        if not isinstance(parsed, list):
            raise ValueError(f"expected a list, got {type(parsed).__name__}")
    except Exception as e:
        print("Błąd parsowania odpowiedzi z LLM:", e)
        print("Odpowiedź:\n", raw_output)
        return []

    # Keep only well-formed entries so later code can index "pattern"/"weight"
    # without a KeyError.
    rules = []
    for entry in parsed:
        try:
            pattern = entry["pattern"]
            weight = float(entry["weight"])
        except (TypeError, KeyError, ValueError):
            print("Pominięto niepoprawny wpis z LLM:", entry)
            continue
        if not isinstance(pattern, str) or not pattern:
            print("Pominięto niepoprawny wpis z LLM:", entry)
            continue
        rules.append({"pattern": pattern, "weight": weight})

    # The prompt asks for "up to max_regex"; enforce it in case the model overshoots.
    return rules[:max_regex]


def generate_regexs_from_folder(folder_path: str, max_regex=3):
    """
    Extract regexes for every ``.yar`` file directly inside ``folder_path``.

    Each file is read as UTF-8 and passed to ``generate_regexs_from_rule_text``.
    Files for which no regexes come back are reported on stdout and skipped.

    Returns a list of ``(filename, rule_list)`` tuples, ordered by filename.
    """
    all_rule_sets = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the result
    # (and everything printed from it) stable between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".yar"):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            rule_text = f.read()

        regex_list = generate_regexs_from_rule_text(rule_text, max_regex=max_regex)
        if regex_list:
            all_rule_sets.append((filename, regex_list))
        else:
            print(f"Brak regexów w pliku {filename}")

    return all_rule_sets

# --- Text Hashing Vectorizer ---
def hash_ngram(ngram, buckets=BUCKETS):
    """Return the bucket index of ``ngram`` in ``[0, buckets)``.

    The hex SHA-256 digest of the UTF-8 encoded n-gram is parsed as a base-16
    integer and reduced modulo ``buckets``. This definition replaces the one
    given earlier in the same cell.
    """
    hex_digest = hashlib.sha256(ngram.encode()).hexdigest()
    digest_as_int = int(hex_digest, base=16)
    return digest_as_int % buckets

def vectorize(text, n=NGRAM_SIZE, buckets=BUCKETS):
    """Hash the character n-grams of ``text`` into a list of ``buckets`` counts.

    The text is lower-cased; every window of ``n`` consecutive characters is
    assigned to a bucket by ``hash_ngram`` and counted there. Text shorter than
    ``n`` yields an all-zero list. This definition replaces the one given
    earlier in the same cell.

    Args:
        text: Input string.
        n: Window width in characters; must be at least 1.
        buckets: Length of the returned list; must be at least 1.

    Returns:
        A list of ``buckets`` non-negative integers.

    Raises:
        ValueError: If ``n`` or ``buckets`` is smaller than 1 (previously
            ``n=0`` silently counted empty strings and ``buckets=0`` failed
            with a ZeroDivisionError inside ``hash_ngram``).
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")
    if buckets < 1:
        raise ValueError(f"buckets must be >= 1, got {buckets}")

    text = text.lower()

    # Count distinct n-grams first so each one is SHA-256 hashed only once;
    # repeated n-grams are common in e-mail text and the totals are identical.
    ngram_counts = {}
    for i in range(len(text) - n + 1):
        gram = text[i:i + n]
        ngram_counts[gram] = ngram_counts.get(gram, 0) + 1

    vec = [0] * buckets
    for gram, count in ngram_counts.items():
        vec[hash_ngram(gram, buckets)] += count
    return vec

# --- CKKS Context ---
def create_ckks_context():
    """Create a TenSEAL CKKS context configured for this notebook.

    Uses a polynomial modulus degree of 8192 with a 60/40/40/60-bit coefficient
    modulus chain, sets the global scale to ``SCALE`` and generates Galois keys
    (presumably needed by the encrypted ``.dot()`` calls made elsewhere in the
    notebook; confirm against the TenSEAL documentation).
    """
    ckks_params = {
        "poly_modulus_degree": 8192,
        "coeff_mod_bit_sizes": [60, 40, 40, 60],  # Must match scale size
    }
    ctx = ts.context(ts.SCHEME_TYPE.CKKS, **ckks_params)
    ctx.global_scale = SCALE
    ctx.generate_galois_keys()
    return ctx

# --- Encryption Helper ---
def encrypt_vector(vector, context):
    """Encrypt ``vector`` as a TenSEAL CKKS vector under ``context``."""
    encrypted = ts.ckks_vector(context, vector)
    return encrypted

def rule_matches(pattern: str, text: str) -> bool:
    """Report whether a (possibly YARA-delimited) regex occurs in ``text``.

    ``pattern`` may be a bare regex (``password``) or a YARA-style literal
    ``/body/flags`` where ``flags`` is any combination of ``i`` and ``s``.
    Matching is always case-insensitive, as before; the ``s`` flag additionally
    lets ``.`` match newlines.

    The previous implementation used ``pattern.strip("/i")``, which removes any
    run of the *characters* ``/`` and ``i`` from both ends: ``/hi/i`` became
    ``h``, ``/id/i`` became ``d`` and ``bit\\.ly/`` lost its trailing slash.
    Delimiters are now removed only when the whole string has the
    ``/.../flags`` shape.

    Returns:
        True if the regex is found anywhere in ``text``. False if it is not
        found, if the pattern body is empty (an empty regex would match every
        text), or if the pattern is not a valid Python regex. Patterns come
        from an LLM, so one malformed pattern should not abort a scan.
    """
    flags = re.IGNORECASE
    delimited = re.fullmatch(r"/(.*)/([is]*)", pattern, re.DOTALL)
    if delimited:
        body, modifiers = delimited.groups()
        if "s" in modifiers:
            flags |= re.DOTALL
    else:
        body = pattern

    if not body:
        return False

    try:
        return re.search(body, text, flags) is not None
    except re.error:
        return False


In [6]:
def vectorize_all_emails(input_dir: str, output_path: str) -> np.ndarray:
    """Vectorize every file under ``input_dir`` and save the result as ``.npz``.

    The directory tree is walked recursively. Each file is read as UTF-8
    (undecodable bytes are replaced) and hashed with ``vectorize``. The stacked
    vectors are written with ``np.savez_compressed`` under the default key
    ``arr_0`` and also returned.

    Directories and files are visited in sorted order: the row index is the
    only thing that identifies an e-mail in the saved array, so the order must
    not depend on what the file system happens to return.

    Args:
        input_dir: Root directory containing the e-mail files.
        output_path: Destination passed to ``np.savez_compressed``.

    Returns:
        The array that was saved, one row per file. (The function was annotated
        ``-> None`` although it has always returned the array.)
    """
    vectors: list[np.ndarray] = []
    for dirpath, dirnames, filenames in os.walk(input_dir):
        dirnames.sort()  # in-place sort fixes the order os.walk descends in
        for filename in sorted(filenames):
            file_path = os.path.join(dirpath, filename)
            with open(file_path, encoding="utf-8", errors="replace") as file:
                text = file.read()
            vectors.append(np.array(vectorize(text)))

    # Build the stacked array once and reuse it for both saving and returning.
    stacked = np.array(vectors)
    np.savez_compressed(output_path, stacked)

    return stacked

def iter_emails(path: str):
    """Yield the rows of the ``arr_0`` array stored in the ``.npz`` file at ``path``.

    The archive is opened on the first ``next()`` call, the array is read into
    memory, and the archive is closed before any row is yielded. Previously the
    object returned by ``np.load`` was never closed, leaving the file handle
    open until garbage collection.
    """
    with np.load(path) as archive:
        array = archive['arr_0']
    yield from array

In [7]:
def calibrate_threshold(regular_path, vectorized_vocab, vocab_boost=2.0, percentile=75):
    """Derive a match threshold from the similarity scores of regular e-mails.

    Every vector stored at ``regular_path`` is encrypted, dotted with every
    encrypted vocabulary vector, and scored by its highest decrypted dot
    product. The threshold is the ``percentile``-th percentile of those scores
    plus ``vocab_boost``.

    Note: the threshold is calibrated on the same regular mails that
    ``scan_vectorized_emails`` later scans, so with the default percentile of
    75 a sizeable share of them can still score above it.

    Args:
        regular_path: ``.npz`` file readable by ``iter_emails``.
        vectorized_vocab: Hashed vocabulary vectors (lists of counts).
        vocab_boost: Constant added on top of the percentile.
        percentile: Percentile (0-100) passed to ``np.percentile``.

    Returns:
        The calibrated threshold.

    Raises:
        ValueError: If the vocabulary is empty or ``regular_path`` contains no
            vectors. Both cases used to surface as an opaque ``max()`` error or
            an unusable threshold.
    """
    if not vectorized_vocab:
        raise ValueError("vectorized_vocab is empty; cannot calibrate a threshold")

    context = create_ckks_context()
    # Pad everything to a common length so the encrypted dot products line up.
    max_len = max(max(len(vec) for vec in vectorized_vocab), BUCKETS)

    encrypted_vocab = [
        encrypt_vector(list(vec) + [0] * (max_len - len(vec)), context)
        for vec in vectorized_vocab
    ]

    def evaluate(vec):
        vec = list(vec) + [0] * (max_len - len(vec))
        enc_email = encrypt_vector(vec, context)
        # decrypt() returns a list; slot 0 is read as the dot-product value.
        return max(enc_email.dot(enc_phrase).decrypt()[0] for enc_phrase in encrypted_vocab)

    similarities = [evaluate(vec) for vec in iter_emails(regular_path)]
    if not similarities:
        raise ValueError(f"no vectors found in {regular_path!r}; cannot calibrate a threshold")

    base_threshold = np.percentile(similarities, percentile)

    return base_threshold + vocab_boost

def scan_vectorized_emails(
    phishing_path: str,
    regular_path: str,
    output_path: str,
    rules: List[Dict[str, object]],
    vectorized_vocab: List[List[int]],
    threshold: float = 10.0,
    vocab_boost: float = 2.0,
    sample_fraction: float = 1.0,
    max_files: int = None,
    seed: int = None
):
    """Score vectorized e-mails against an encrypted vocabulary and log verdicts.

    For both data sets (labelled ``phishing_mails`` and ``regular_mails``) a
    random sample of vectors is encrypted and dotted with every encrypted
    vocabulary vector; the highest decrypted dot product is the e-mail's
    similarity. One line per e-mail is written to ``output_path``::

        MATCH|NO MATCH: <label>/<row> -> similarity=…, yara_score=…, total=…

    ``<row>`` is the row index of the e-mail inside its ``.npz`` file, so a
    verdict can be traced back to its vector. (It used to be the position
    within the shuffled sample, which identified nothing.)

    Args:
        phishing_path: ``.npz`` file with phishing vectors.
        regular_path: ``.npz`` file with regular vectors; also used for
            calibration when ``threshold == "auto"``.
        output_path: Text file to (over)write with the verdict lines.
        rules: Dicts with ``"pattern"`` and ``"weight"`` keys.
        vectorized_vocab: Hashed vocabulary vectors.
        threshold: Score at or above which an e-mail is a MATCH, or the string
            ``"auto"`` to calibrate it with ``calibrate_threshold`` (using that
            function's default percentile).
        vocab_boost: Passed to ``calibrate_threshold`` in ``"auto"`` mode.
        sample_fraction: Fraction of each data set to scan.
        max_files: Optional cap on the number of e-mails per data set;
            ``None`` means no cap (``0`` now really means zero).
        seed: Optional seed for the sampling shuffle. ``None`` keeps using the
            global ``random`` state as before.

    Raises:
        ValueError: If ``vectorized_vocab`` is empty.
    """
    import random

    if not vectorized_vocab:
        raise ValueError("vectorized_vocab is empty; nothing to compare emails against")

    context = create_ckks_context()
    # Pad everything to a common length so the encrypted dot products line up.
    max_len = max(max(len(vec) for vec in vectorized_vocab), BUCKETS)

    encrypted_vocab = [
        encrypt_vector(list(vec) + [0] * (max_len - len(vec)), context)
        for vec in vectorized_vocab
    ]

    def evaluate_email(vec):
        vec = list(vec) + [0] * (max_len - len(vec))
        enc_email = encrypt_vector(vec, context)
        return max(enc_email.dot(enc_phrase).decrypt()[0] for enc_phrase in encrypted_vocab)

    if threshold == "auto":
        threshold = calibrate_threshold(
            regular_path, vectorized_vocab,
            vocab_boost=vocab_boost
        )
        print(f"Dynamicznie ustawiony próg: {threshold:.2f}")

    # Only hashed vectors are available here, not the e-mail text, so the rules
    # are evaluated against an empty string. That result is the same for every
    # e-mail; compute it once instead of once per e-mail.
    text = ""  # optionally left empty
    yara_score = 0.0
    for rule in rules:
        if rule_matches(rule["pattern"], text):
            yara_score += rule["weight"]

    rng = random.Random(seed) if seed is not None else random

    with open(output_path, 'w', encoding='utf-8') as f:
        for label, path in [("phishing_mails", phishing_path), ("regular_mails", regular_path)]:
            all_vecs = list(iter_emails(path))

            # Shuffle row indices rather than the rows so the original index
            # survives into the report.
            order = list(range(len(all_vecs)))
            rng.shuffle(order)

            sample_size = int(len(all_vecs) * sample_fraction)
            if max_files is not None:
                sample_size = min(sample_size, max(max_files, 0))

            for idx in order[:sample_size]:
                similarity = evaluate_email(all_vecs[idx])

                final_score = similarity + yara_score
                final_threshold = threshold
                result = "MATCH" if final_score >= final_threshold else "NO MATCH"

                f.write(f"{result}: {label}/{idx} -> similarity={similarity:.2f}, yara_score={yara_score:.2f}, total={final_score:.2f}\n")


def analyze_vectorized_results(result_file_path: str):
    """Summarize a verdict file written by ``scan_vectorized_emails``.

    Each non-blank line is attributed to the phishing set if it contains
    ``phishing_mails``, otherwise to the regular set if it contains
    ``regular_mails``, otherwise to "other". A line counts as a match when it
    starts with ``MATCH``. Blank lines are ignored (they used to be counted as
    "other" files).

    The summary is printed and also returned so callers can compute metrics.

    Returns:
        ``{"phishing": (matches, total), "regular": (matches, total),
        "other": (matches, total)}``.
    """
    phishing_matches = phishing_total = 0
    regular_matches = regular_total = 0
    other_matches = other_total = 0

    with open(result_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            # bool counts as 0/1 when added to the match counters below
            is_match = line.startswith("MATCH")
            if "phishing_mails" in line:
                phishing_total += 1
                phishing_matches += is_match
            elif "regular_mails" in line:
                regular_total += 1
                regular_matches += is_match
            else:
                other_total += 1
                other_matches += is_match

    total_matches = phishing_matches + regular_matches + other_matches
    total_scanned = phishing_total + regular_total + other_total

    print("Analiza wyników (vectorized approach):")
    print(f"Phishing mails: {phishing_matches} / {phishing_total} dopasowań")
    print(f"Regular mails:  {regular_matches} / {regular_total} dopasowań")
    print(f"Inne pliki:      {other_matches} / {other_total} dopasowań")
    print(f"SUMA:            {total_matches} / {total_scanned} plików dopasowanych\n")

    return {
        "phishing": (phishing_matches, phishing_total),
        "regular": (regular_matches, regular_total),
        "other": (other_matches, other_total),
    }


In [8]:
# -- YARA rules with weights for every file in the "rules" folder ---
rules_sets = generate_regexs_from_folder("rules/", max_regex=3)

all_vectorized_vocabs = []  # one vectorized_vocab per rule file
all_sorted_vocabs = []      # one sorted_vocab per rule file
all_filenames = []          # rule file names, aligned with the two lists above
all_rules = []              # every rule from every file, flattened

for filename, rules in rules_sets:
    print(f"\n--- Zestaw reguł z pliku: {filename} ---")
    print("Regexy w zestawie:")
    for rule in rules:
        print(f"  {rule['pattern']}")
    vectorized_vocab, sorted_vocab = vectorize_yara_phrases(rules)
    all_vectorized_vocabs.append(vectorized_vocab)
    all_sorted_vocabs.append(sorted_vocab)
    all_filenames.append(filename)
    all_rules.extend(rules)
    # print("Lista fraz:", sorted_vocab)


# -- Global vocabulary across all files in the "rules" folder ---
# Build it from the per-file vocabularies collected above. Calling
# vectorize_yara_phrases(all_rules) again would send one more LLM request per
# rule and, because the model samples with temperature 0.7, produce phrases
# that differ from the per-file vocabularies the scans actually use.
global_sorted_vocab = sorted({phrase for vocab in all_sorted_vocabs for phrase in vocab})
global_vectorized_vocab = [vectorize(phrase) for phrase in global_sorted_vocab]
# print("\n=== GLOBALNA LISTA SŁOWNIKÓW FRAZ (ze wszystkich plików) ===")
# print(global_sorted_vocab)


--- Zestaw reguł z pliku: sekurak_yara_example.yar ---
Regexy w zestawie:
  sekret
  password
  .*sekret.*

--- Zestaw reguł z pliku: complex_html_rule.yar ---
Regexy w zestawie:
  <!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01
  <a[^>]+>(\s*<\/a>|\s*<span.*<\/span><\/a>)
  Hello,\s*nocase

--- Zestaw reguł z pliku: encoded_reply_rule.yar ---
Regexy w zestawie:
  /Reply-To:\s+=\?UTF-8\?B\?.{20,}\?=/i

--- Zestaw reguł z pliku: suspicious_tld_rule.yar ---
Regexy w zestawie:
  $tld_ml|$tld_ga|$tld_ru|$tld_su|$tld_skin|$tld_cn|$tld_top|$tld_sbs|$tld_bond|$tld_xyz
  \.(ml|ga|ru|su|skin|cn|top|sbs|bond|xyz)$
  $tld*

--- Zestaw reguł z pliku: suspicious_links_rule.yar ---
Regexy w zestawie:
  bit\.ly/
  tinyurl\.com/
  https?://[^ ]*@(.*)

--- Zestaw reguł z pliku: phrases_rule.yar ---
Regexy w zestawie:
  Renew your subscription|Update your payment details|Your shipment is on the way|Password Expiration Notification|New file shared in Teams|Urgent|Verification required|Invoice|Need urgent 

In [None]:
# ---All YARA files processing in loop--
def process_rule_set(
    filename,
    rules,
    vectorized_vocab,
    phishing_path="./samples/phishing_mails_vectorized.npz",
    regular_path="./samples/regular_mails_vectorized.npz",
    results_dir="results",
):
    """Scan both e-mail sets with one rule file's vocabulary and print a summary.

    The verdicts go to ``<results_dir>/<filename>_results.txt`` with an
    auto-calibrated threshold, after which the file is summarized with
    ``analyze_vectorized_results``.

    Args:
        filename: Name of the rule file; used for the log line and output name.
        rules: The rule dicts extracted from that file.
        vectorized_vocab: Hashed vocabulary built for those rules.
        phishing_path: ``.npz`` file with phishing vectors.
        regular_path: ``.npz`` file with regular vectors.
        results_dir: Directory for the verdict files; created if missing
            (previously a missing directory made the scan fail when opening
            the output file).
    """
    print(f"\n--- Przetwarzanie reguł z pliku: {filename} ---")
    os.makedirs(results_dir, exist_ok=True)
    output_path = os.path.join(results_dir, f"{filename}_results.txt")

    scan_vectorized_emails(
        phishing_path=phishing_path,
        regular_path=regular_path,
        output_path=output_path,
        rules=rules,
        vectorized_vocab=vectorized_vocab,
        threshold="auto",
        vocab_boost=2.0
    )

    analyze_vectorized_results(output_path)


# One (filename, rules, vocab) argument tuple per rule file for
# process_rule_set. rules_sets and all_vectorized_vocabs were filled in the
# same loop, so zipping them pairs each file with its own vocabulary.
tasks = [
    (rule_file, rule_list, rule_vocab)
    for (rule_file, rule_list), rule_vocab in zip(rules_sets, all_vectorized_vocabs)
]



In [11]:
import multiprocessing
# Show how many CPUs the kernel reports; the next cell sizes its worker pool from this value.
multiprocessing.cpu_count()

48

In [None]:

if __name__ == "__main__":
    # Run one process_rule_set call per rule file in parallel. There is no point
    # in starting more workers than tasks (cpu_count() reported 48 here for a
    # handful of rule files), and Pool() rejects a worker count of zero, so an
    # empty task list is skipped altogether.
    if tasks:
        worker_count = min(len(tasks), multiprocessing.cpu_count())
        with Pool(processes=worker_count) as pool:
            pool.starmap(process_rule_set, tasks)


--- Przetwarzanie reguł z pliku: sekurak_yara_example.yar ---


--- Przetwarzanie reguł z pliku: encoded_reply_rule.yar ---

--- Przetwarzanie reguł z pliku: complex_html_rule.yar ---
--- Przetwarzanie reguł z pliku: suspicious_tld_rule.yar ---

--- Przetwarzanie reguł z pliku: suspicious_links_rule.yar ---

--- Przetwarzanie reguł z pliku: phrases_rule.yar ---

--- Przetwarzanie reguł z pliku: domains_rule.yar ---
Dynamicznie ustawiony próg: 692.00
Dynamicznie ustawiony próg: 620.00
Dynamicznie ustawiony próg: 698.50
Dynamicznie ustawiony próg: 466.00
Dynamicznie ustawiony próg: 724.50
Dynamicznie ustawiony próg: 604.00
Analiza wyników (vectorized approach):
Phishing mails: 611 / 822 dopasowań
Regular mails:  221 / 887 dopasowań
Inne pliki:      0 / 0 dopasowań
SUMA:            832 / 1709 plików dopasowanych

Analiza wyników (vectorized approach):
Phishing mails: 549 / 822 dopasowań
Regular mails:  222 / 887 dopasowań
Inne pliki:      0 / 0 dopasowań
SUMA:            771 / 1709 plikó