In [20]:
!pip install tenseal
!pip install transformers accelerate torch
!pip install -q --upgrade openai



In [21]:
from typing import List, Dict, Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import json
import openai
from openai import OpenAI
import re
import hashlib
import os
import numpy as np
from pathlib import Path

In [22]:
def hash_ngram(ngram, buckets=1024):
    """Map an n-gram string to a bucket index in ``range(buckets)``.

    The encoded n-gram is hashed with SHA-256, the digest is read as one
    big-endian integer, and that integer is reduced modulo ``buckets``.
    """
    digest = hashlib.sha256(ngram.encode()).digest()
    return int.from_bytes(digest, "big") % buckets

def vectorize(text, n=4, buckets=1024):
    """Build a hashed character n-gram count vector for ``text``.

    The text is lower-cased, every window of ``n`` consecutive characters is
    hashed into one of ``buckets`` slots with ``hash_ngram``, and the slot's
    counter is incremented. Returns a list of ``buckets`` integer counts.
    """
    lowered = text.lower()
    counts = [0] * buckets
    window_count = len(lowered) - n + 1
    for start in range(window_count):
        bucket = hash_ngram(lowered[start:start + n], buckets)
        counts[bucket] += 1
    return counts

# Connect to OpenRouter through the OpenAI-compatible client.
# The API key is read from the OPENROUTER_API_KEY environment variable so that
# no secret is stored in the notebook; a missing variable raises KeyError here.
# NOTE(security): the key that used to be hardcoded at this spot has been
# committed in plain text and should be revoked.
client = OpenAI(
    api_key=os.environ["OPENROUTER_API_KEY"],
    base_url="https://openrouter.ai/api/v1"
)

# 1. Helper: LLM phrase generation from regex
def generate_phrases_from_regex(regex: str, max_phrases=10) -> List[str]:
    """Ask the LLM for example phishing phrases matching the intent of ``regex``.

    Args:
        regex: Regex taken from a phishing YARA rule; embedded verbatim in the prompt.
        max_phrases: Number of phrases the prompt asks for. The reply is not
            truncated, so the result may contain more or fewer entries.

    Returns:
        One cleaned phrase per non-empty reply line. Leading list numbering,
        bullets/dashes and surrounding double quotes are removed; lines that
        are empty after cleaning are dropped.
    """
    prompt = f"""You are a cybersecurity expert. Given the following regex from a phishing YARA rule:

{regex}

List {max_phrases} natural phrases or sentences that could appear in a phishing email and match the intent of this regex.
Just list each phrase on a new line without explanations."""

    response = client.chat.completions.create(
        model="mistralai/mistral-7b-instruct",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        max_tokens=300
    )

    # The message content may be None; treat that as an empty reply.
    raw_text = response.choices[0].message.content or ""

    phrases = []
    for line in raw_text.split("\n"):
        if not line.strip():
            continue
        # Drop "1.", "2)", "3-" style numbering, then bullets and dashes.
        cleaned = re.sub(r"^\s*[\d]+[\.\)\-]*\s*", "", line).strip(" -•\n")
        # Strip quotes on BOTH sides; stripping only the leading quote left a
        # trailing '"' glued to the last word of every quoted phrase.
        cleaned = cleaned.strip("\"“”").strip()
        if cleaned:
            phrases.append(cleaned)
    return phrases

# 2. Helper: word n-grams (3-5 words by default) of a phrase
def extract_ngrams(text: str, min_n=3, max_n=5) -> List[str]:
    """Return every word n-gram of ``text`` for sizes ``min_n``..``max_n``.

    The text is lower-cased and split on whitespace. N-grams are emitted
    grouped by size (smallest first) and, within a size, left to right.
    """
    tokens = text.lower().split()
    grams: List[str] = []
    for size in range(min_n, max_n + 1):
        last_start = len(tokens) - size
        for start in range(last_start + 1):
            grams.append(" ".join(tokens[start:start + size]))
    return grams

# 3. Convert YARA regexes to hashed phrase vectors plus their vocabulary
def vectorize_yara_phrases(rules: List[Dict[str, object]]) -> Tuple[List[List[int]], List[str]]:
    """Build a phrase vocabulary from YARA rules and vectorize each entry.

    For every rule, ``generate_phrases_from_regex`` is called on
    ``rule["pattern"]``; each returned phrase is expanded into 3-5 word
    n-grams, which are de-duplicated and sorted.

    Args:
        rules: Rule dicts; only the ``"pattern"`` key is read here.

    Returns:
        ``(vectorized_vocab, sorted_vocab)`` where ``sorted_vocab`` is the
        sorted list of unique n-grams and ``vectorized_vocab[i]`` is
        ``vectorize(sorted_vocab[i])``.
    """
    ngrams = set()
    for rule in rules:
        for phrase in generate_phrases_from_regex(rule["pattern"]):
            # Convert to 3–5 word n-grams
            ngrams.update(extract_ngrams(phrase, 3, 5))

    sorted_vocab = sorted(ngrams)
    vectorized_vocab = [vectorize(entry) for entry in sorted_vocab]

    return vectorized_vocab, sorted_vocab


In [23]:
import tenseal as ts
import hashlib

# --- Configuration ---
# Default number of hash buckets, i.e. the length of every vector returned by vectorize().
BUCKETS = 1024
# Default character n-gram length used by vectorize().
NGRAM_SIZE = 4
# Assigned to the CKKS context's global_scale in create_ckks_context().
SCALE = 2 ** 40

# --- Text Hashing Vectorizer ---
# NOTE: this redefines the hash_ngram from an earlier cell with the same
# logic; the only difference is that the default comes from BUCKETS.
def hash_ngram(ngram, buckets=BUCKETS):
    """Map an n-gram string to a bucket index in ``range(buckets)`` via SHA-256."""
    digest = hashlib.sha256(ngram.encode()).digest()
    return int.from_bytes(digest, "big") % buckets

def vectorize(text, n=NGRAM_SIZE, buckets=BUCKETS):
    """Build a hashed character n-gram count vector for ``text``.

    The text is lower-cased, each window of ``n`` consecutive characters is
    hashed with ``hash_ngram`` and the matching counter is incremented.
    Returns a list of ``buckets`` integer counts.
    """
    lowered = text.lower()
    counts = [0] * buckets
    window_count = len(lowered) - n + 1
    for start in range(window_count):
        bucket = hash_ngram(lowered[start:start + n], buckets)
        counts[bucket] += 1
    return counts

# --- CKKS Context ---
def create_ckks_context():
    """Create the TenSEAL CKKS context used for all encryption in this notebook.

    The context uses a polynomial modulus degree of 8192, sets its global
    scale to ``SCALE`` and has Galois keys generated before it is returned.
    """
    # The 40-bit inner primes must match the scale size (SCALE = 2 ** 40).
    modulus_bit_sizes = [60, 40, 40, 60]
    ckks = ts.context(
        ts.SCHEME_TYPE.CKKS,
        poly_modulus_degree=8192,
        coeff_mod_bit_sizes=modulus_bit_sizes,
    )
    ckks.global_scale = SCALE
    # NOTE(review): presumably needed by the encrypted .dot() calls later on — confirm.
    ckks.generate_galois_keys()
    return ckks

# --- Encryption Helper ---
def encrypt_vector(vector, context):
    """Encrypt ``vector`` as a TenSEAL CKKS vector under ``context``."""
    encrypted = ts.ckks_vector(context, vector)
    return encrypted

In [24]:
# yara_rules = [
#     r"/pass(word)?\s*reset/i"]
# vectorized_vocab, sorted_vocab = vectorize_yara_phrases(yara_rules)
# print(sorted_vocab)

In [25]:
# --- YARA rules with weights ---
# Each rule pairs a /regex/i literal with the weight it adds to the YARA score
# when the regex matches the email text.
rules = [
    {"pattern": pattern, "weight": weight}
    for pattern, weight in (
        (r"/pass(word)?\s*reset/i", 1.5),
        (r"/account\s+(suspended|locked|restricted)/i", 1.0),
        (r"/verify\s+your\s+identity/i", 1.2),
        (r"/click\s+the\s+link/i", 0.8),
        (r"/secure\s+login/i", 1.3),
    )
]

# Build the phrase vocabulary (one LLM request per rule) and its hashed vectors.
vectorized_vocab, sorted_vocab = vectorize_yara_phrases(rules)
print(sorted_vocab)

def rule_matches(pattern: str, text: str) -> bool:
    """Return True if the YARA-style regex ``pattern`` matches anywhere in ``text``.

    ``pattern`` may be written as a ``/body/flags`` literal (flags drawn from
    ``i`` and ``s``); in that case only ``body`` is compiled. Any other string
    is used as the regex unchanged. Matching is always case-insensitive,
    whatever flags are given.

    The delimiters are removed by parsing the literal. ``str.strip("/i")``
    strips a character set, so it also removed leading and trailing ``i`` or
    ``/`` characters from the regex body (``/id/i`` became ``d``).
    """
    literal = re.fullmatch(r"/(.*)/[is]*", pattern, re.DOTALL)
    body = literal.group(1) if literal else pattern
    return re.search(body, text, re.IGNORECASE) is not None



In [26]:
# --- Main Logic ---
# Demo: score one sample email against a single reference phrase with an
# encrypted dot product, then add the weights of the YARA rules that match.
# The names defined here (context, email_text, email_vector, ...) are module
# globals that the following cell reads.
context = create_ckks_context()

email_text = """Subject: Immediate Action Required – Password Reset

Dear user,

We have detected suspicious activity on your account and, as a security precaution, your access has been temporarily limited.

To restore access, please follow the link below to initiate a password reset:

👉 https://secure-login-authenticator.com/reset

If you do not reset your password within 24 hours, your account will be permanently locked.

Thank you for your cooperation,
Security Team
"""
yara_rule_text = "password reset"

# Vectorize both texts into hashed character n-gram count vectors
email_vector = vectorize(email_text)
yara_vector = vectorize(yara_rule_text)

# Pad to same length if needed. vectorize() always returns `buckets` entries,
# so with the default arguments used above this branch never runs.
if len(email_vector) != len(yara_vector):
    max_len = max(len(email_vector), len(yara_vector))
    email_vector += [0] * (max_len - len(email_vector))
    yara_vector += [0] * (max_len - len(yara_vector))

# Encrypt both vectors
enc_email = encrypt_vector(email_vector, context)
enc_yara = encrypt_vector(yara_vector, context)

# Compute encrypted dot product
# This works because both vectors have same scale & context
enc_score = enc_email.dot(enc_yara)

# --- Threshold Configuration ---
THRESHOLD = 10.0

# Decrypt the result; the dot product is the first element of the decrypted output
score = enc_score.decrypt()[0]
print("Similarity Score (Dot Product, decrypted):", score)

# Check against threshold
# NOTE: is_phishing is computed but not used below; the printed verdict
# depends only on is_phishing_final.
is_phishing = score >= THRESHOLD

# Additional verification with YARA rules (plaintext regex search on the email)
yara_matches = [
    rule for rule in rules
    if rule_matches(rule["pattern"], email_text)
]
yara_score = sum(rule["weight"] for rule in yara_matches)

# Combined approach: similarity + YARA rules
final_score = score + yara_score
final_threshold = THRESHOLD + 2.0  # Higher threshold for combined approach

is_phishing_final = final_score >= final_threshold

# Report the individual scores, the thresholds and the final verdict
print("\n--- Results ---")
print(f"Similarity score: {score:.2f}")
print(f"YARA rules matched: {len(yara_matches)} (total weight: {yara_score:.2f})")
print(f"Combined score: {final_score:.2f}")
print(f"Threshold: {THRESHOLD:.2f} (similarity), {final_threshold:.2f} (combined)")
if is_phishing_final:
    print("\nPHISHING!")
    print("Matched YARA rules:")
    for rule in yara_matches:
        print(f"- {rule['pattern']} (weight: {rule['weight']})")
else:
    print("\nEmail appears legitimate")

Similarity Score (Dot Product, decrypted): 36.00000727209425

--- Results ---
Similarity score: 36.00
YARA rules matched: 1 (total weight: 1.50)
Combined score: 37.50
Threshold: 10.00 (similarity), 12.00 (combined)

PHISHING!
Matched YARA rules:
- /pass(word)?\s*reset/i (weight: 1.5)


In [27]:
# Compare the email with every phrase vector of the generated vocabulary.
# Padding is applied to copies so that re-running this cell never mutates
# `email_vector` or the rows of `vectorized_vocab` defined in earlier cells.
max_len = max([len(email_vector)] + [len(phrase_vec) for phrase_vec in vectorized_vocab])
email_padded = email_vector + [0] * (max_len - len(email_vector))

enc_email = encrypt_vector(email_padded, context)

# Calculate similarity with each phrase vector from the vocab:
scores = []
for phrase_vec in vectorized_vocab:
    vec = phrase_vec + [0] * (max_len - len(phrase_vec))  # padded copy
    enc_phrase = encrypt_vector(vec, context)
    enc_score = enc_email.dot(enc_phrase)
    score = enc_score.decrypt()[0]
    scores.append(score)

# An empty vocabulary (e.g. the LLM returned no usable phrases) contributes
# no similarity instead of crashing max() on an empty list.
max_score = max(scores, default=0.0)

# Check which YARA rules match the email text
yara_matches = [rule for rule in rules if rule_matches(rule["pattern"], email_text)]
yara_score = sum(rule["weight"] for rule in yara_matches)

# Calculate the combined score
final_score = max_score + yara_score

THRESHOLD = 10.0
final_threshold = THRESHOLD + 2.0

is_phishing_final = final_score >= final_threshold

print("\n--- Results ---")
print(f"Max similarity score fraz: {max_score:.2f}")
print(f"YARA rules matched: {len(yara_matches)} (total weight: {yara_score:.2f})")
print(f"Combined score: {final_score:.2f}")
print(f"Threshold: {THRESHOLD:.2f} (similarity), {final_threshold:.2f} (combined)")
if is_phishing_final:
    print("\nPHISHING!")
    print("Matched YARA rules:")
    for rule in yara_matches:
        print(f"- {rule['pattern']} (weight: {rule['weight']})")
else:
    print("\nEmail appears legitimate")


--- Results ---
Max similarity score fraz: 83.00
YARA rules matched: 1 (total weight: 1.50)
Combined score: 84.50
Threshold: 10.00 (similarity), 12.00 (combined)

PHISHING!
Matched YARA rules:
- /pass(word)?\s*reset/i (weight: 1.5)


In [28]:
def vectorize_all_emails(input_dir: str, output_path: str, recursive: bool = True) -> np.ndarray:
    """Vectorize every file under ``input_dir`` and save the result as a compressed .npz.

    Args:
        input_dir: Directory containing one email per file.
        output_path: Destination passed to ``np.savez_compressed``. The array
            is stored as the first positional array, i.e. under ``'arr_0'``.
        recursive: When True (the default, matching the previous behaviour)
            sub-directories are walked as well; when False only the files
            directly inside ``input_dir`` are read.

    Returns:
        Array with one ``vectorize`` row per file, in sorted traversal order.
    """
    input_dir = Path(input_dir)
    vectors: list[np.ndarray] = []
    for dirpath, dirnames, filenames in os.walk(input_dir):
        # Sorting both lists makes the row order reproducible between runs.
        dirnames.sort()
        for filename in sorted(filenames):
            with open(os.path.join(dirpath, filename)) as file:
                vectors.append(np.array(vectorize(file.read())))
        if not recursive:
            break  # the first os.walk entry is input_dir itself

    stacked = np.array(vectors)
    np.savez_compressed(output_path, stacked)  # not too big but readable

    return stacked


In [29]:
def iter_emails(path: str):
    """Yield the rows of the array stored under ``'arr_0'`` in the .npz file at ``path``.

    The archive is opened in a ``with`` block so its file handle is closed
    as soon as the array has been read, before the first row is yielded.
    """
    with np.load(path) as archive:
        array = archive['arr_0']
    for row in array:
        yield row

In [None]:
# Vectorize both sample corpora. Each call returns the stacked vectors and
# also writes them to `output_path`; the file is loaded again later with an
# explicit ".npz" suffix.
phishing = vectorize_all_emails(
    input_dir="samples/phishing_mails",
    output_path="./samples/phishing_mails_vectorized"
)

regular = vectorize_all_emails(
    input_dir="./samples/regular_mails",
    output_path="./samples/regular_mails_vectorized"
)

In [None]:
# iter_emails is a generator function, so `mail_vectors` can be consumed only
# once; re-run this cell before iterating over it again.
mail_vectors = iter_emails('./samples/phishing_mails_vectorized.npz')

In [None]:
for mail_vector in mail_vectors:
    # TODO: process each vectorized mail (one row of the saved array)
    ...

[67 14  9 ... 16 35 21]
[ 6  5  2 ...  5 11 10]
[26 29  7 ... 11 25 38]
[ 9 13  8 ... 17  3  1]
[96 49 88 ... 42 95 77]
[8 6 2 ... 4 8 7]
[ 9  6  5 ...  7 50  7]
[10  7  2 ...  4 47  8]
[68 81 45 ... 41 16 13]
[3 6 3 ... 4 3 2]
[32 23 20 ... 12 16 13]
[22 27  6 ... 11 23 36]
[10  2  9 ...  4  9  4]
[11 10  5 ...  4  7  2]
[ 8  9  7 ...  7 10  3]
[ 5  7  3 ... 11 14  6]
[17 30 22 ... 24 19  5]
[12 24 11 ...  6  6  4]
[108 135 121 ... 173 123  84]
[11 14  7 ... 12 18 10]
[2 4 7 ... 4 5 3]
[ 6 14  7 ... 10  8  6]
[ 4  4  5 ... 10  3  1]
[ 7  7 10 ...  9 17  6]
[21 30 13 ... 13 20  8]
[17 17 22 ... 22 22 16]
[25 23 21 ... 30 26 16]
[7 7 3 ... 5 9 2]
[254 287 251 ... 220 270 238]
[174 161 227 ... 184 473 245]
[5 4 6 ... 7 4 7]
[8 6 8 ... 5 8 1]
[12 25  8 ...  4  6  3]
[ 7  7 12 ...  9  7  4]
[ 8  8 12 ...  4  3  4]
[ 87 131 102 ...  82 208 143]
[ 4 11 12 ...  5  7  5]
[ 9 14  4 ...  7  7  3]
[16  6  4 ...  5  6  2]
[13  5  3 ...  5  6  2]
[ 9  4 44 ... 24 63  6]
[3 3 4 ... 5 4 1]
[14 31 32 