In [None]:
!pip install tenseal
!pip install transformers accelerate torch
!pip install -q --upgrade openai









In [None]:
from typing import List, Dict, Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import json
import openai
from openai import OpenAI
import re
import hashlib

def hash_ngram(ngram, buckets=1024):
    return int(hashlib.sha256(ngram.encode()).hexdigest(), 16) % buckets

def vectorize(text, n=4, buckets=1024):
    vec = [0] * buckets
    text = text.lower()
    for i in range(len(text) - n + 1):
        idx = hash_ngram(text[i:i+n], buckets)
        vec[idx] += 1
    return vec

# Connect to OpenRouter using their API key and base URL
client = OpenAI(
    api_key="...",
    base_url="https://openrouter.ai/api/v1"
)

# 1. Helper: LLM phrase generation from regex
def generate_phrases_from_regex(regex: str, max_phrases=10) -> List[str]:
    prompt = f"""You are a cybersecurity expert. Given the following regex from a phishing YARA rule:

{regex}

List {max_phrases} natural phrases or sentences that could appear in a phishing email and match the intent of this regex.
Just list each phrase on a new line without explanations."""

    response = client.chat.completions.create(
        model="mistralai/mistral-7b-instruct",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        max_tokens=300
    )

    raw_text = response.choices[0].message.content

    phrases = [
        re.sub(r"^\s*[\d]+[\.\)\-]*\s*", "", line).strip(" -•\n").lstrip("\"")
        for line in raw_text.split("\n") if line.strip()
    ]
    return phrases

# 2. Helper: extract n-grams (3-5 words) from phrases
def extract_ngrams(text: str, min_n=3, max_n=5) -> List[str]:
    words = text.lower().split()
    return [
        " ".join(words[i:i+n])
        for n in range(min_n, max_n + 1)
        for i in range(len(words) - n + 1)
    ]

# 3. Convert YARA regexes to vectorizer and feature space
def vectorize_yara_phrases(rules: List[Dict[str, object]]) -> Tuple[TfidfVectorizer, List[str]]:
    all_phrases = []
    for rule in rules:
        pattern = rule["pattern"]
        phrases = generate_phrases_from_regex(pattern)
        all_phrases.extend(phrases)

    # Convert to 3–5 word n-grams
    ngrams = set()
    for phrase in all_phrases:
        ngrams.update(extract_ngrams(phrase, 3, 5))

    sorted_vocab = sorted(ngrams)
    vectorized_vocab = [vectorize(phrase) for phrase in sorted_vocab]

    return vectorized_vocab, sorted_vocab

def generate_regexs_from_rule_text(rule_text: str, max_regex=3) -> List[Dict[str, str]]:
 
    prompt = f"""You are a cybersecurity expert. Here is a YARA rule:
    {rule_text}

    Your task is to extract up to {max_regex} regular expressions that are explicitly written or clearly implied by the rule.

    For each regex, estimate the "weight" field (as a float) to reflect how strongly the pattern indicates phishing (the higher the weight, the more suspicious/phishy the pattern is). Assign higher weights to patterns that are more typical for phishing, and lower weights to more generic or less suspicious patterns.

    Return the result as a valid JSON list of dictionaries, using the following exact format:

    [
        {{ "pattern": "/regex1/i", "weight": 1.0 }},
        {{ "pattern": "/regex2/i", "weight": 0.7 }},
        ...
    ]

    Rules:
    - Each regex must be enclosed in double quotes.
    - Use the YARA-style `/.../i` syntax for patterns (even if it's implied).
    - Do **not** include raw strings (r"..."), `nocase`, or any extra flags.
    - Do **not** include explanations, markdown, or comments!
    - Return **only** the JSON list. Nothing more.
    - If the rule contains no regex, you may generate up to 3 patterns that match its intent (e.g., suspicious links, domains, common phishing phrases).
    """

    response = client.chat.completions.create(
        model="mistralai/mistral-7b-instruct",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.5,
        max_tokens=300
    )

    raw_output = response.choices[0].message.content

    try:
        return eval(raw_output.strip())  # UWAGA: tylko jeśli masz pełną kontrolę nad odpowiedzią
    except Exception as e:
        print("Błąd parsowania odpowiedzi z LLM:", e)
        print("Odpowiedź:\n", raw_output)
        return []


def generate_regexs_from_folder(folder_path: str, max_regex=3):
    """
    Zwraca listę krotek (nazwa_pliku, lista_reguł)
    """
    all_rule_sets = []

    for filename in os.listdir(folder_path):
        if not filename.endswith(".yar"):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            rule_text = f.read()

        regex_list = generate_regexs_from_rule_text(rule_text, max_regex=max_regex)
        if regex_list:
            all_rule_sets.append((filename, regex_list))
        else:
            print(f"Brak regexów w pliku {filename}")

    return all_rule_sets

In [30]:
import tenseal as ts
import hashlib

# --- Configuration ---
BUCKETS = 1024
NGRAM_SIZE = 4
SCALE = 2 ** 40

# --- Text Hashing Vectorizer ---
def hash_ngram(ngram, buckets=BUCKETS):
    return int(hashlib.sha256(ngram.encode()).hexdigest(), 16) % buckets

def vectorize(text, n=NGRAM_SIZE, buckets=BUCKETS):
    vec = [0] * buckets
    text = text.lower()
    for i in range(len(text) - n + 1):
        idx = hash_ngram(text[i:i+n], buckets)
        vec[idx] += 1
    return vec

# --- CKKS Context ---
def create_ckks_context():
    context = ts.context(
        ts.SCHEME_TYPE.CKKS,
        poly_modulus_degree=8192,
        coeff_mod_bit_sizes=[60, 40, 40, 60]  # Must match scale size
    )
    context.global_scale = SCALE
    context.generate_galois_keys()
    return context

# --- Encryption Helper ---
def encrypt_vector(vector, context):
    return ts.ckks_vector(context, vector)

In [None]:
# yara_rules = [
#     r"/pass(word)?\s*reset/i"]
# vectorized_vocab, sorted_vocab = vectorize_yara_phrases(yara_rules)
# print(sorted_vocab)

In [31]:
# --- YARA rules with weights (example) ---
rules = [
    {"pattern": r"/pass(word)?\s*reset/i", "weight": 1.5},
    {"pattern": r"/account\s+(suspended|locked|restricted)/i", "weight": 1.0},
    {"pattern": r"/verify\s+your\s+identity/i", "weight": 1.2},
    {"pattern": r"/click\s+the\s+link/i", "weight": 0.8},
    {"pattern": r"/secure\s+login/i", "weight": 1.3},
]
vectorized_vocab, sorted_vocab = vectorize_yara_phrases(rules)
print(sorted_vocab)

def rule_matches(pattern: str, text: str) -> bool:
    return re.search(pattern.strip("/i"), text, re.IGNORECASE) is not None

['a security review."', 'access your account', 'access your account details.', 'access your account."', 'access your secure', 'access your secure login', 'access your secure login area"', 'account due to', 'account due to possible', 'account due to possible account', 'account for a', 'account for a security', 'account for a security review."', 'account has been', 'account has been locked', 'account has been locked due', 'account has been locked for', 'account has been restricted', 'account has been restricted and', 'account has been restricted due', 'account has been suspended', 'account has been suspended for', 'account is currently', 'account is currently restricted', 'account is currently restricted due', 'account is temporarily', 'account is temporarily restricted', 'account is temporarily restricted for', 'account security requires', 'account security requires identity', 'account security requires identity verification."', 'account, please verify', 'account, please verify your', '

In [36]:
# -- YARA rules with weights for every file in the "rules" folder ---
rules_sets = generate_regexs_from_folder("rules/")

all_vectorized_vocabs = []  # lista: [vectorized_vocab dla każdego pliku]
all_sorted_vocabs = []      # lista: [sorted_vocab dla każdego pliku]
all_filenames = []          # lista nazw plików 
all_rules = []              # lista reguł 

for filename, rules in rules_sets:
    print(f"\n--- Zestaw reguł z pliku: {filename} ---")
    print("Regexy w zestawie:")
    for rule in rules:
        print(f"  {rule['pattern']}")
    vectorized_vocab, sorted_vocab = vectorize_yara_phrases(rules)
    all_vectorized_vocabs.append(vectorized_vocab)
    all_sorted_vocabs.append(sorted_vocab)
    all_filenames.append(filename)
    all_rules.extend(rules)
    print("Lista fraz:", sorted_vocab)


# -- YARA rules with weights for all files in the "rules" folder ---
global_vectorized_vocab, global_sorted_vocab = vectorize_yara_phrases(all_rules)
print("\n=== GLOBALNY SŁOWNIK FRAZ (ze wszystkich plików) ===")
print(global_sorted_vocab)




--- Zestaw reguł z pliku: complex_html_rule.yar ---
Regexy w zestawie:
  /<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01/
  style=
  <a[^>]+>
Lista fraz: ['a new message', 'a new message waiting.', 'a new message waiting. click', 'a new message,', 'a new message, click', 'a new message, click here', 'a prize! click', 'a prize! click the', 'a prize! click the link', 'access the attached', 'access the attached document:', 'access the attached document: <a[^>]+>', 'access your account', 'access your account details"', 'account by confirming', 'account by confirming your', 'account by confirming your personal', 'account details for', 'account details for safety"', 'account details now"', 'account details using', 'account details using the', 'account details using the link', 'account details: <a[^>]+>', 'account information immediately"', 'account security settings"', 'account will be', 'account will be suspended', 'account will be suspended if', 'account, please log', 'account, please log i

In [45]:
# --- Example of dictionary with regexes ---
print(F"Plik: {rules_sets[1][0]} \nSłownik regexów z wagami: {rules_sets[1][1]}") 

Plik: domains_rule.yar 
Słownik regexów z wagami: [{'pattern': '/\\bchainsmokers-feeling\\.org\\b/i', 'weight': 1.0}, {'pattern': '/\\b(xfund02|smxrayon|circularhub)\\.([a-z]{2,3})/i', 'weight': 0.8}, {'pattern': '/\\b(amazonses|sendgrid|sparkpostmail)\\.com/i', 'weight': 0.6}]


In [None]:
# --- Main Logic ---
context = create_ckks_context()

email_text = """Subject: Immediate Action Required – Password Reset

Dear user,

We have detected suspicious activity on your account and, as a security precaution, your access has been temporarily limited.

To restore access, please follow the link below to initiate a password reset:

👉 https://secure-login-authenticator.com/reset

If you do not reset your password within 24 hours, your account will be permanently locked.

Thank you for your cooperation,
Security Team
"""
yara_rule_text = "password reset"

# Vectorize
email_vector = vectorize(email_text)
yara_vector = vectorize(yara_rule_text)

# Pad to same length if needed (shouldn't be necessary here)
if len(email_vector) != len(yara_vector):
    max_len = max(len(email_vector), len(yara_vector))
    email_vector += [0] * (max_len - len(email_vector))
    yara_vector += [0] * (max_len - len(yara_vector))

# Encrypt both vectors
enc_email = encrypt_vector(email_vector, context)
enc_yara = encrypt_vector(yara_vector, context)

# Compute encrypted dot product
# This works because both vectors have same scale & context
enc_score = enc_email.dot(enc_yara)

# --- Threshold Configuration ---
THRESHOLD = 10.0

# Decrypt the result
score = enc_score.decrypt()[0]
print("Similarity Score (Dot Product, decrypted):", score)

# Check against threshold
is_phishing = score >= THRESHOLD

# Additional verification with YARA rules
yara_matches = [
    rule for rule in rules
    if rule_matches(rule["pattern"], email_text)
]
yara_score = sum(rule["weight"] for rule in yara_matches)

# Combined approach: similarity + YARA rules
final_score = score + yara_score
final_threshold = THRESHOLD + 2.0  # Higher threshold for combined approach

is_phishing_final = final_score >= final_threshold

print("\n--- Results ---")
print(f"Similarity score: {score:.2f}")
print(f"YARA rules matched: {len(yara_matches)} (total weight: {yara_score:.2f})")
print(f"Combined score: {final_score:.2f}")
print(f"Threshold: {THRESHOLD:.2f} (similarity), {final_threshold:.2f} (combined)")
if is_phishing_final:
    print("\nPHISHING!")
    print("Matched YARA rules:")
    for rule in yara_matches:
        print(f"- {rule['pattern']} (weight: {rule['weight']})")
else:
    print("\nEmail appears legitimate")

Similarity Score (Dot Product, decrypted): 36.00000408698871

--- Results ---
Similarity score: 36.00
YARA rules matched: 1 (total weight: 1.50)
Combined score: 37.50
Threshold: 10.00 (similarity), 12.00 (combined)

PHISHING!
Matched YARA rules:
- /pass(word)?\s*reset/i (weight: 1.5)


In [None]:
# Padding
max_len = max(len(email_vector), len(vectorized_vocab[0]))
email_vector += [0] * (max_len - len(email_vector))

enc_email = encrypt_vector(email_vector, context)

# Calculate similarity with each phrase vector from the vocab:
scores = []
for phrase_vec in vectorized_vocab:
    vec = phrase_vec
    vec += [0] * (max_len - len(vec))  # padding
    enc_phrase = encrypt_vector(vec, context)
    enc_score = enc_email.dot(enc_phrase)
    score = enc_score.decrypt()[0]
    scores.append(score)

max_score = max(scores)

# Check which YARA rules match the email text
yara_matches = [rule for rule in rules if rule_matches(rule["pattern"], email_text)]
yara_score = sum(rule["weight"] for rule in yara_matches)

# Calculate the combined score
final_score = max_score + yara_score

THRESHOLD = 10.0
final_threshold = THRESHOLD + 2.0

is_phishing_final = final_score >= final_threshold

print("\n--- Results ---")
print(f"Max similarity score fraz: {max_score:.2f}")
print(f"YARA rules matched: {len(yara_matches)} (total weight: {yara_score:.2f})")
print(f"Combined score: {final_score:.2f}")
print(f"Threshold: {THRESHOLD:.2f} (similarity), {final_threshold:.2f} (combined)")
if is_phishing_final:
    print("\nPHISHING!")
    print("Matched YARA rules:")
    for rule in yara_matches:
        print(f"- {rule['pattern']} (weight: {rule['weight']})")
else:
    print("\nEmail appears legitimate")


--- Results ---
Max similarity score fraz: 74.00
YARA rules matched: 1 (total weight: 1.50)
Combined score: 75.50
Threshold: 10.00 (similarity), 12.00 (combined)

PHISHING!
Matched YARA rules:
- /pass(word)?\s*reset/i (weight: 1.5)


In [1]:
import numpy as np
from pathlib import Path

def vectorize_all_emails(input_dir: str, output_path: str) -> None:

    input_dir = Path(input_dir)
    vectors: Dict[str, np.ndarray] = {}

    for file in input_dir.rglob("*.txt"):
        try:
            text = file.read_text(encoding="utf-8")
            vec = vectorize(text)
            vectors[str(file)] = np.array(vec, dtype=np.float32)
        except Exception as e:
            print(f"Błąd przetwarzania {file}: {e}")

    np.savez_compressed(output_path, **vectors)
    print(f"Vectors saved to: {output_path}")


In [None]:
vectorize_all_emails(
    input_dir="/content/yara_mk/samples/phishing_mails",
    output_path="/content/yara_mk/samples/phishing_mails_vectorized"
)

vectorize_all_emails(
    input_dir="/content/yara_mk/samples/regular_mails",
    output_path="/content/yara_mk/samples/regular_mails_vectorized"
)