### Imports

In [9]:
import pandas as pd
import os
import langdetect
import numpy as np
from deep_translator import GoogleTranslator

In [10]:
import sys
sys.path.append(os.path.abspath('../../src'))
from helper_functions.path_resolver import DynamicPathResolver

### Paths

In [11]:
# Project path resolver. NOTE(review): presumably it locates the project root by
# searching for the README.md marker and prints the "Project Root" line shown
# below this cell -- confirm in helper_functions.path_resolver.
dpr = DynamicPathResolver(marker="README.md")

# Directory of the raw mail data; it is also passed as the output directory for
# the generated train/test CSVs and read back by the verification cell.
data_mail_dir = dpr.path.data.raw.data_mail._path

# Source CSVs that are pooled to build the test sets.
test_paths = [
    dpr.path.data.raw.data_mail.own.mails_labeled_csv,
    dpr.path.data.raw.data_mail.own.jannis_mail_csv,
    dpr.path.data.raw.data_mail.curated.Nazario_5_csv,
    dpr.path.data.raw.data_mail.curated.SpamAssasin_csv
]

# Source CSVs that are pooled to build the training set.
train_paths = [
    dpr.path.data.raw.data_mail.curated.CEAS_08_csv,
    dpr.path.data.raw.data_mail.curated.TREC_07_csv
]

Project Root: c:\Users\ilian\Documents\Projects\git_projects\university\phishing_detection


### Data amounts

In [3]:
# Total number of rows requested for the balanced training set.
train_size = 20_000
# Total number of rows requested for each generated test set.
test_size = 4_000

### Functions

In [4]:
def read_dataset(path):
    """Load the CSV at ``path`` with pandas' default settings and return the DataFrame."""
    frame = pd.read_csv(path)
    return frame

def detect_language(text):
    """Return the language code langdetect assigns to ``text``.

    The input is coerced with ``str()`` first, so non-string cells (numbers,
    NaN) are accepted. If detection raises for any reason the sentinel
    ``"unknown"`` is returned instead of propagating the error.
    """
    # NOTE(review): langdetect's README states that detection is
    # non-deterministic unless DetectorFactory.seed is set -- consider seeding
    # it once after the imports so regenerated datasets are reproducible.
    try:
        return langdetect.detect(str(text))
    except Exception:
        # Best-effort by design: one undetectable mail must not abort the run.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt
        # working while this is applied to tens of thousands of rows.
        return "unknown"

def add_language_column(df, text_col):
    """Add a "language" column holding the detected language of ``df[text_col]``.

    The column is written onto ``df`` itself (the caller's frame is modified)
    and the same frame is returned.
    """
    detected = df[text_col].apply(detect_language)
    df["language"] = detected
    return df

def translate_to_de(text):
    """Translate ``text`` from English to German with GoogleTranslator.

    Returns the translated text. If the translation raises for any reason,
    the original ``text`` is returned unchanged, so callers always get a value
    back -- but a failed translation is not distinguishable from a success.
    """
    try:
        return GoogleTranslator(source="en", target="de").translate(text)
    except Exception:
        # Best-effort fallback. `Exception` (not a bare except) so that a
        # keyboard/kernel interrupt can still stop a long translation run.
        return text

def remove_duplicates(df):
    """Return ``df`` without rows repeating an earlier (subject, body) pair.

    The first occurrence of each pair is kept; the input frame is not modified.
    """
    key_columns = ["subject", "body"]
    deduplicated = df.drop_duplicates(subset=key_columns, keep="first")
    return deduplicated


In [5]:
def sample_balanced(df, lang, needed_legit, needed_phish):
    """Sample rows of one language with a fixed quota per class.

    From the rows whose "language" equals ``lang``, draws up to
    ``needed_legit`` rows with label 0 and up to ``needed_phish`` rows with
    label 1 (fewer if the pool is smaller), using a fixed random_state of 42.
    Returns the label-0 sample followed by the label-1 sample with a fresh index.
    """
    in_language = df["language"] == lang
    samples = []
    for label_value, wanted in ((0, needed_legit), (1, needed_phish)):
        pool = df[in_language & (df["label"] == label_value)]
        # Never request more rows than the pool holds; sample() would raise.
        samples.append(pool.sample(n=min(wanted, len(pool)), random_state=42))
    return pd.concat(samples, ignore_index=True)

In [6]:
def ensure_german(df, needed_de_legit, needed_de_phish):
    """Top up the German rows of ``df`` by translating English rows.

    For label 0 (legit) and label 1 (phish) in turn: if ``df`` holds fewer
    rows with language "de" than requested, that many English rows of the same
    label (or all of them, if fewer exist) are sampled with random_state 42,
    their "subject" and "body" are passed through ``translate_to_de``, their
    language is set to "de", and they are appended. The English originals stay
    in the frame. Returns the (possibly extended) frame; ``df`` is returned
    as-is when nothing is missing.
    """
    for label_value, needed in ((0, needed_de_legit), (1, needed_de_phish)):
        has_label = df["label"] == label_value
        shortfall = needed - len(df[(df["language"] == "de") & has_label])
        if shortfall <= 0:
            continue

        english_pool = df[(df["language"] == "en") & has_label]
        translated = english_pool.sample(
            n=min(shortfall, len(english_pool)), random_state=42
        ).copy()
        for column in ("subject", "body"):
            translated[column] = translated[column].apply(translate_to_de)
        translated["language"] = "de"
        df = pd.concat([df, translated], ignore_index=True)
    return df

### Assemble Test

In [8]:
def build_test_sets(test_paths, out_dir, total_size=4000):
    """Build the English, German and mixed test sets and write them as CSV.

    All CSVs in ``test_paths`` are loaded, restricted to labels 0/1, pooled,
    de-duplicated on (subject, body), and tagged with the language detected
    from "body". Three files are then written to ``out_dir`` (created if
    missing):

    * test_raw_en.csv    -- English rows, up to total_size // 2 per class
    * test_raw_de.csv    -- German rows, up to total_size // 2 per class
    * test_raw_en_de.csv -- up to total_size // 4 rows per (class, language)

    German rows that are missing are produced by ``ensure_german``.
    """
    # Load every source, keeping only rows labelled 0 or 1.
    frames = []
    for csv_path in test_paths:
        raw = read_dataset(csv_path)
        frames.append(raw[raw["label"].isin([0, 1])])

    pool = remove_duplicates(pd.concat(frames, ignore_index=True))
    pool = pool.fillna({"subject": "", "body": ""})
    pool = add_language_column(pool, "body")

    # Single-language sets: half legit, half phish.
    per_class = total_size // 2

    # English-only test set
    english_only = sample_balanced(pool, "en", per_class, per_class)

    # German-only test set (translate English rows where German ones are missing)
    pool = ensure_german(pool, per_class, per_class)
    german_only = sample_balanced(pool, "de", per_class, per_class)

    # Mixed set: one quarter per (class, language) combination.
    per_cell = (total_size // 2) // 2

    mixed_english = sample_balanced(pool, "en", per_cell, per_cell)

    pool = ensure_german(pool, per_cell, per_cell)
    mixed_german = sample_balanced(pool, "de", per_cell, per_cell)

    mixed = pd.concat([mixed_english, mixed_german], ignore_index=True)

    os.makedirs(out_dir, exist_ok=True)
    outputs = (
        ("test_raw_en.csv", english_only),
        ("test_raw_de.csv", german_only),
        ("test_raw_en_de.csv", mixed),
    )
    for file_name, frame in outputs:
        frame.to_csv(os.path.join(out_dir, file_name), index=False)

In [10]:
# Write the three test CSVs into the raw mail data directory.
build_test_sets(test_paths=test_paths, out_dir=data_mail_dir, total_size=test_size)

### Assemble Train

In [11]:
def build_train_sets(train_paths, out_dir, total_size=20000):
    """Build the balanced EN/DE training set and write it as CSV.

    All CSVs in ``train_paths`` are loaded, restricted to labels 0/1, pooled,
    de-duplicated on (subject, body), and tagged with the language detected
    from "body". The result holds up to ``total_size // 4`` rows for each
    (class, language) combination -- legit/EN, phish/EN, legit/DE, phish/DE,
    in that order -- and is written to ``out_dir``/train_raw_balanced.csv
    (``out_dir`` is created if missing). A combination ends up smaller when the
    pool does not contain enough rows for it.
    """
    frames = []
    for path in train_paths:
        df = read_dataset(path)
        frames.append(df[df["label"].isin([0, 1])])

    combined = remove_duplicates(pd.concat(frames, ignore_index=True))
    combined = combined.fillna({"subject": "", "body": ""})
    combined = add_language_column(combined, "body")

    # Split in Legit / Phish, then again in EN / DE.
    half = total_size // 2
    quarter = half // 2

    # Only `quarter` German rows per class are sampled below, so that is all
    # that needs to exist. Requesting `half` would translate twice as many
    # mails and then throw the surplus away.
    combined = ensure_german(combined, quarter, quarter)

    # sample_balanced returns the legit sample followed by the phish sample,
    # so one call per language yields the four cells in the intended order.
    en_part = sample_balanced(combined, "en", quarter, quarter)
    de_part = sample_balanced(combined, "de", quarter, quarter)

    train_final = pd.concat([en_part, de_part], ignore_index=True)
    os.makedirs(out_dir, exist_ok=True)
    train_final.to_csv(os.path.join(out_dir, "train_raw_balanced.csv"), index=False)


In [13]:
# Write the balanced training CSV into the raw mail data directory.
build_train_sets(train_paths=train_paths, out_dir=data_mail_dir, total_size=train_size)

### Verify Distribution

In [14]:
def verify(df, name):
    """Print a short distribution report for ``df`` under the heading ``name``.

    Reports the row count, the counts per "label", the counts per "language",
    and the counts per (label, language) combination. Returns nothing.
    """
    divider = "----------------------------------------"

    print(f"\n{name}, Rows: {len(df)}")
    print(divider)

    per_label = df["label"].value_counts().to_dict()
    per_language = df["language"].value_counts().to_dict()
    per_label_and_language = df.groupby(["label", "language"]).size().to_dict()

    print(f"Class Distribution: {per_label}")
    print(f"Language Distribution: {per_language}")
    print(f"Detailed (Class, Language) Distribution: {per_label_and_language}")
    print(divider)


In [15]:
def _load_and_verify(file_name, title):
    """Read one generated CSV from data_mail_dir, print its report, return the frame."""
    frame = pd.read_csv(os.path.join(data_mail_dir, file_name))
    verify(frame, title)
    return frame


# Re-read every generated file from disk so the report reflects what was written.
df_train = _load_and_verify("train_raw_balanced.csv", "Train Balanced")
df_test_en = _load_and_verify("test_raw_en.csv", "Test EN")
df_test_de = _load_and_verify("test_raw_de.csv", "Test DE")
df_test_mixed = _load_and_verify("test_raw_en_de.csv", "Test Mixed")


Train Balanced, Rows: 20000
----------------------------------------
Class Distribution: {0: 10000, 1: 10000}
Language Distribution: {'en': 10000, 'de': 10000}
Detailed (Class, Language) Distribution: {(0, 'de'): 5000, (0, 'en'): 5000, (1, 'de'): 5000, (1, 'en'): 5000}
----------------------------------------

Test EN, Rows: 4000
----------------------------------------
Class Distribution: {0: 2000, 1: 2000}
Language Distribution: {'en': 4000}
Detailed (Class, Language) Distribution: {(0, 'en'): 2000, (1, 'en'): 2000}
----------------------------------------

Test DE, Rows: 4000
----------------------------------------
Class Distribution: {0: 2000, 1: 2000}
Language Distribution: {'de': 4000}
Detailed (Class, Language) Distribution: {(0, 'de'): 2000, (1, 'de'): 2000}
----------------------------------------

Test Mixed, Rows: 4000
----------------------------------------
Class Distribution: {0: 2000, 1: 2000}
Language Distribution: {'en': 2000, 'de': 2000}
Detailed (Class, Language) D