### Imports

In [None]:
import os
import sys

# Make the project's src/ directory importable. The relative path is resolved
# against the current working directory. Guarded so that re-running this cell
# does not keep appending the same entry to sys.path.
_src_dir = os.path.abspath('../../src')
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

from helper_functions.path_resolver import DynamicPathResolver
# NOTE(review): star import. Names used further down without an explicit import
# (torch, DataCollatorWithPadding, confusion_matrix, visual, load_data, ...)
# presumably all come from here — consider importing them explicitly.
from model_training.bert import *

### Paths

In [None]:
# Project path resolver, anchored on the "README.md" marker
# (presumably the file that identifies the repository root — confirm in DynamicPathResolver).
dpr = DynamicPathResolver(marker="README.md")

# Preprocessed BERT datasets.
_bert_data_paths = dpr.path.data.preprocessed.data_bert
train_csv = _bert_data_paths.own_train_base_bert_csv
test_csv = _bert_data_paths.english_curated_test_bert_csv  # rebound by the "Eval" config cell
verification_csv = _bert_data_paths.english_curated_verification_bert_csv

# Model artefacts: root folder, training results and training logs.
_bert_model_paths = dpr.path.models.bert
models_folder = _bert_model_paths._path
output_dir = _bert_model_paths.results._path
log_dir = _bert_model_paths._logs._path

### Config

##### Train

In [None]:
# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Settings shared by the cells below:
#   data_amount   -> forwarded to load_data
#   max_len       -> forwarded to create_custom_datasets and inference
#   batch_size    -> forwarded to evaluate_model
#   learning_rate -> used for the AdamW optimizer
# The remaining keys are handed to train_model as part of `config`
# (presumably mapped onto the trainer's arguments — confirm in model_training.bert).
config = dict(
    data_amount=5774,
    max_len=256,
    batch_size=64,
    num_epochs=10,
    learning_rate=2e-05,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    warmup_steps=500,
    early_stopping_patience=3,
)

# Placeholder tokens registered with the tokenizer in create_model_and_tokenizer.
special_tokens = ["[EMAIL]", "[URL]"]

# Base checkpoint; alternative: 'bert-base-multilingual-cased'.
# The "Eval" config cell rebinds this according to model_lang / use_own.
bert_type = 'bert-base-german-cased'

##### Eval

In [None]:
# Evaluation switches consumed by the next cell.
use_test_set = True    # True -> test split, False -> verification split
use_own = False        # True -> force the 'bert_german_own' model and the own test set
model_lang = 'german'  # 'german' | 'multi'; any other value selects the English setup

In [None]:
# Resolve what gets evaluated from the switches above.
# Each preset is:
#   (bert_type, model_name,
#    (results folder for test split, results folder for verification split),
#    (CSV attribute for test split,  CSV attribute for verification split))
# CSV attributes are kept as names and looked up afterwards, so only the
# selected path is ever resolved.
_eval_presets = {
    "german": (
        'bert-base-german-cased',
        'bert_german_curated',
        # The German test split currently points at the own test set
        # (alternative: 'german_curated_test' / german_curated_test_bert_csv).
        ('german_own_test', 'german_curated_verification'),
        ('own_test_base_bert_csv', 'german_curated_verification_bert_csv'),
    ),
    "multi": (
        'bert-base-multilingual-cased',
        'bert_multilingual_curated',
        ('multilingual_curated_test', 'multilingual_curated_verification'),
        ('multilingual_curated_test_bert_csv', 'multilingual_curated_verification_bert_csv'),
    ),
}

# Any model_lang that is not listed above falls back to the English setup.
_english_preset = (
    'bert-base-cased',
    'bert_english_curated',
    ('english_curated_test', 'english_curated_verification'),
    ('english_curated_test_bert_csv', 'english_curated_verification_bert_csv'),
)

_split_index = 0 if use_test_set else 1
bert_type, model_name, _result_dirs, _csv_attrs = _eval_presets.get(model_lang, _english_preset)
test_set_dir = _result_dirs[_split_index]
test_csv = getattr(dpr.path.data.preprocessed.data_bert, _csv_attrs[_split_index])

# use_own overrides everything chosen above, regardless of model_lang / use_test_set.
if use_own:
    bert_type, model_name, test_set_dir = 'bert-base-german-cased', 'bert_german_own', 'german_own_test'
    test_csv = dpr.path.data.preprocessed.data_bert.own_test_base_bert_csv

### Prepare data

##### Get model, tokenizer, optimizer

In [None]:
# Build the model and tokenizer for the selected base checkpoint, registering the special tokens.
model, tokenizer = create_model_and_tokenizer(bert_type, special_tokens, device)

# Collator built on the tokenizer above; later handed to evaluate_model.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# AdamW with the configured learning rate; betas/eps are spelled out explicitly.
# NOTE(review): neither `optimizer` nor `scheduler` is passed to train_model in the
# training cell of this notebook — confirm whether they are still needed.
_adamw_kwargs = {
    "lr": config['learning_rate'],
    "betas": (0.9, 0.999),
    "eps": 1e-08,
}
optimizer = torch.optim.AdamW(model.parameters(), **_adamw_kwargs)

scheduler = None

##### Load Train data

In [None]:
# Load the training CSV and the test CSV. On a top-to-bottom run, test_csv is the
# path chosen by the "Eval" config cell, not the one from the "Paths" cell.
# config['data_amount'] is forwarded to load_data (presumably a cap on the number
# of rows — confirm in model_training.bert).
train_eval_data, test_data = load_data(train_csv, test_csv, config['data_amount'])
# Split the loaded training data into train and eval parts (eval_size=0.2).
train_data, eval_data = split_data(train_eval_data, eval_size=0.2)
#train_data.head(5)

In [None]:
# Wrap the three splits in dataset objects, passing the tokenizer created above
# and config['max_len'] (presumably the maximum sequence length — confirm in
# create_custom_datasets).
train_dataset, eval_dataset, test_dataset = create_custom_datasets(
    train_data, eval_data, test_data, tokenizer, config['max_len']
)

### Verify preprocessing

In [None]:
# Print the encoded fields of the first five training examples for a visual sanity check.
_sample_fields = (
    ("Input IDs:", "input_ids"),
    ("Attention Mask:", "attention_mask"),
    ("Label:", "labels"),
)

for idx in range(5):
    sample = train_dataset[idx]
    print(f"Sample {idx}:")
    for caption, field in _sample_fields:
        print(caption, sample[field])
    print("-" * 40)

In [None]:
# Print the input_ids / attention_mask lengths and the label of the first five training examples.
_length_fields = (
    ("Input IDs length:", 'input_ids'),
    ("Attention Mask length:", 'attention_mask'),
)

for idx in range(5):
    print(f"Sample {idx}:")
    # The dataset is indexed once per printed field, as before.
    for caption, field in _length_fields:
        print(caption, len(train_dataset[idx][field]))
    print("Label:", train_dataset[idx]['labels'])
    print("-" * 40)


### Train model

In [None]:
# Switch for the training cell below: train_model is called only when this is True.
train_bert = False

In [None]:
# Fine-tune only when train_bert is enabled. On a top-to-bottom run, output_dir and
# log_dir still hold the values from the "Paths" cell here; the evaluation section
# further down rebinds output_dir to a model-specific folder.
if train_bert:
    trainer = train_model(model, tokenizer, train_dataset, eval_dataset, config, output_dir, log_dir)

### Evaluate model

In [None]:
# NOTE(review): this rebinds output_dir (first set in the "Paths" cell) to
# <models_folder>/<model_name>/results. Any earlier cell re-run after this point
# sees the new value — a separate name would avoid that hidden state.
output_dir = os.path.join(models_folder, model_name, 'results')
# Per-test-set sub-folder that receives the evaluation plots and reports.
eval_results_dir = os.path.join(output_dir, test_set_dir) 
# Create the folder (and missing parents); no error if it already exists.
os.makedirs(eval_results_dir, exist_ok=True)  

In [None]:
# Echo the resolved evaluation setup so it can be checked before the expensive cells run.
_eval_summary = [
    f"Using model: {model_name}",
    f"Using BERT type: {bert_type}",
    f"Using set: {test_set_dir}",
    f"Using CSV: {test_csv}",
    f"Evaluation results saved in: {eval_results_dir}",
]
print("\n".join(_eval_summary))

In [None]:
# Identifier of the saved checkpoint to evaluate, passed to load_model_from_checkpoint
# (presumably the step number of a checkpoint folder under output_dir — confirm).
checkpoint_step = '2500'

# Rebinds model and tokenizer to the ones restored from the checkpoint.
# NOTE(review): test_dataset and data_collator were built with the tokenizer created
# in "Prepare data" — confirm that it matches the checkpoint's tokenizer.
model, tokenizer = load_model_from_checkpoint(output_dir, checkpoint_step, device)

In [None]:
# Run the loaded model over test_dataset in batches of config['batch_size'].
# The three returned values feed the confusion matrix, ROC, precision-recall and
# classification-report cells below.
true_labels, predicted_labels, probs = evaluate_model(
    model, 
    test_dataset, 
    data_collator, 
    config['batch_size'], 
    device
    )

### Visualize results

##### Extract metrics

In [None]:
# Read the logged metrics found under log_dir, then reduce them to the
# loss/accuracy series used by the plots below.
# NOTE(review): log_dir comes from the "Paths" cell and does not depend on
# model_name — confirm these logs belong to the model evaluated above.
metrics = visual.extract_all_metrics_from_events(log_dir)
extracted_metrics = visual.extract_loss_and_accuracy_metrics(metrics)

In [None]:
# Plot a single logged scalar from log_dir, selected by its tag.
scalar_tag = "train/learning_rate"
visual.plot_scalar_metric(log_dir, scalar_tag)

##### Loss x Epochs & Accuracy x Epochs

In [None]:
# Plot loss and accuracy from the extracted metrics; output_dir is passed as the
# second argument (presumably where the figure is saved — confirm in the visual module).
visual.plot_loss_accuracy(extracted_metrics, output_dir)

In [None]:
# Root folder that holds one sub-folder per trained model.
base_log_dir = dpr.path.models.bert._path

# Display name -> model sub-folder for every run included in the comparison plot.
model_folders = dict([
    ("BERT English Curated", "bert_english_curated"),
    ("BERT German Curated", "bert_german_curated"),
    ("BERT Multilingual Curated", "bert_multilingual_curated"),
    # ("BERT German Own", "bert_german_own"),  # currently excluded from the comparison
])

visual.plot_multiple_loss_accuracy(base_log_dir, model_folders, output_dir)


##### Confusion Matrix

In [None]:
# Confusion matrix of the evaluation run (true labels first, predictions second).
cm = confusion_matrix(true_labels, predicted_labels)
# NOTE(review): the third argument is the fixed string "test" even when
# use_test_set is False — confirm what plot_confusion_matrix uses it for.
visual.plot_confusion_matrix(cm, eval_results_dir, "test")

##### ROC

In [None]:
# ROC curve from the true labels and the probabilities returned by evaluate_model;
# eval_results_dir is passed as the third argument (presumably the save location — confirm).
visual.plot_roc_curve(true_labels, probs, eval_results_dir)

##### Precision - Recall

In [None]:
# Precision-recall curve from the true labels and the probabilities returned by evaluate_model;
# eval_results_dir is passed as the third argument (presumably the save location — confirm).
visual.plot_precision_recall(true_labels, probs, eval_results_dir)

##### Classification Report

In [None]:
# Classification report for the evaluation run, stored next to the other evaluation artefacts.
report_labels = ["Legitimate", "Phishing"]  # label 0, label 1
visual.display_classification_report(
    true_labels,
    predicted_labels,
    target_names=report_labels,
    save_path=eval_results_dir,
)

### Inference on samples

##### English Mails

In [None]:
# Hand-written English mails as (label, "subject [SEP] body") pairs.
# Label 0 = legitimate, 1 = phishing; legitimate and phishing variants alternate per scenario.
_english_samples = [
    # Order confirmation with personal address
    (0, "Order Confirmation: Your order has been placed! [SEP] Dear Emily, your order #56789 has been successfully placed. You can track your shipment using tracking ID 654321. Thank you for shopping with us!"),
    # Generic addressing with a suspicious tracking link
    (1, "Urgent: Delivery Issue with Your Package! [SEP] Dear Customer, your package is on hold due to incorrect details. Please update your address immediately here: http://track-your-order.secure-link.com."),

    # Bank account update with secure login instructions
    (0, "Your Monthly Bank Statement is Ready [SEP] Dear Emily, your monthly bank statement is ready for review. Please log in to your account at our official website to view the details."),
    # Fake bank alert with urgency and login link
    (1, "Security Alert: Account Flagged! [SEP] Dear user, your bank account has been flagged for suspicious activity! Immediate action required! Log in now to verify: http://securebank-login.com."),

    # Company HR confirmation for an interview
    (0, "Your Interview at XYZ Corp [SEP] Hello Emily, we are pleased to inform you that you have been shortlisted for the marketing role at XYZ Corp. Your interview is scheduled for Monday."),
    # Fake job offer requesting sensitive information
    (1, "Exclusive Remote Job Offer – Immediate Start! [SEP] Dear Candidate, congratulations! You have been selected for a high-paying remote job. Please provide your social security number to complete your application."),

    # Subscription renewal notice with proper details
    (0, "Reminder: Your Subscription Renewal [SEP] Dear Emily, your annual subscription for Premium Streaming Service is about to expire. Please renew before 10/12/2024 to continue enjoying our services."),
    # Fake subscription renewal with urgency and fake login link
    (1, "Final Notice: Subscription Expiring Today! [SEP] Dear user, your subscription is expiring today! Renew now to avoid losing access. Click here to update your payment details: http://renewal-fastpay.com."),

    # Real promotional email with a verified discount code
    (0, "Enjoy 20% Off on Your Next Purchase! [SEP] Happy Holidays, Emily! Enjoy a 20% discount on your next purchase with us. Use code HOLIDAY20 at checkout. Offer valid until 01/05/2024."),
    # Fake giveaway requiring personal details
    (1, "You've Won a Free iPhone 15! [SEP] Congratulations! You have won a free iPhone 15! Claim your prize now by providing your name, address, and payment for shipping at http://win-free-gift.com."),
]

# Split the pairs into the two parallel lists used by the cells below.
s_texts = [text for _, text in _english_samples]
s_true_labels = [label for label, _ in _english_samples]

##### German Mails

In [None]:
# Hand-written German mails ("subject [SEP] body"); label 0 = legitimate, 1 = phishing.
german_texts = [
    # Legitimate (0) - order confirmation with personal salutation
    "Bestellbestätigung: Ihre Bestellung wurde bearbeitet! [SEP] Hallo Julia, Ihre Bestellung #98765 wurde erfolgreich bearbeitet. Die Sendungsnummer lautet 112233. Vielen Dank für Ihren Einkauf!",

    # Phishing (1) - generic salutation with an urgent call to action
    "Dringend: Problem mit Ihrer Lieferung! [SEP] Sehr geehrter Kunde, Ihr Paket konnte nicht zugestellt werden. Aktualisieren Sie Ihre Adresse jetzt hier: http://paket-tracking.de-verify.com.",

    # Legitimate (0) - bank notification with secure login
    "Ihr Kontoauszug für diesen Monat [SEP] Liebe Julia, Ihr Kontoauszug für diesen Monat ist nun verfügbar. Loggen Sie sich sicher über unsere offizielle Website ein, um ihn anzusehen.",

    # Phishing (1) - fake bank warning with a forged login link
    "Sicherheitswarnung: Ihr Konto wurde gesperrt! [SEP] Sehr geehrter Nutzer, Ihr Bankkonto wurde gesperrt! Loggen Sie sich sofort ein, um Ihr Konto zu entsperren: http://sicherebank-login.de.",

    # Legitimate (0) - invitation to a job interview
    "Einladung zum Vorstellungsgespräch [SEP] Hallo Julia, wir freuen uns, Ihnen mitzuteilen, dass Sie für ein Vorstellungsgespräch bei ABC GmbH ausgewählt wurden. Termin: Montag, 10 Uhr.",

    # Phishing (1) - supposedly well-paid job asking for personal data
    "Exklusiver Job für Sie – Handeln Sie schnell! [SEP] Herzlichen Glückwunsch! Sie wurden für eine exklusive Heimarbeitsstelle ausgewählt. Bitte senden Sie uns Ihre persönlichen Daten zur Anmeldung.",

    # Legitimate (0) - subscription reminder with proper details
    "Erinnerung: Ihr Abonnement läuft bald aus [SEP] Hallo Julia, Ihr Premium-Abo läuft bald ab. Verlängern Sie es bis zum 15.12.2024, um weiterhin unbegrenzten Zugang zu genießen.",

    # Phishing (1) - forged subscription renewal with urgency
    "Letzte Chance: Abo-Verlängerung erforderlich! [SEP] Achtung! Ihr Abonnement läuft heute aus. Verlängern Sie es jetzt, um weiterhin Zugriff zu haben: http://abo-verlängerung-jetzt.com.",

    # Legitimate (0) - genuine promotion with a discount code
    "15% Rabatt für Sie – Jetzt sichern! [SEP] Hallo Julia, nutzen Sie unseren exklusiven Sommer-Rabatt! Sparen Sie 15% mit dem Code SOMMER15. Gültig bis 30.06.2024.",

    # Phishing (1) - fraudulent prize draw asking for personal data
    "Sie haben eine Traumreise gewonnen! [SEP] Herzlichen Glückwunsch! Sie haben eine Traumreise gewonnen! Bestätigen Sie Ihre Teilnahme, indem Sie Ihre persönlichen Daten eingeben: http://gratis-reise.com."
]

german_labels = [
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1
]

# Append the German samples to the lists started in the English cell, but only once:
# an unguarded `+=` would duplicate the samples every time this cell is re-run.
# Texts and labels are extended together so the two lists stay aligned.
if s_texts[-len(german_texts):] != german_texts:
    s_texts += german_texts
    s_true_labels += german_labels

##### Predict samples

In [None]:
# Run the loaded model on the hand-written samples (truncation/padding length from config['max_len']).
# NOTE(review): s_true_labels is rebound to the value returned by inference(), so
# re-running this cell feeds that returned object back in as input — confirm that
# inference() accepts its own output type, or keep the original list under a separate name.
s_true_labels, s_predicted_labels, s_probs = inference(
    model, s_texts, s_true_labels, tokenizer,  config['max_len'], device
)

In [None]:
# Show each sample with its true label, predicted label and probabilities;
# class_names gives the display names for labels 0 and 1.
visual.display_inference_results(s_texts, s_true_labels, s_predicted_labels, s_probs, class_names=["Legitimate", "Phishing"])

##### Classification report

In [None]:
# Classification report for the hand-written samples. No save_path is passed here,
# unlike the report for the full evaluation set.
visual.display_classification_report(s_true_labels, s_predicted_labels, target_names=["Legitimate", "Phishing"])