### Imports

In [2]:
import sys, os
sys.path.append(os.path.abspath('../../src'))

from helper_functions.path_resolver import DynamicPathResolver
from helper_functions.preparation import *

### Paths

In [3]:
# Build the project path resolver, using "README.md" as its marker.
# NOTE(review): presumably the marker file identifies the project root; the cell
# output shows a "Project Root: ..." line being printed — confirm in path_resolver.
dpr = DynamicPathResolver(marker="README.md")

# Directory that holds the raw mail CSVs read by the cells below.
data_mail_dir = dpr.path.data.raw.data_mail.sets._path
# Directory the BERT-preprocessed CSVs are written to.
data_bert_dir = dpr.path.data.preprocessed.data_bert._path

Project Root: c:\Users\ilian\Documents\Projects\git_projects\university\phishing_bert


### Test Preprocessing

In [None]:
# Raw train/test CSVs used by the preprocessing spot checks in this section.
train_raw, test_raw = (
    os.path.join(data_mail_dir, csv_name)
    for csv_name in ("own_train_base.csv", "own_test_base.csv")
)

In [None]:
# Spot-check the preprocessing on the raw training CSV with n=1.
# NOTE(review): presumably n is the number of sampled rows shown — confirm in
# helper_functions.preparation.
sample_preprocessed_bert_data(train_raw, n=1)

In [None]:
# Same spot check as for the training CSV, this time on the raw test CSV with n=1.
# NOTE(review): presumably n is the number of sampled rows shown — confirm in
# helper_functions.preparation.
sample_preprocessed_bert_data(test_raw, n=1)

##### No subject / body

In [None]:
# Load the raw training CSV and run the text-column processing on it.
df = process_text_columns(read_dataset(train_raw))

# Count the rows whose subject / body equals the respective placeholder token.
no_subject_count = (df["subject"] == "[NO_SUBJECT]").sum()
no_body_count = (df["body"] == "[NO_BODY]").sum()

# One print call, one line per counter (same output as two separate prints).
print(
    f"Rows with [NO_SUBJECT]: {no_subject_count}",
    f"Rows with [NO_BODY]: {no_body_count}",
    sep="\n",
)

### Preprocess

##### Individual

In [4]:
# Input (raw) and output (BERT-preprocessed) CSV paths, grouped per split.
train_raw, train_preprocessed = (
    os.path.join(data_mail_dir, "own_train_base.csv"),
    os.path.join(data_bert_dir, "own_train_base_bert.csv"),
)
test_raw, test_preprocessed = (
    os.path.join(data_mail_dir, "own_test_base.csv"),
    os.path.join(data_bert_dir, "own_test_base_bert.csv"),
)

In [5]:
# Preprocess the raw training CSV and write the result to train_preprocessed.
# Per the cell output, the helper reports character counts, the character
# reduction, and the path the processed file was saved to.
prepare_and_save_bert_data(train_raw, train_preprocessed)

Char Count (Original): 11418008
Char Count (Processed): 7698705
Char Reduction (Overall): 32.57%
Char Reduction (Avg. per row): 15.74%
Processed saved to c:\Users\ilian\Documents\Projects\git_projects\university\phishing_bert\data\preprocessed\data_bert\own_train_base_bert.csv


In [6]:
# Preprocess the raw test CSV and write the result to test_preprocessed.
# Per the cell output, the helper reports character counts, the character
# reduction, and the path the processed file was saved to.
prepare_and_save_bert_data(test_raw, test_preprocessed)

Char Count (Original): 3014960
Char Count (Processed): 2024101
Char Reduction (Overall): 32.86%
Char Reduction (Avg. per row): 15.66%
Processed saved to c:\Users\ilian\Documents\Projects\git_projects\university\phishing_bert\data\preprocessed\data_bert\own_test_base_bert.csv


##### Preprocess All

In [4]:
# Raw and preprocessed train/test CSV paths for each curated dataset,
# keyed by "<language>_curated".
bert_datasets = {
    f"{language}_curated": {
        "train_raw": os.path.join(data_mail_dir, f"{language}_curated_train.csv"),
        "test_raw": os.path.join(data_mail_dir, f"{language}_curated_test.csv"),
        "train_preprocessed": os.path.join(data_bert_dir, f"{language}_curated_train_bert.csv"),
        "test_preprocessed": os.path.join(data_bert_dir, f"{language}_curated_test_bert.csv"),
    }
    for language in ("english", "german", "multilingual")
}

In [None]:
# Raw and preprocessed train/test CSV paths for each "own" dataset,
# keyed by "<language>_own".
# NOTE(review): this rebinds bert_datasets, replacing whatever an earlier cell
# assigned to that name — the loop that iterates bert_datasets only sees the
# dict from the cell that ran last. Confirm that is intended.
bert_datasets = {
    f"{language}_own": {
        "train_raw": os.path.join(data_mail_dir, f"{language}_own_train.csv"),
        "test_raw": os.path.join(data_mail_dir, f"{language}_own_test.csv"),
        "train_preprocessed": os.path.join(data_bert_dir, f"{language}_own_train_bert.csv"),
        "test_preprocessed": os.path.join(data_bert_dir, f"{language}_own_test_bert.csv"),
    }
    for language in ("english", "german", "multilingual")
}

In [None]:
# Preprocess the train CSV, then the test CSV, of every dataset in bert_datasets.
for dataset_name, paths in bert_datasets.items():
    print(f"Preprocessing BERT data for {dataset_name} dataset:")

    # Each split has a "<split>_raw" input and a "<split>_preprocessed" output path.
    for split_name in ("train", "test"):
        prepare_and_save_bert_data(
            paths[f"{split_name}_raw"],
            paths[f"{split_name}_preprocessed"],
        )

    print(f"Completed preprocessing for {dataset_name} dataset.\n")
    print("\n -------------------------------------------------------------")

Preprocessing BERT data for english_curated dataset:
Char Count (Original): 35304463
Char Count (Processed): 29399761
Char Reduction (Overall): 16.73%
Char Reduction (Avg. per row): 13.02%
Processed saved to c:\Users\ilian\Documents\Projects\git_projects\university\phishing_detection\data\preprocessed\data_bert\english_curated_train_bert.csv
Char Count (Original): 14523458
Char Count (Processed): 12688454
Char Reduction (Overall): 12.63%
Char Reduction (Avg. per row): 12.85%
Processed saved to c:\Users\ilian\Documents\Projects\git_projects\university\phishing_detection\data\preprocessed\data_bert\english_curated_test_bert.csv
Completed preprocessing for english_curated dataset.

Preprocessing BERT data for german_curated dataset:
Char Count (Original): 36802274
Char Count (Processed): 31406480
Char Reduction (Overall): 14.66%
Char Reduction (Avg. per row): 10.86%
Processed saved to c:\Users\ilian\Documents\Projects\git_projects\university\phishing_detection\data\preprocessed\data_bert\