### Imports

In [2]:
import sys, os
sys.path.append(os.path.abspath('../../src'))

from helper_functions.path_resolver import DynamicPathResolver
from helper_functions.preparation import *

### Paths

In [3]:
dpr = DynamicPathResolver(marker="README.md")

train_raw = dpr.path.data.raw.data_mail.train_raw_balanced_csv
test_raw  = dpr.path.data.raw.data_mail.test_raw_de_csv

train_preprocessed = dpr.path.data.preprocessed.data_mail.train_processed_balanced_csv
test_preprocessed  = dpr.path.data.preprocessed.data_mail.test_processed_de_csv

models_folder = dpr.path.models.bert
output_dir    = dpr.path.models.bert.results

Project Root: c:\Users\ilian\Documents\Projects\git_projects\university\phishing_detection


### Functions

In [4]:
def calculate_reduction(original, processed):
    original_length = len(original)
    processed_length = len(processed)
    if original_length == 0:
        return 0
    reduction_percentage = (original_length - processed_length) / original_length * 100
    return reduction_percentage

In [5]:
def sample_preprocessed_data(input_file, n=1):
    pd.set_option('display.max_colwidth', None)
    
    df = read_dataset(input_file)
    df_sample = df.sample(n).copy()
    
    # Original
    df_sample['full_text_original'] = df_sample['subject'].astype(str) + " " + df_sample['body'].astype(str)
    df_sample['word_count_original'] = df_sample['full_text_original'].apply(lambda x: len(x.split()))
    df_sample['char_count_original'] = df_sample['full_text_original'].apply(lambda x: len(x))
    
    print("Original Data:")
    display(df_sample[['subject', 'body', 'label', 'word_count_original', 'char_count_original']])
    
    # Processed
    df_processed = prepare_bert_data(df_sample).copy()
    df_processed['word_count_processed'] = df_processed['text'].apply(lambda x: len(x.split()))
    df_processed['char_count_processed'] = df_processed['text'].apply(lambda x: len(x))
    
    # Reduction in percentage
    df_processed['char_reduction_%'] = df_sample.apply(
        lambda row: calculate_reduction(
            row['full_text_original'], 
            prepare_bert_data(pd.DataFrame([row]))['text'].iloc[0]),
        axis=1
    )
    
    print("Processed Data:")
    display(df_processed[['text', 'label', 'word_count_processed', 'char_count_processed', 'char_reduction_%']])


In [6]:
def prepare_and_save_full_data(input_file, output_file):
    # Load data
    df = read_dataset(input_file)
    
    # Original
    df['full_text_original'] = df['subject'].astype(str) + " " + df['body'].astype(str)
    df['char_count_original'] = df['full_text_original'].apply(lambda x: len(x))
    
    # Process 
    df_processed = prepare_bert_data(df).copy()
    df_processed['char_count_processed'] = df_processed['text'].apply(lambda x: len(x))
    
    # Reduction
    total_orig_chars = df['char_count_original'].sum()
    total_proc_chars = df_processed['char_count_processed'].sum()
    overall_reduction = (total_orig_chars - total_proc_chars) / total_orig_chars * 100 if total_orig_chars > 0 else 0
    
    # Avg. reduction
    df['reduction_%'] = df.apply(
        lambda row: calculate_reduction(
            row['full_text_original'], 
            prepare_bert_data(pd.DataFrame([row]))['text'].iloc[0]
        ),
        axis=1
    )
    avg_reduction = df['reduction_%'].mean()
    
    print(f"Char Count (Original): {total_orig_chars}")
    print(f"Char Count (Processed): {total_proc_chars}")
    print(f"Char Reduction (Overall): {overall_reduction:.2f}%")
    print(f"Char Reduction (Avg. per row): {avg_reduction:.2f}%")
    
    # Save 
    df_processed.to_csv(output_file, index=False)
    print(f"Processed saved to {output_file}")


### Preprocess Test

In [7]:
sample_preprocessed_data(train_raw, n=1)

Original Data:


Unnamed: 0,subject,body,label,word_count_original,char_count_original
4687,Re: [Python-3000] Displaying strings containing unicode escapes at\tthe interactive prompt,"2008/4/17, Guido van Rossum :\n> I changed my mind already. :-) See my post of this morning in another thread.\n\nAh, I missed the mail! Thank you.\n_______________________________________________\nPython-3000 mailing list\nsmsbmo-9552@python.org\nhttp://mail.python.org/mailman/listinfo/python-3000\nUnsubscribe: http://mail.python.org/mailman/options/python-3000/python-3000%40tangomu.com\n\n\n",0,47,476


Processed Data:


Unnamed: 0,text,label,word_count_processed,char_count_processed,char_reduction_%
4687,"Re: [Python-3000] Displaying strings containing unicode escapes at the interactive prompt [SEP] 2008/4/17, Guido van Rossum : > I changed my mind already. :-) See my post of this morning in another thread. Ah, I missed the mail! Thank you. Python-3000 mailing list [EMAIL] [URL] Unsubscribe: [URL]",0,47,299,37.605042


In [8]:
sample_preprocessed_data(test_raw, n=1)

Original Data:


Unnamed: 0,subject,body,label,word_count_original,char_count_original
3919,Ein Ingenieur verändert Ihre Stromrechnung,"Vor kurzem sprach ich mit einem alten Freund in einem ruhigen Caf�. \n\nWir unterhielten uns �ber verschiedene Themen.\n\n Er erz�hlte mir, dass er k�rzlich eine faszinierende \n\nEntdeckung gemacht hatte. epkILt4Twe5iSSvqoRcFBBSSOubya47pX Er hatte ein besonderes Buch gefunden, das seine Perspektive ver�ndert hat. \npiy7zK\n\nW�hrend wir unseren Kaffee genossen, erz�hlte er weiter �ber seine Erfahrungen . \nBQ1RLJKQ3EWG54XGLVJGKC Er sagte, dass dieses Buch ihm die Augen ge�ffnet hat , \n\nneue M�glichkeiten zu entdecken. \nEs war unglaublich inspirierend , ihm zuzuh�ren.\n\nIch stellte fest, dass solche Entdeckungen wertvoll sind und dass wir aufgeschlossen f�r neue Erfahrungen sein sollten. \n8751 Unser Gespr�ch dauerte lange Zeit und wir tauschten Ideen aus. \n\n3\n\nAm Ende des Tages gingen wir auseinander und ich f�hlte mich inspiriert von der \n\n Begegnung. \nOJ50T Solche Momente sind wertvoll. W65DUVC8OQM Es ist sch�n, sich Zeit zu nehmen, um \n\ngemeinsame Erinnerungen aufzufrischen .\n\nReflectierend , bin ich dankbar. \n061317190282018 Es ist wichtig, Zeit f�r pers�nliche Begegnungen zu haben, \n\num den eigenen Horizont zu erweitern.",1,165,1186


Processed Data:


Unnamed: 0,text,label,word_count_processed,char_count_processed,char_reduction_%
3919,"Ein Ingenieur verändert Ihre Stromrechnung [SEP] Vor kurzem sprach ich mit einem alten Freund in einem ruhigen Caf�. Wir unterhielten uns �ber verschiedene Themen. Er erz�hlte mir, dass er k�rzlich eine faszinierende Entdeckung gemacht hatte. epkILt4Twe5iSSvqoRcFBBSSOubya47pX Er hatte ein besonderes Buch gefunden, das seine Perspektive ver�ndert hat. piy7zK W�hrend wir unseren Kaffee genossen, erz�hlte er weiter �ber seine Erfahrungen . BQ1RLJKQ3EWG54XGLVJGKC Er sagte, dass dieses Buch ihm die Augen ge�ffnet hat , neue M�glichkeiten zu entdecken. Es war unglaublich inspirierend , ihm zuzuh�ren. Ich stellte fest, dass solche Entdeckungen wertvoll sind und dass wir aufgeschlossen f�r neue Erfahrungen sein sollten. 8751 Unser Gespr�ch dauerte lange Zeit und wir tauschten Ideen aus. 3 Am Ende des Tages gingen wir auseinander und ich f�hlte mich inspiriert von der Begegnung. OJ50T Solche Momente sind wertvoll. W65DUVC8OQM Es ist sch�n, sich Zeit zu nehmen, um gemeinsame Erinnerungen aufzufrischen . Reflectierend , bin ich dankbar. 061317190282018 Es ist wichtig, Zeit f�r pers�nliche Begegnungen zu haben, um den eigenen Horizont zu erweitern.",1,166,1154,2.698145


### No subject / body

In [39]:
df = read_dataset(train_raw)
df = process_text_columns(df)

no_subject_count = (df['subject'] == '[NO_SUBJECT]').sum()
no_body_count = (df['body'] == '[NO_BODY]').sum()

print(f"Rows with [NO_SUBJECT]: {no_subject_count}")
print(f"Rows with [NO_BODY]: {no_body_count}")

Rows with [NO_SUBJECT]: 101
Rows with [NO_BODY]: 0


### Preprocess All

In [15]:
prepare_and_save_full_data(train_raw, train_preprocessed)

Char Count (Original): 36866391
Char Count (Processed): 31624838
Char Reduction (Overall): 14.22%
Char Reduction (Avg. per row): 11.43%
Processed saved to c:\Users\ilian\Documents\Projects\git_projects\university\phishing_detection\data\preprocessed\data_mail\train_processed_balanced.csv


In [40]:
prepare_and_save_full_data(test_raw, test_preprocessed)

Char Count (Original): 15347973
Char Count (Processed): 12392612
Char Reduction (Overall): 19.26%
Char Reduction (Avg. per row): 15.94%
Processed saved to c:\Users\ilian\Documents\Projects\git_projects\university\phishing_detection\data\preprocessed\data_mail\test_processed_de.csv
