# Preperation

### Imports

In [1]:
import pandas as pd
import textstat
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt

### Paths

In [2]:
import sys, os
sys.path.append(os.path.abspath('../../src'))
from helper_functions.path_resolver import DynamicPathResolver

dpr = DynamicPathResolver(marker="README.md")
paths = dpr.structure

Project Root: c:\Users\ilian\Documents\Projects\git_projects\university\phishing_detection


In [3]:
train_raw = paths.data.raw.data_mail.train_raw_csv
test_raw = paths.data.raw.data_mail.test_raw_csv

train_preprocessed = paths.data.preprocessed.data_mail.train_processed_bert_csv
test_preprocessed = paths.data.preprocessed.data_mail.test_processed_bert_csv

models_folder = dpr.get_folder_path_from_namespace(paths.models.bert)
output_dir = dpr.get_folder_path_from_namespace(paths.models.bert.results)

os.makedirs(models_folder, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

### Functions

In [4]:
def read_dataset(file_path):
    return pd.read_csv(file_path)

def process_text_columns(df):
    df['subject'] = df['subject'].fillna('[NO_SUBJECT]').astype(str)
    df['body'] = df['body'].fillna('[NO_BODY]').astype(str)
    return df


def clean_text(text):
    text = re.sub(r'\s+', ' ', text).strip()
    substitutions = [
        (r'https?://\S+|www\.\S+', '[URL]'),
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '[EMAIL]'),
        (r'-{2,}', ' '),
        (r'!{2,}', '!'),
        (r'\?{2,}', '?'),
        (r'[_+*]{2,}', ' '),
        (r'[=+]{3,}', ' '),
        (r'[~]{3,}', ' '),
        (r'[#]{3,}', ' '),
        (r'[<]{3,}', ' '),
        (r'[>]{3,}', ' ')
    ]
        
    for pattern, repl in substitutions:
        text = re.sub(pattern, repl, text)
    
    return text.strip()


def combine_text_fields(df):
    df['subject'] = df['subject'].apply(clean_text)
    df['body'] = df['body'].apply(clean_text)
    df['text'] = df['subject'] + " [SEP] " + df['body']
    return df


def prepare_bert_data(df):
    df = process_text_columns(df)
    df = combine_text_fields(df)
    return df[['text', 'label']]

In [5]:
def calculate_reduction(original, processed):
    original_length = len(original)
    processed_length = len(processed)
    if original_length == 0:
        return 0
    reduction_percentage = (original_length - processed_length) / original_length * 100
    return reduction_percentage

def sample_preprocessed_data(input_file, n=1):
    pd.set_option('display.max_colwidth', None)
    
    df = read_dataset(input_file)
    df_sample = df.sample(n).copy()
    
    # Original
    df_sample['full_text_original'] = df_sample['subject'].astype(str) + " " + df_sample['body'].astype(str)
    df_sample['word_count_original'] = df_sample['full_text_original'].apply(lambda x: len(x.split()))
    df_sample['char_count_original'] = df_sample['full_text_original'].apply(lambda x: len(x))
    
    print("Original Data:")
    display(df_sample[['subject', 'body', 'label', 'word_count_original', 'char_count_original']])
    
    # Processed
    df_processed = prepare_bert_data(df_sample).copy()
    df_processed['word_count_processed'] = df_processed['text'].apply(lambda x: len(x.split()))
    df_processed['char_count_processed'] = df_processed['text'].apply(lambda x: len(x))
    
    # Reduction in percentage
    df_processed['char_reduction_%'] = df_sample.apply(
        lambda row: calculate_reduction(row['full_text_original'], prepare_bert_data(pd.DataFrame([row]))['text'].iloc[0]),
        axis=1
    )
    
    print("Processed Data:")
    display(df_processed[['text', 'label', 'word_count_processed', 'char_count_processed', 'char_reduction_%']])


In [6]:
def prepare_and_save_full_data(input_file, output_file):
    # Load data
    df = read_dataset(input_file)
    
    # Original
    df['full_text_original'] = df['subject'].astype(str) + " " + df['body'].astype(str)
    df['char_count_original'] = df['full_text_original'].apply(lambda x: len(x))
    
    # Process 
    df_processed = prepare_bert_data(df).copy()
    df_processed['char_count_processed'] = df_processed['text'].apply(lambda x: len(x))
    
    # Reduction
    total_orig_chars = df['char_count_original'].sum()
    total_proc_chars = df_processed['char_count_processed'].sum()
    overall_reduction = (total_orig_chars - total_proc_chars) / total_orig_chars * 100 if total_orig_chars > 0 else 0
    
    # Avg. reduction
    df['reduction_%'] = df.apply(
        lambda row: calculate_reduction(
            row['full_text_original'], 
            prepare_bert_data(pd.DataFrame([row]))['text'].iloc[0]
        ),
        axis=1
    )
    avg_reduction = df['reduction_%'].mean()
    
    print(f"Char Count (Original): {total_orig_chars}")
    print(f"Char Count (Processed): {total_proc_chars}")
    print(f"Char Reduction (Overall): {overall_reduction:.2f}%")
    print(f"Char Reduction (Avg. per row): {avg_reduction:.2f}%")
    
    # Save 
    df_processed.to_csv(output_file, index=False)
    print(f"Processed saved to {output_file}")


### Preprocess Test

In [7]:
sample_preprocessed_data(train_raw, n=1)

Original Data:


Unnamed: 0,subject,body,label,word_count_original,char_count_original
6750,Rev 332: merge from ronnie in http://samba.org/~tridge/ctdb,"------------------------------------------------------------\nrevno: 332\nrevision-id: tridge@samba.org-20070523045041-a6v1tls6f3m01bqx\nparent: tridge@samba.org-20070523043519-e3eiktftt5q3f62e\nparent: sahlberg@ronnie-20070520232434-xxhlg6zdpx5kkvjp\ncommitter: Andrew Tridgell \nbranch nick: tridge\ntimestamp: Wed 2007-05-23 14:50:41 +1000\nmessage:\n merge from ronnie\nmodified:\n common/ctdb.c ctdb.c-20061127094323-t50f58d65iaao5of-2\n common/ctdb_client.c ctdb_client.c-20070411010216-3kd8v37k61steeya-1\n common/ctdb_control.c ctdb_control.c-20070426122724-j6gkpiofhbwdin63-1\n common/ctdb_monitor.c ctdb_monitor.c-20070518100625-8jf4ft1mjzmb22ck-1\n include/ctdb.h ctdb.h-20061117234101-o3qt14umlg9en8z0-11\n include/ctdb_private.h ctdb_private.h-20061117234101-o3qt14umlg9en8z0-13\n tools/ctdb_control.c ctdb_control.c-20070426122705-9ehj1l5lu2gn9kuj-1\n ------------------------------------------------------------\n revno: 326.1.3\n merged: sahlberg@ronnie-20070520232434-xxhlg6zdpx5kkvjp\n parent: sahlberg@ronnie-20070519221839-a9mpdvipiatk5nag\n committer: Ronnie Sahlberg \n branch nick: ctdb\n timestamp: Mon 2007-05-21 09:24:34 +1000\n message:\n add controls to enable/disable the monitoring of dead nodes\n ------------------------------------------------------------\n revno: 326.1.2\n merged: sahlberg@ronnie-20070519221839-a9mpdvipiatk5nag\n parent: sahlberg@ronnie-20070519065910-liyyxru2wl1eah69\n parent: tridge@samba.org-20070519111106-hmbognp9baltnxgw\n committer: Ronnie Sahlberg \n branch nick: ctdb\n timestamp: Sun 2007-05-20 08:18:39 +1000\n message:\n merge from tridge\n\nDiff too large for email (238, the limit is 200).\n\n",0,108,1829


Processed Data:


Unnamed: 0,text,label,word_count_processed,char_count_processed,char_reduction_%
6750,"Rev 332: merge from ronnie in [URL] [SEP] revno: 332 revision-id: [EMAIL]-20070523045041-a6v1tls6f3m01bqx parent: [EMAIL]-20070523043519-e3eiktftt5q3f62e parent: sahlberg@ronnie-20070520232434-xxhlg6zdpx5kkvjp committer: Andrew Tridgell branch nick: tridge timestamp: Wed 2007-05-23 14:50:41 +1000 message: merge from ronnie modified: common/ctdb.c ctdb.c-20061127094323-t50f58d65iaao5of-2 common/ctdb_client.c ctdb_client.c-20070411010216-3kd8v37k61steeya-1 common/ctdb_control.c ctdb_control.c-20070426122724-j6gkpiofhbwdin63-1 common/ctdb_monitor.c ctdb_monitor.c-20070518100625-8jf4ft1mjzmb22ck-1 include/ctdb.h ctdb.h-20061117234101-o3qt14umlg9en8z0-11 include/ctdb_private.h ctdb_private.h-20061117234101-o3qt14umlg9en8z0-13 tools/ctdb_control.c ctdb_control.c-20070426122705-9ehj1l5lu2gn9kuj-1 revno: 326.1.3 merged: sahlberg@ronnie-20070520232434-xxhlg6zdpx5kkvjp parent: sahlberg@ronnie-20070519221839-a9mpdvipiatk5nag committer: Ronnie Sahlberg branch nick: ctdb timestamp: Mon 2007-05-21 09:24:34 +1000 message: add controls to enable/disable the monitoring of dead nodes revno: 326.1.2 merged: sahlberg@ronnie-20070519221839-a9mpdvipiatk5nag parent: sahlberg@ronnie-20070519065910-liyyxru2wl1eah69 parent: [EMAIL]-20070519111106-hmbognp9baltnxgw committer: Ronnie Sahlberg branch nick: ctdb timestamp: Sun 2007-05-20 08:18:39 +1000 message: merge from tridge Diff too large for email (238, the limit is 200).",0,106,1424,22.361946


In [8]:
sample_preprocessed_data(test_raw, n=1)

Original Data:


Unnamed: 0,subject,body,label,word_count_original,char_count_original
1172,%User_name% rezep-tfrei Pharma,"Hallo %User_name%,\n\n%User_name%, auch ein richtiger Kerl leidet, wenn man seine Manneskraft einfach so verliert wenn die \nManneskraft schwindet und Sie nichts dagegen unternehmen \nkonnten, wie es zu frueheren Zeiten ueblich war. Erfreulicherweise ist die Zeit vorbei, denn Probleme mit dem ""hart"" werden sind mit Arzneien praktisch und diskret zu beheben und sind somit von gestern!\n\nAuch vorzeitiger Samenerguss wird durch die blauen Tabletten vorgebeugt. Also endlich wieder ein Liebesspiel das Sie wieder faszinieren wird, ohne \npermanent daran zu denken, dass der Spass zu schnell vorbei sein kann.\n\nAbsolut unglaublich, dass Sie dafuer zum Arzt muessten und dann noch zur ueberteuerten Offline-\nApotheke obwohl es bei uns, trotz \ngratis Lieferung nach Deutschland, Schweiz, Austria diskret in den Briefkasten, auch guenstiger ist?\n\nHier geht es weiter!\n\nhttps://teknoware.ae/apotheke.html\n\nGesund bleiben\nCEO Gerhard Eiser",1,134,958


Processed Data:


Unnamed: 0,text,label,word_count_processed,char_count_processed,char_reduction_%
1172,"%User_name% rezep-tfrei Pharma [SEP] Hallo %User_name%, %User_name%, auch ein richtiger Kerl leidet, wenn man seine Manneskraft einfach so verliert wenn die Manneskraft schwindet und Sie nichts dagegen unternehmen konnten, wie es zu frueheren Zeiten ueblich war. Erfreulicherweise ist die Zeit vorbei, denn Probleme mit dem ""hart"" werden sind mit Arzneien praktisch und diskret zu beheben und sind somit von gestern! Auch vorzeitiger Samenerguss wird durch die blauen Tabletten vorgebeugt. Also endlich wieder ein Liebesspiel das Sie wieder faszinieren wird, ohne permanent daran zu denken, dass der Spass zu schnell vorbei sein kann. Absolut unglaublich, dass Sie dafuer zum Arzt muessten und dann noch zur ueberteuerten Offline- Apotheke obwohl es bei uns, trotz gratis Lieferung nach Deutschland, Schweiz, Austria diskret in den Briefkasten, auch guenstiger ist? Hier geht es weiter! [URL] Gesund bleiben CEO Gerhard Eiser",1,135,925,3.444676


### No subject / body

In [9]:
df = read_dataset(train_raw)
df = process_text_columns(df)

no_subject_count = (df['subject'] == '[NO_SUBJECT]').sum()
no_body_count = (df['body'] == '[NO_BODY]').sum()

print(f"Rows with [NO_SUBJECT]: {no_subject_count}")
print(f"Rows with [NO_BODY]: {no_body_count}")

Rows with [NO_SUBJECT]: 121
Rows with [NO_BODY]: 0


### Preprocess All

In [10]:
prepare_and_save_full_data(train_raw, train_preprocessed)

Char Count (Original): 36345544
Char Count (Processed): 30822505
Char Reduction (Overall): 15.20%
Char Reduction (Avg. per row): 10.82%
Processed saved to c:\Users\ilian\Documents\Projects\git_projects\university\phishing_detection\data\preprocessed\data_mail\train_processed_bert.csv


In [11]:
prepare_and_save_full_data(test_raw, test_preprocessed)

Char Count (Original): 8448350
Char Count (Processed): 6190671
Char Reduction (Overall): 26.72%
Char Reduction (Avg. per row): 20.11%
Processed saved to c:\Users\ilian\Documents\Projects\git_projects\university\phishing_detection\data\preprocessed\data_mail\test_processed_bert.csv
