### Imports

In [17]:
import pandas as pd
import os
import langdetect
import numpy as np
from deep_translator import GoogleTranslator

In [18]:
import sys
# Make the project's `src` directory importable. This has to run before the
# `helper_functions` import below, which is presumably resolved through this entry.
sys.path.append(os.path.abspath('../../src'))
from helper_functions.path_resolver import DynamicPathResolver

# NOTE(review): the marker file presumably identifies the project root — confirm in DynamicPathResolver.
dpr = DynamicPathResolver(marker="README.md")
# Later cells read their file paths from `paths` via attribute access.
paths = dpr.structure

Project Root: c:\Users\ilian\Documents\Projects\git_projects\university\phishing_detection


### Functions

In [19]:
def read_dataset(file_path):
    """Load the CSV at ``file_path`` into a pandas DataFrame.

    Args:
        file_path: Anything ``pd.read_csv`` accepts (path or file-like object).

    Returns:
        The parsed DataFrame, read with pandas' default options.
    """
    dataset = pd.read_csv(file_path)
    return dataset

def detect_language(text):
    """Detect the language of ``text`` using ``langdetect``.

    Args:
        text: The value to classify; non-string values are converted with ``str``.

    Returns:
        The language code returned by ``langdetect.detect``, or ``"unknown"``
        when the value is missing or detection fails.

    NOTE(review): langdetect's README describes detection as non-deterministic
    unless ``DetectorFactory.seed`` is set — consider seeding it once at the
    top of the notebook for reproducible results.
    """
    # Missing values (None / NaN) would otherwise be stringified to "None" or
    # "nan" and classified as if they were real mail text.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return "unknown"
    try:
        return langdetect.detect(str(text))
    except Exception:
        # Best effort: undetectable text is reported as "unknown". Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupts working
        # during long `.apply` runs.
        return "unknown"

def add_language_column(df):
    """Return a copy of ``df`` with a ``language`` column derived from ``body``.

    The input frame is left untouched, so re-running a cell or passing a
    filtered slice neither mutates the caller's data nor triggers
    ``SettingWithCopyWarning``. An existing ``language`` column is replaced.

    Args:
        df: DataFrame with a ``body`` column.

    Returns:
        A new DataFrame with ``language`` set to ``detect_language(body)`` per row.

    Raises:
        KeyError: If ``df`` has no ``body`` column.
    """
    return df.assign(language=df['body'].apply(detect_language))

def get_filtered_data(df, lang, amount):
    """Sample at most ``amount`` rows of ``df`` whose ``language`` equals ``lang``.

    The sample uses the fixed seed 42, so repeated calls on the same frame
    return the same rows. If fewer than ``amount`` rows match, all of them
    are returned.
    """
    matching_rows = df.loc[df['language'] == lang]
    sample_size = min(amount, len(matching_rows))
    return matching_rows.sample(n=sample_size, random_state=42)

def translate_to_german(text):
    """Translate English ``text`` to German via ``GoogleTranslator``.

    Translation is best effort: when the translator raises, the original
    text is returned unchanged and a ``RuntimeWarning`` is emitted so that
    untranslated rows do not go unnoticed.

    Args:
        text: The value to translate. Non-string or blank values are returned
            as-is without calling the translator.

    Returns:
        The translator's result, or ``text`` itself when there is nothing to
        translate or the translation failed.
    """
    import warnings

    # Missing or blank cells (e.g. NaN subjects) have nothing to translate.
    if not isinstance(text, str) or not text.strip():
        return text
    try:
        return GoogleTranslator(source='en', target='de').translate(text)
    except Exception as error:
        # Only the exception type is reported: it keeps the message short and
        # lets Python's default filter collapse repeated failures of one kind.
        warnings.warn(
            f"Translation failed ({type(error).__name__}); keeping original text.",
            RuntimeWarning,
            stacklevel=2,
        )
        return text


### Data to use

In [None]:
# Test-set sources: the two CSV files under `own`.
own_mail_dir = paths.data.raw.data_mail.own
test_files = [own_mail_dir.jannis_mail_csv, own_mail_dir.mails_labeled_csv]

# Training sources: the three CSV files under `curated`.
curated_mail_dir = paths.data.raw.data_mail.curated
train_files = [
    curated_mail_dir.CEAS_08_csv,
    curated_mail_dir.TREC_07_csv,
    curated_mail_dir.Nazario_5_csv,
]

In [None]:
# Locations of the assembled (raw) train/test CSV files.
raw_mail_dir = paths.data.raw.data_mail
train_raw = raw_mail_dir.train_raw_csv
test_raw = raw_mail_dir.test_raw_csv

# Locations of the preprocessed train/test CSV files.
preprocessed_mail_dir = paths.data.preprocessed.data_mail
train_processed = preprocessed_mail_dir.preprocessed_combined_train_csv
test_processed = preprocessed_mail_dir.preprocessed_combined_test_csv

### Assemble Test

In [None]:
# Load every test source, stack them into one frame and tag each mail with its language.
test_dfs = list(map(read_dataset, test_files))
test_combined = add_language_column(pd.concat(test_dfs, ignore_index=True))

# Keep only German and English mails, then store the result.
is_supported_language = test_combined['language'].isin(["de", "en"])
test_combined = test_combined[is_supported_language]
test_combined.to_csv(test_raw, index=False)

In [None]:
# Drop mails whose label is -1 (reported below as "unlabeled").
test_cleaned = test_combined[test_combined['label'] != -1]

print(f"Row count after removing unlabeled -1: {len(test_cleaned)}")
print("Distribution (Language and Class):")
# Report the cleaned frame so the groups match the row count printed above.
print(test_cleaned.groupby(['language', 'label']).size())

# Store under the test path: writing to `train_raw` here would overwrite the
# training set with test mails.
test_cleaned.to_csv(test_raw, index=False)

Number of rows after removing label -1: 2400


In [25]:
# Size of the smallest (language, label) group; every group is down-sampled to it.
grouped = test_cleaned.groupby(['language', 'label'])
group_sizes = grouped.size()
min_count = group_sizes.min()
print("\nMin group count:", min_count)

# Draw the same number of rows from each group with a fixed seed.
balanced_groups = []
for _, group in grouped:
    balanced_groups.append(group.sample(n=min_count, random_state=42))

test_data_balanced = pd.concat(balanced_groups, ignore_index=True)


Minimum group count across all groups: 12


In [26]:
# Rows per (language, label) group after balancing.
balanced_distribution = test_data_balanced.groupby(['language', 'label']).size()
print("\nBalanced Test Distribution (Language and Class):")
print(balanced_distribution)


Balanced Test Distribution (Language and Class):
language  label
de        0        12
          1        12
en        0        12
          1        12
dtype: int64


In [None]:
# Store the balanced test set at `test_raw`, replacing any file already there.
test_data_balanced.to_csv(path_or_buf=test_raw, index=False)

### Assemble Train

##### Combined train datasets 

In [None]:
# Read all training sources into one frame and tag each mail with its detected language.
train_dfs = list(map(read_dataset, train_files))
train_combined = add_language_column(pd.concat(train_dfs, ignore_index=True))

# Only mails detected as English are kept.
is_english = train_combined['language'] == "en"
train_combined = train_combined[is_english]

##### Sample English

In [None]:
# Draw 5000 mails per class (label 0 and label 1) with a fixed seed.
is_legit = train_combined['label'] == 0
is_phish = train_combined['label'] == 1
en_legit_sample = train_combined[is_legit].sample(n=5000, random_state=42)
en_phish_sample = train_combined[is_phish].sample(n=5000, random_state=42)
train_english = pd.concat([en_legit_sample, en_phish_sample], ignore_index=True)

##### Sample German (translated)

In [None]:
# `train_english` was concatenated with ignore_index=True, so its index is
# 0..n-1 and no longer identifies rows of `train_combined`. Dropping by it
# would remove unrelated rows or raise KeyError for absent labels. `sample`
# keeps the original index labels, so drop by those to exclude exactly the
# mails already selected for the English half.
sampled_index = en_legit_sample.index.union(en_phish_sample.index)
remaining = train_combined.drop(index=sampled_index)

In [None]:
# Draw another 5000 mails per class from the rows left in `remaining`.
remaining_legit = remaining[remaining['label'] == 0]
remaining_phish = remaining[remaining['label'] == 1]
rem_legit_sample = remaining_legit.sample(n=5000, random_state=42)
rem_phish_sample = remaining_phish.sample(n=5000, random_state=42)
train_to_translate = pd.concat([rem_legit_sample, rem_phish_sample], ignore_index=True)

In [None]:
# Translate both text columns, then mark the rows as German.
for text_column in ('body', 'subject'):
    train_to_translate[text_column] = train_to_translate[text_column].apply(translate_to_german)
train_to_translate['language'] = "de"

##### Combine

In [None]:
# Stack the English half and the translated half, then store the result at `train_raw`.
train_parts = [train_english, train_to_translate]
train_final = pd.concat(train_parts, ignore_index=True)
train_final.to_csv(train_raw, index=False)

### Check class balance (train and test)

In [27]:
# Reload both splits from their stored CSV files (train first, then test).
train, test = (read_dataset(csv_path) for csv_path in (train_raw, test_raw))

In [30]:
train_labels = train['label']
print(f"Train data rows: {len(train)}")

print("\nTrain: Class Distribution:")
print(train_labels.value_counts())

# Same label counts, restricted to one language at a time.
for lang in ('de', 'en'):
    print(f"\n Class distribution (per language) '{lang}':")
    print(train_labels[train['language'] == lang].value_counts())

Train data rows: 20000

Train: Class Distribution:
label
1    10954
0     9046
Name: count, dtype: int64

 Class distribution (per language) 'de':
label
1    5489
0    4511
Name: count, dtype: int64

 Class distribution (per language) 'en':
label
1    5465
0    4535
Name: count, dtype: int64


In [31]:
test_labels = test['label']
print(f"Test data rows: {len(test)}")

print("\nTest: Class Distribution:")
print(test_labels.value_counts())

# Same label counts, restricted to one language at a time.
for lang in ('de', 'en'):
    print(f"\n Class distribution (per language) '{lang}':")
    print(test_labels[test['language'] == lang].value_counts())

Test data rows: 2400

Test: Class Distribution:
label
1    1200
0    1200
Name: count, dtype: int64

 Class distribution (per language) 'de':
label
0    1188
1     667
Name: count, dtype: int64

 Class distribution (per language) 'en':
label
1    533
0     12
Name: count, dtype: int64
