### Imports

In [2]:
import sys, os

# pandas is used directly in this notebook (pd.read_csv); import it explicitly
# instead of relying on the wildcard import below to leak `pd` into scope.
import pandas as pd

sys.path.append(os.path.abspath('../../src'))

from helper_functions.path_resolver import DynamicPathResolver
from helper_functions.assemble_collect import *

### Paths

In [3]:
# Build the resolver (marker file: README.md); every data path below hangs off it.
dpr = DynamicPathResolver(marker="README.md")

# Shared parent node of all mail data, resolved once and reused below.
mail_data = dpr.path.data.raw.data_mail

# Directory that all generated sets in this notebook are written to.
sets_dir = mail_data.sets._path

# Curated corpora that feed the curated *train* base.
train_paths_curated = [mail_data.curated.CEAS_08_csv, mail_data.curated.TREC_07_csv]

# Curated corpora that feed the curated *test* base.
test_paths_curated = [mail_data.curated.Nazario_5_csv, mail_data.curated.SpamAssasin_csv]

# Self-collected mails. A second source
# (dpr.path.data.raw.data_mail.own.mails_jannis.jannis_mail_csv) is currently disabled.
paths_own = [mail_data.own.mails_combined_csv]

Project Root: c:\Users\ilian\Documents\Projects\git_projects\university\phishing_bert


### Detect And Add Language

##### Base paths

In [None]:
# Destination CSVs for the language-annotated curated bases, both inside sets_dir.
curated_train_base, curated_test_base = (
    os.path.join(sets_dir, csv_name)
    for csv_name in ("curated_train_base.csv", "curated_test_base.csv")
)

##### Add language col to bases

In [None]:
# Create one language-annotated base file per curated split (train first, then test).
for source_paths, base_csv in (
    (train_paths_curated, curated_train_base),
    (test_paths_curated, curated_test_base),
):
    add_lang_and_create_base(source_paths, base_csv)

##### Verify 

In [None]:
# Load both curated bases, then report the row count and distributions of each.
df_curated_train_base = pd.read_csv(curated_train_base)
df_curated_test_base = pd.read_csv(curated_test_base)

verify(df_curated_train_base, "Curated Train Base")
verify(df_curated_test_base, "Curated Test Base")

### Create Curated Train & Test Sets

##### 1. BERT – English Only

In [None]:
# English training set: 20,000 rows drawn from the curated train base.
english_train_csv = os.path.join(sets_dir, "english_curated_train.csv")
build_balanced_set(base_file=curated_train_base, out_file=english_train_csv, total_size=20_000)

In [None]:
# English test set: 4,000 rows drawn from the curated test base.
english_test_csv = os.path.join(sets_dir, "english_curated_test.csv")
build_balanced_set(base_file=curated_test_base, out_file=english_test_csv, total_size=4_000)

##### 2. BERT – German Only

In [None]:
# Produce the German verification set from the English verification set.
#
# The input file is NOT created by any cell above this one: it is written by
# sample_non_overlapping() in the "Verification Set" section further down.
# On a fresh checkout, "Run All" therefore reaches this cell before the file
# exists, so fail early with an actionable message instead of an opaque
# error from inside the helper.
english_verification_csv = os.path.join(sets_dir, "english_curated_verification.csv")
if not os.path.isfile(english_verification_csv):
    raise FileNotFoundError(
        f"{english_verification_csv} does not exist yet - run the "
        "'Verification Set' section (sample_non_overlapping) before this cell."
    )

translate_entire_dataset(
    eng_file=english_verification_csv,
    out_file=os.path.join(sets_dir, "german_curated_verification.csv"),
)

In [None]:
# German training set: written from the English training set by translate_entire_dataset.
english_train_csv = os.path.join(sets_dir, "english_curated_train.csv")
german_train_csv = os.path.join(sets_dir, "german_curated_train.csv")
translate_entire_dataset(eng_file=english_train_csv, out_file=german_train_csv)

In [None]:
# German test set: written from the English test set by translate_entire_dataset.
english_test_csv = os.path.join(sets_dir, "english_curated_test.csv")
german_test_csv = os.path.join(sets_dir, "german_curated_test.csv")
translate_entire_dataset(eng_file=english_test_csv, out_file=german_test_csv)

##### 3. BERT – Multilingual

In [None]:
# Multilingual verification set: 2,000 English + 2,000 German rows.
# Both inputs are verification files; the English one is created in the
# "Verification Set" section, so that section has to run before this cell.
verification_csv = {
    lang: os.path.join(sets_dir, f"{lang}_curated_verification.csv")
    for lang in ("english", "german", "multilingual")
}
sample_from_existing_datasets(
    eng_file=verification_csv["english"],
    de_file=verification_csv["german"],
    out_file=verification_csv["multilingual"],
    english_size=2000,
    german_size=2000,
)

In [None]:
# Multilingual training set: 10,000 English + 10,000 German rows.
train_csv = {
    lang: os.path.join(sets_dir, f"{lang}_curated_train.csv")
    for lang in ("english", "german", "multilingual")
}
sample_from_existing_datasets(
    eng_file=train_csv["english"],
    de_file=train_csv["german"],
    out_file=train_csv["multilingual"],
    english_size=10000,
    german_size=10000,
)

In [None]:
# Multilingual test set: 2,000 English + 2,000 German rows.
test_csv = {
    lang: os.path.join(sets_dir, f"{lang}_curated_test.csv")
    for lang in ("english", "german", "multilingual")
}
sample_from_existing_datasets(
    eng_file=test_csv["english"],
    de_file=test_csv["german"],
    out_file=test_csv["multilingual"],
    english_size=2000,
    german_size=2000,
)

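### Verification Set

> **Run order:** the verification cells under "2. BERT – German Only" and "3. BERT – Multilingual" read `english_curated_verification.csv`, which is created in this section. Run this section before those cells when starting from a fresh checkout.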

##### Sample non-overlapping

In [None]:
# Rows already used for training (english_curated_train.csv, written by the
# "1. BERT – English Only" section).
used_train_file = os.path.join(sets_dir, "english_curated_train.csv")
# Destination of the English verification set. Earlier cells (German translation,
# multilingual sampling) read this file, so it must exist before they run.
output_file = os.path.join(sets_dir, "english_curated_verification.csv")

# Draws the verification rows from the curated train sources.
# NOTE(review): presumably excludes every row present in used_train_file —
# confirm against the helper's implementation.
sample_non_overlapping(train_paths_curated, used_train_file, output_file)

##### Check duplicates

In [None]:
# The two generated sets to compare: English test vs. English verification.
generated_sets = dpr.path.data.raw.data_mail.sets
csv_file1 = generated_sets.english_curated_test_csv
csv_file2 = generated_sets.english_curated_verification_csv

In [None]:
# Compare the English test set with the English verification set.
# NOTE(review): presumably returns the number of rows present in both files —
# confirm in the helper. The result is only stored here; nothing in the
# visible notebook asserts that it is zero.
num_duplicates = check_duplicates(csv_file1, csv_file2)

### Create Own Train & Test Sets

In [None]:
# Output CSVs for the self-collected mails: the full base plus its train/test split.
own_base, own_train_base, own_test_base = (
    os.path.join(sets_dir, csv_name)
    for csv_name in ("own_base.csv", "own_train_base.csv", "own_test_base.csv")
)

In [None]:
# Build the language-annotated base file for the self-collected mails
# (same helper the curated bases go through).
add_lang_and_create_base(paths_own, own_base)

In [None]:
# Derive the own train and test base files from own_base.
# NOTE(review): balancing criteria and split ratio are decided inside the
# helper and are not visible here — confirm before relying on them.
balance_and_split_dataset(own_base, own_train_base, own_test_base)

In [None]:
# Load the freshly split own train/test bases, then report each one's distributions.
df_own_train_base = pd.read_csv(own_train_base)
df_own_test_base = pd.read_csv(own_test_base)

verify(df_own_train_base, "Own Train Base")
verify(df_own_test_base, "Own Test Base")

### Verify Balances

##### Verify Verification

In [4]:
# Load every verification split and print its row count plus class/language
# distribution. Each frame gets its own name (matching the df_eng_* / df_germ_* /
# df_mult_* naming used by the other verify cells) so the English and German
# data are no longer stored under a variable that claims to be multilingual.
df_eng_curated_verification = pd.read_csv(os.path.join(sets_dir, "english_curated_verification.csv"))
verify(df_eng_curated_verification, "English Curated Verification")

df_germ_curated_verification = pd.read_csv(os.path.join(sets_dir, "german_curated_verification.csv"))
verify(df_germ_curated_verification, "German Curated Verification")

df_mult_curated_verification = pd.read_csv(os.path.join(sets_dir, "multilingual_curated_verification.csv"))
verify(df_mult_curated_verification, "Multilingual Curated Verification")

# Backward-compatible alias: the previous (misspelled) name ended up bound to the
# multilingual frame, so keep it pointing there for any cell that still uses it.
df_multilangual_curated_verification = df_mult_curated_verification


English Curated Verification, Rows: 4000
----------------------------------------
Class Distribution: {0: 2000, 1: 2000}
Language Distribution: {'en': 4000}
Detailed (Class, Language) Distribution: {(0, 'en'): 2000, (1, 'en'): 2000}
----------------------------------------

German Curated Verification, Rows: 4000
----------------------------------------
Class Distribution: {0: 2000, 1: 2000}
Language Distribution: {'de': 4000}
Detailed (Class, Language) Distribution: {(0, 'de'): 2000, (1, 'de'): 2000}
----------------------------------------

Multilingual Curated Verification, Rows: 4000
----------------------------------------
Class Distribution: {1: 2000, 0: 2000}
Language Distribution: {'en': 2000, 'de': 2000}
Detailed (Class, Language) Distribution: {(0, 'de'): 1000, (0, 'en'): 1000, (1, 'de'): 1000, (1, 'en'): 1000}
----------------------------------------


##### Verify English

In [5]:
# English curated splits: resolve both paths, then load and report train before test.
english_train_csv = os.path.join(sets_dir, "english_curated_train.csv")
english_test_csv = os.path.join(sets_dir, "english_curated_test.csv")

df_eng_curated_train = pd.read_csv(english_train_csv)
verify(df_eng_curated_train, "English Curated Train")

df_eng_curated_test = pd.read_csv(english_test_csv)
verify(df_eng_curated_test, "English Curated Test")


English Curated Train, Rows: 20000
----------------------------------------
Class Distribution: {0: 10000, 1: 10000}
Language Distribution: {'en': 20000}
Detailed (Class, Language) Distribution: {(0, 'en'): 10000, (1, 'en'): 10000}
----------------------------------------

English Curated Test, Rows: 4000
----------------------------------------
Class Distribution: {0: 2000, 1: 2000}
Language Distribution: {'en': 4000}
Detailed (Class, Language) Distribution: {(0, 'en'): 2000, (1, 'en'): 2000}
----------------------------------------


##### Verify German

In [6]:
# German curated splits: resolve both paths, then load and report train before test.
german_train_csv = os.path.join(sets_dir, "german_curated_train.csv")
german_test_csv = os.path.join(sets_dir, "german_curated_test.csv")

df_germ_curated_train = pd.read_csv(german_train_csv)
verify(df_germ_curated_train, "German Curated Train")

df_germ_curated_test = pd.read_csv(german_test_csv)
verify(df_germ_curated_test, "German Curated Test")


German Curated Train, Rows: 20000
----------------------------------------
Class Distribution: {0: 10000, 1: 10000}
Language Distribution: {'de': 20000}
Detailed (Class, Language) Distribution: {(0, 'de'): 10000, (1, 'de'): 10000}
----------------------------------------

German Curated Test, Rows: 4000
----------------------------------------
Class Distribution: {0: 2000, 1: 2000}
Language Distribution: {'de': 4000}
Detailed (Class, Language) Distribution: {(0, 'de'): 2000, (1, 'de'): 2000}
----------------------------------------


##### Verify Multilingual

In [7]:
# Multilingual curated splits: resolve both paths, then load and report train before test.
multilingual_train_csv = os.path.join(sets_dir, "multilingual_curated_train.csv")
multilingual_test_csv = os.path.join(sets_dir, "multilingual_curated_test.csv")

df_mult_curated_train = pd.read_csv(multilingual_train_csv)
verify(df_mult_curated_train, "Multilingual Curated Train")

df_mult_curated_test = pd.read_csv(multilingual_test_csv)
verify(df_mult_curated_test, "Multilingual Curated Test")


Multilingual Curated Train, Rows: 20000
----------------------------------------
Class Distribution: {1: 10000, 0: 10000}
Language Distribution: {'en': 10000, 'de': 10000}
Detailed (Class, Language) Distribution: {(0, 'de'): 5000, (0, 'en'): 5000, (1, 'de'): 5000, (1, 'en'): 5000}
----------------------------------------

Multilingual Curated Test, Rows: 4000
----------------------------------------
Class Distribution: {1: 2000, 0: 2000}
Language Distribution: {'en': 2000, 'de': 2000}
Detailed (Class, Language) Distribution: {(0, 'de'): 1000, (0, 'en'): 1000, (1, 'de'): 1000, (1, 'en'): 1000}
----------------------------------------


##### Verify Own

In [8]:
# Self-collected mails: resolve the three CSV paths up front, then load and
# report base, train and test in that order.
own_base_csv = os.path.join(sets_dir, "own_base.csv")
own_train_csv = os.path.join(sets_dir, "own_train_base.csv")
own_test_csv = os.path.join(sets_dir, "own_test_base.csv")

# Full own base (all classes and languages).
df_own_base = pd.read_csv(own_base_csv)
verify(df_own_base, "Own Base")

# NOTE(review): the two labels below say "German", while the Own Base report
# lists many languages — confirm the split files really are German-only.
df_own_train_base = pd.read_csv(own_train_csv)
verify(df_own_train_base, "German Own Train")

df_own_test_base = pd.read_csv(own_test_csv)
verify(df_own_test_base, "German Own Test")


Own Base, Rows: 8162
----------------------------------------
Class Distribution: {1: 4543, 0: 3609, -1: 10}
Language Distribution: {'de': 7047, 'en': 470, 'ru': 335, 'af': 42, 'sv': 26, 'fr': 24, 'nl': 23, 'no': 21, 'unknown': 21, 'et': 18, 'bg': 16, 'tl': 15, 'it': 15, 'mk': 14, 'da': 13, 'id': 13, 'pl': 10, 'tr': 7, 'cy': 6, 'uk': 4, 'ca': 4, 'es': 4, 'so': 3, 'hu': 2, 'ro': 2, 'fi': 2, 'pt': 1, 'lv': 1, 'hr': 1, 'ja': 1, 'sk': 1}
Detailed (Class, Language) Distribution: {(-1, 'de'): 8, (-1, 'en'): 1, (-1, 'no'): 1, (0, 'af'): 23, (0, 'ca'): 1, (0, 'cy'): 1, (0, 'da'): 10, (0, 'de'): 3169, (0, 'en'): 323, (0, 'et'): 17, (0, 'fi'): 1, (0, 'fr'): 14, (0, 'hu'): 2, (0, 'id'): 5, (0, 'it'): 2, (0, 'nl'): 5, (0, 'no'): 8, (0, 'pl'): 1, (0, 'ro'): 1, (0, 'sk'): 1, (0, 'sv'): 11, (0, 'tl'): 12, (0, 'tr'): 1, (0, 'unknown'): 1, (1, 'af'): 19, (1, 'bg'): 16, (1, 'ca'): 3, (1, 'cy'): 5, (1, 'da'): 3, (1, 'de'): 3870, (1, 'en'): 146, (1, 'es'): 4, (1, 'et'): 1, (1, 'fi'): 1, (1, 'fr'): 10, (1