# Preperation

### Imports

In [31]:
import pandas as pd
import textstat
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt

### Paths

In [32]:
import sys, os
sys.path.append(os.path.abspath('../../src'))
from helper_functions.path_resolver import DynamicPathResolver

dpr = DynamicPathResolver(marker="README.md")
paths = dpr.structure

Project Root: c:\Users\ilian\Documents\Projects\git_projects\university\phishing_detection


In [33]:
train_raw = paths.data.raw.data_mail.train_raw_balanced_csv
test_raw = paths.data.raw.data_mail.test_raw_de_csv

train_preprocessed = paths.data.preprocessed.data_mail.train_processed_balanced_csv
test_preprocessed = paths.data.preprocessed.data_mail.test_processed_de_csv

models_folder = dpr.get_folder_path_from_namespace(paths.models.bert)
output_dir = dpr.get_folder_path_from_namespace(paths.models.bert.results)

os.makedirs(models_folder, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

### Functions

In [34]:
def read_dataset(file_path):
    return pd.read_csv(file_path)

def process_text_columns(df):
    df['subject'] = df['subject'].fillna('[NO_SUBJECT]').astype(str)
    df['body'] = df['body'].fillna('[NO_BODY]').astype(str)
    return df


def clean_text(text):
    text = re.sub(r'\s+', ' ', text).strip()
    substitutions = [
        (r'https?://\S+|www\.\S+', '[URL]'),
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '[EMAIL]'),
        (r'-{2,}', ' '),
        (r'!{2,}', '!'),
        (r'\?{2,}', '?'),
        (r'[_+*]{2,}', ' '),
        (r'[=+]{3,}', ' '),
        (r'[~]{3,}', ' '),
        (r'[#]{3,}', ' '),
        (r'[<]{3,}', ' '),
        (r'[>]{3,}', ' ')
    ]
        
    for pattern, repl in substitutions:
        text = re.sub(pattern, repl, text)
    
    return text.strip()


def combine_text_fields(df):
    df['subject'] = df['subject'].apply(clean_text)
    df['body'] = df['body'].apply(clean_text)
    df['text'] = df['subject'] + " [SEP] " + df['body']
    return df


def prepare_bert_data(df):
    df = process_text_columns(df)
    df = combine_text_fields(df)
    return df[['text', 'label']]

In [35]:
def calculate_reduction(original, processed):
    original_length = len(original)
    processed_length = len(processed)
    if original_length == 0:
        return 0
    reduction_percentage = (original_length - processed_length) / original_length * 100
    return reduction_percentage

def sample_preprocessed_data(input_file, n=1):
    pd.set_option('display.max_colwidth', None)
    
    df = read_dataset(input_file)
    df_sample = df.sample(n).copy()
    
    # Original
    df_sample['full_text_original'] = df_sample['subject'].astype(str) + " " + df_sample['body'].astype(str)
    df_sample['word_count_original'] = df_sample['full_text_original'].apply(lambda x: len(x.split()))
    df_sample['char_count_original'] = df_sample['full_text_original'].apply(lambda x: len(x))
    
    print("Original Data:")
    display(df_sample[['subject', 'body', 'label', 'word_count_original', 'char_count_original']])
    
    # Processed
    df_processed = prepare_bert_data(df_sample).copy()
    df_processed['word_count_processed'] = df_processed['text'].apply(lambda x: len(x.split()))
    df_processed['char_count_processed'] = df_processed['text'].apply(lambda x: len(x))
    
    # Reduction in percentage
    df_processed['char_reduction_%'] = df_sample.apply(
        lambda row: calculate_reduction(row['full_text_original'], prepare_bert_data(pd.DataFrame([row]))['text'].iloc[0]),
        axis=1
    )
    
    print("Processed Data:")
    display(df_processed[['text', 'label', 'word_count_processed', 'char_count_processed', 'char_reduction_%']])


In [36]:
def prepare_and_save_full_data(input_file, output_file):
    # Load data
    df = read_dataset(input_file)
    
    # Original
    df['full_text_original'] = df['subject'].astype(str) + " " + df['body'].astype(str)
    df['char_count_original'] = df['full_text_original'].apply(lambda x: len(x))
    
    # Process 
    df_processed = prepare_bert_data(df).copy()
    df_processed['char_count_processed'] = df_processed['text'].apply(lambda x: len(x))
    
    # Reduction
    total_orig_chars = df['char_count_original'].sum()
    total_proc_chars = df_processed['char_count_processed'].sum()
    overall_reduction = (total_orig_chars - total_proc_chars) / total_orig_chars * 100 if total_orig_chars > 0 else 0
    
    # Avg. reduction
    df['reduction_%'] = df.apply(
        lambda row: calculate_reduction(
            row['full_text_original'], 
            prepare_bert_data(pd.DataFrame([row]))['text'].iloc[0]
        ),
        axis=1
    )
    avg_reduction = df['reduction_%'].mean()
    
    print(f"Char Count (Original): {total_orig_chars}")
    print(f"Char Count (Processed): {total_proc_chars}")
    print(f"Char Reduction (Overall): {overall_reduction:.2f}%")
    print(f"Char Reduction (Avg. per row): {avg_reduction:.2f}%")
    
    # Save 
    df_processed.to_csv(output_file, index=False)
    print(f"Processed saved to {output_file}")


### Preprocess Test

In [37]:
sample_preprocessed_data(train_raw, n=1)

Original Data:


Unnamed: 0,subject,body,label,word_count_original,char_count_original
2062,[Ip-health] Economist Article,"Drugs companies' patents are under attack. Will this really help the\npoor?\nJun 7th 2007 | NEW YORK\n>From The Economist print edition\n\nNOBODY could fault Thailand for want of ambition. At the recent Bio\nconference, the largest annual gathering of the biotechnology industry,\nit pitched itself hard as an emerging pharmaceutical power, with a\ndazzling pavilion, visiting luminaries and free drinks for all.\nInstead, the arguments with Thailand are over means, not ends.\n\nAt the end of last year the Thai government stunned the drugs industry\nwhen it said it would overrule international patents for Efavirenz, an\nanti-retroviral drug made by Merck, an American firm, and switch to a\nThai-made generic copy at half the price. The country had signed the\nAgreement on Trade-Related Aspects of Intellectual Property Rights\n(TRIPS), which protects drugs patents. But that deal allows “compulsory\nlicensing” only under special conditions—conditions that some\ncomplained Thailand did not fulfil.\n\nIn the months since then, Thailand has said it would overrule the\npatents on two more drugs, and it may soon add a further pair to the\nlist. Moreover, other countries have followed its lead. Brazil declared\nlast month that Merck was charging too much for Efavirenz. In recent\nweeks the health ministers of India, Malaysia and Kenya have also\nmuttered about pursuing compulsory licensing.\n\nAll this has sparked an almighty row. Many global-health advocates—\nincluding Bill Clinton, whose foundation works on HIV—have applauded\nthe trend, arguing that access to cheap generic drugs will greatly help\nthe poor. Last month the World Health Organisation passed a resolution\nsupporting compulsory licensing. America objected vociferously, but\nother rich countries supported the motion.\n\nDrug executives are furious. Jon Pender of GlaxoSmithKline, a British\ndrugs giant, insists that compulsory licensing was meant to be used\nonly “as a last resort”. He argues that although compulsory licensing\nis legal, TRIPS rules allow it only under limited circumstances, such\nas national health emergencies, and only after lengthy efforts to\nnegotiate prices with firms.\n\n“It is easy to see Big Pharma as a source of evil,” laments Daniel\nVasella, chairman of Novartis, a Swiss drugs giant. His firm is\ninvolved in a closely watched patent case in India that involves\nGleevec, a cancer drug. Without innovation, he insists, future\ngenerations will have fewer life-saving drugs—“which is equally\nunethical as lack of access now.” And as Fred Hassan, the boss of\nAmerica's Schering-Plough and head of the international pharmaceutical\nlobby, warns, “without intellectual property there is no innovation.”\n\nAt first sight, this row reflects an old dilemma that pits today's\npatients against tomorrow's. Compulsory licensing means that more Thais\nwill get HIV drugs now, but it also means that drugs firms will be less\nkeen to invest in drugs for Thailand in the future. Yet look closer and\nthis is more than a fight between the poor-country sick and rich-world\ndrugs companies. What makes it different is the role of two new actors:\nmuscular middle-income countries and the rising generics industry.\n\nThis controversy has been sparked not by the poorest countries, which\nalready get most of their drugs at low cost, but by middle-income ones.\nThey have long used the threat of compulsory licensing to win\ndiscounts, but by actually imposing such licensing they shift the\nbalance of power. Rudolf Van Puymbroeck, a former senior lawyer at the\nWorld Bank, likens compulsory licensing to other sorts of compulsory\nstate purchases: “Firms are upset not because this is illegal, but\nbecause they are in a very weak position to negotiate compensation\nafter expropriation.”\n\n\n\nFollow the money\nA perverse result of this trend is that middle-income countries are\ngetting cheaper drugs, whereas quieter and perhaps more deserving\nneighbours are not. Thailand's poor no doubt need treatment, but the\nmilitary regime is wealthy enough to spend more on health care. Richard\nEpstein of the University of Chicago law school has observed that there\nis nothing to stop AIDS organisations or foreign governments from\nbuying these products at a negotiated price and then giving them away\nfree. “Charity can come from anywhere, not just drug companies,” he\nnotes.\n\nEven experts devoted to the cause of helping the poor get access to\ndrugs see the trend as worrying. “Brazil is not Rwanda, which cannot\nafford to pay,” says Tadataka Yamada of the Gates Foundation, a giant\ncharity. Victoria Hale, head of OneWorld Health, an innovative non-\nprofit pharmaceutical firm, reckons that compulsory licensing could\nprove “the last blow” that pushes the drug industry away from looking\nfor cures for diseases of the poor world, which are already woefully\nneglected.\n\nWhether or not the poor end up suffering in the long run from\ndiminished innovation, a sure winner from the trend towards compulsory\nlicensing is the generic-drugs industry. Under a provision of the TRIPS\ntreaty, countries that invoke compulsory licensing but lack domestic\nmanufacturing are allowed to import generic drugs from another\ncountry.\n\nThis promises a gold rush for generics firms. Canada encourages\ndomestic firms to produce copycat drugs for precisely this reason. But\ntheir costs are so high that such exports cannot hope to compete with\nthe cheaper pills produced by India, argues Amir Attaran of the\nUniversity of Ottawa. Small wonder that executives at Cipla, one of the\nIndian firms already making generic versions of HIV drugs, warmly\napplaud the trend and welcome Brazil's support for compulsory\nlicensing, which they say “helps protect the rights of citizens”.\n\nDoes the future therefore belong to compulsory licensing? If so, there\nmay be trouble ahead for both pillmakers and punters. Dr Yamada fears\nthat compulsory licensing could prove “lethal” for the industry. He\nsuggests that drugs firms and middle-income countries ought instead to\nuse a sliding scale, based on GDP per head, to determine prices. Bruce\nLehman, a lawyer who worked on the TRIPS accord in the Clinton\nadministration, thinks it is cynical for middle-income countries “to\navoid paying their fair share of drug-discovery costs”. In doing so, he\nfears, they risk provoking a backlash from Americans who will, in\neffect, have to pay more as a result.\n\nBut things may not get that ugly, thanks to the growing influence of\ninnovators in developing countries themselves. Even in India drugs\npatents have their defenders. Ranbaxy is a local firm that made its\nname by manufacturing knock-off drugs. But ask Ramesh Adige, a member\nof its board, about the current trend and he is quick with his reply:\n“We do not encourage compulsory licensing.” He explains that his firm\nhas 1,100 researchers and invests 7% of its turnover in research and\ndevelopment. It already has process patents, and within two years hopes\nto have patents for novel drugs (for malaria, possibly). “We are very\nsupportive of intellectual-property rights, as innovations must be\ngiven their reward,” he says.\n\nThe reason Ranbaxy has gone from ignoring patents to defending them is\nthat it now has inventions of its own that it wishes to protect. Even\nYongyuth Yuthavong, Thailand's science and technology minister, seems\nto acknowledge the risk his country is taking. “As a scientist myself,\nI know the value of intellectual property,” he says. “In future, we\nshould develop our own drugs industry.”\n\n\n\n\n\n\n\n___________________________________________________________\n\nTiscali Broadband only £9.99 a month for your first 3 months! http://www.tiscali.co.uk/products/broadband/\n_______________________________________________\nIp-health mailing list\nIp-health@lists.essential.org\nhttp://lists.essential.org/mailman/listinfo/ip-health\n\n",0,1209,7824


Processed Data:


Unnamed: 0,text,label,word_count_processed,char_count_processed,char_reduction_%
2062,"[Ip-health] Economist Article [SEP] Drugs companies' patents are under attack. Will this really help the poor? Jun 7th 2007 | NEW YORK >From The Economist print edition NOBODY could fault Thailand for want of ambition. At the recent Bio conference, the largest annual gathering of the biotechnology industry, it pitched itself hard as an emerging pharmaceutical power, with a dazzling pavilion, visiting luminaries and free drinks for all. Instead, the arguments with Thailand are over means, not ends. At the end of last year the Thai government stunned the drugs industry when it said it would overrule international patents for Efavirenz, an anti-retroviral drug made by Merck, an American firm, and switch to a Thai-made generic copy at half the price. The country had signed the Agreement on Trade-Related Aspects of Intellectual Property Rights (TRIPS), which protects drugs patents. But that deal allows “compulsory licensing” only under special conditions—conditions that some complained Thailand did not fulfil. In the months since then, Thailand has said it would overrule the patents on two more drugs, and it may soon add a further pair to the list. Moreover, other countries have followed its lead. Brazil declared last month that Merck was charging too much for Efavirenz. In recent weeks the health ministers of India, Malaysia and Kenya have also muttered about pursuing compulsory licensing. All this has sparked an almighty row. Many global-health advocates— including Bill Clinton, whose foundation works on HIV—have applauded the trend, arguing that access to cheap generic drugs will greatly help the poor. Last month the World Health Organisation passed a resolution supporting compulsory licensing. America objected vociferously, but other rich countries supported the motion. Drug executives are furious. Jon Pender of GlaxoSmithKline, a British drugs giant, insists that compulsory licensing was meant to be used only “as a last resort”. He argues that although compulsory licensing is legal, TRIPS rules allow it only under limited circumstances, such as national health emergencies, and only after lengthy efforts to negotiate prices with firms. “It is easy to see Big Pharma as a source of evil,” laments Daniel Vasella, chairman of Novartis, a Swiss drugs giant. His firm is involved in a closely watched patent case in India that involves Gleevec, a cancer drug. Without innovation, he insists, future generations will have fewer life-saving drugs—“which is equally unethical as lack of access now.” And as Fred Hassan, the boss of America's Schering-Plough and head of the international pharmaceutical lobby, warns, “without intellectual property there is no innovation.” At first sight, this row reflects an old dilemma that pits today's patients against tomorrow's. Compulsory licensing means that more Thais will get HIV drugs now, but it also means that drugs firms will be less keen to invest in drugs for Thailand in the future. Yet look closer and this is more than a fight between the poor-country sick and rich-world drugs companies. What makes it different is the role of two new actors: muscular middle-income countries and the rising generics industry. This controversy has been sparked not by the poorest countries, which already get most of their drugs at low cost, but by middle-income ones. They have long used the threat of compulsory licensing to win discounts, but by actually imposing such licensing they shift the balance of power. Rudolf Van Puymbroeck, a former senior lawyer at the World Bank, likens compulsory licensing to other sorts of compulsory state purchases: “Firms are upset not because this is illegal, but because they are in a very weak position to negotiate compensation after expropriation.” Follow the money A perverse result of this trend is that middle-income countries are getting cheaper drugs, whereas quieter and perhaps more deserving neighbours are not. Thailand's poor no doubt need treatment, but the military regime is wealthy enough to spend more on health care. Richard Epstein of the University of Chicago law school has observed that there is nothing to stop AIDS organisations or foreign governments from buying these products at a negotiated price and then giving them away free. “Charity can come from anywhere, not just drug companies,” he notes. Even experts devoted to the cause of helping the poor get access to drugs see the trend as worrying. “Brazil is not Rwanda, which cannot afford to pay,” says Tadataka Yamada of the Gates Foundation, a giant charity. Victoria Hale, head of OneWorld Health, an innovative non- profit pharmaceutical firm, reckons that compulsory licensing could prove “the last blow” that pushes the drug industry away from looking for cures for diseases of the poor world, which are already woefully neglected. Whether or not the poor end up suffering in the long run from diminished innovation, a sure winner from the trend towards compulsory licensing is the generic-drugs industry. Under a provision of the TRIPS treaty, countries that invoke compulsory licensing but lack domestic manufacturing are allowed to import generic drugs from another country. This promises a gold rush for generics firms. Canada encourages domestic firms to produce copycat drugs for precisely this reason. But their costs are so high that such exports cannot hope to compete with the cheaper pills produced by India, argues Amir Attaran of the University of Ottawa. Small wonder that executives at Cipla, one of the Indian firms already making generic versions of HIV drugs, warmly applaud the trend and welcome Brazil's support for compulsory licensing, which they say “helps protect the rights of citizens”. Does the future therefore belong to compulsory licensing? If so, there may be trouble ahead for both pillmakers and punters. Dr Yamada fears that compulsory licensing could prove “lethal” for the industry. He suggests that drugs firms and middle-income countries ought instead to use a sliding scale, based on GDP per head, to determine prices. Bruce Lehman, a lawyer who worked on the TRIPS accord in the Clinton administration, thinks it is cynical for middle-income countries “to avoid paying their fair share of drug-discovery costs”. In doing so, he fears, they risk provoking a backlash from Americans who will, in effect, have to pay more as a result. But things may not get that ugly, thanks to the growing influence of innovators in developing countries themselves. Even in India drugs patents have their defenders. Ranbaxy is a local firm that made its name by manufacturing knock-off drugs. But ask Ramesh Adige, a member of its board, about the current trend and he is quick with his reply: “We do not encourage compulsory licensing.” He explains that his firm has 1,100 researchers and invests 7% of its turnover in research and development. It already has process patents, and within two years hopes to have patents for novel drugs (for malaria, possibly). “We are very supportive of intellectual-property rights, as innovations must be given their reward,” he says. The reason Ranbaxy has gone from ignoring patents to defending them is that it now has inventions of its own that it wishes to protect. Even Yongyuth Yuthavong, Thailand's science and technology minister, seems to acknowledge the risk his country is taking. “As a scientist myself, I know the value of intellectual property,” he says. “In future, we should develop our own drugs industry.” Tiscali Broadband only £9.99 a month for your first 3 months! [URL] Ip-health mailing list [EMAIL] [URL]",0,1208,7590,3.041922


In [38]:
sample_preprocessed_data(test_raw, n=1)

Original Data:


Unnamed: 0,subject,body,label,word_count_original,char_count_original
2142,🕖 7 Sekunden und Schluss mit dem Druckgefühl!,"Privatsph�re ist uns wichtig: So wichtig es uns ist, Ihrem Unternehmen zu helfen, so wichtig ist uns auch Ihre Privatsph�re.\n\nWir f�hlen uns dem Recht auf Ihre Privatsph�re verpflichtet und sind bestrebt, ein sicheres und sicheres Benutzererlebnis zu bieten. \n\nIn unseren Datenschutzrichtlinien wird erl�utert, wie wir die von Ihnen auf unserer Website bereitgestellten personenbezogenen Daten erheben, aufbewahren und verwenden .\n\nEs erkl�rt auch, wie wir es sicher und sicherstellen, dass wir es niemals so missbrauchen, wie es sein sollte privat bleiben.\n\nWas Unsere Privatsph�re sch�tzt: Wenn Sie unsere Website, stellen Sie uns m�glicherweise zwei Arten von Informationen zur Verf�gung: freiwillig bereitgestellten personenbezogenen Daten und der Website-Aktivit�t -Informationen, die auf kombinierter \n\nBasis gesammelt wurden, w�hrend Sie und andere navigieren unsere Online-Plattform \n\nWenn Sie unsere Website besuchen, legen wir gro�en Wert darauf, Ihre Privatsph�re zu gew�hrleisten und allen unseren Besuchern sicheres und qualitativ hochwertiges Online-Erlebnis zu bieten.\n\nWir verstehen, dass es Ihnen wichtig ist, wie die Informationen, die Sie uns zur Verf�gung stellen, mit uns teilen, Reihe von Datenschutzrichtlinien entwickelt, um Sie �ber unsere Richtlinien bez�glich der Erfassung, Nutzung und Offenlegung von Informationen zu informieren, die wir von Benutzern unserer Website erhalten .\n\nUnsere Datenschutzrichtlinien sowie unsere Allgemeinen Gesch�ftsbedingungen regeln Ihre Nutzung dieser Website. Indem Sie unsere Website nutzen oder unsere Benutzervereinbarung akzeptieren (per Opt-in, Kontrollk�stchen,Pop-up oder Ausw�hlen eines E-Mail-Links, der dies best�tigt),\n\nerkl�ren Sie sich damit einverstanden, daran gebunden zu sein unsere Allgemeinen Gesch�ftsbedingungen und Datenschutzrichtlinien .\n\nWenn Sie personalisierte, Zahlungsdaten oder andere optionale Informationen angegeben haben, k�nnen Sie diese ansehen , �berpr�fen und bearbeiten �ber die Richtlinien auf der Website oder indem Sie auf diese E-Mail antworten.\n\nUm Ihren Empfang von Marketing- und nicht-transaktionalen, indem Sie auf den Opt-out -Link unten klicken unten in jeder promotional-E-Mail.\n\nE-Mails im Zusammenhang mit Ihrem Kauf oder der Lieferung von Bestellungen werden automatisch bereitgestellt.\n\n\u2013 Kunden k�nnen transaktionsrelevante E-Mails nicht abbestellen. Wir werden uns bem�hen, alle diesbez�glichen Anforderungen zu erf�llen. zur Verwaltung personenbezogener Daten zeitnah.\n\nAllerdings ist es nicht immer wahrscheinlich , Informationen in unseren Datenbanken vollst�ndig zu entfernen oder zu �ndern (zum Beispiel, wenn wir eine rechtliche Verpflichtung), sie f�r bestimmte Zeitr�ume aufzubewahren, zum Beispiel Beispiel).\n\nWenn Sie irgendwelche Abfragen haben, antworten einfach auf diese E-Mail oder erkunden Sie unsere Website, um unsere formelle Richtlinien.",1,380,2970


Processed Data:


Unnamed: 0,text,label,word_count_processed,char_count_processed,char_reduction_%
2142,"🕖 7 Sekunden und Schluss mit dem Druckgefühl! [SEP] Privatsph�re ist uns wichtig: So wichtig es uns ist, Ihrem Unternehmen zu helfen, so wichtig ist uns auch Ihre Privatsph�re. Wir f�hlen uns dem Recht auf Ihre Privatsph�re verpflichtet und sind bestrebt, ein sicheres und sicheres Benutzererlebnis zu bieten. In unseren Datenschutzrichtlinien wird erl�utert, wie wir die von Ihnen auf unserer Website bereitgestellten personenbezogenen Daten erheben, aufbewahren und verwenden . Es erkl�rt auch, wie wir es sicher und sicherstellen, dass wir es niemals so missbrauchen, wie es sein sollte privat bleiben. Was Unsere Privatsph�re sch�tzt: Wenn Sie unsere Website, stellen Sie uns m�glicherweise zwei Arten von Informationen zur Verf�gung: freiwillig bereitgestellten personenbezogenen Daten und der Website-Aktivit�t -Informationen, die auf kombinierter Basis gesammelt wurden, w�hrend Sie und andere navigieren unsere Online-Plattform Wenn Sie unsere Website besuchen, legen wir gro�en Wert darauf, Ihre Privatsph�re zu gew�hrleisten und allen unseren Besuchern sicheres und qualitativ hochwertiges Online-Erlebnis zu bieten. Wir verstehen, dass es Ihnen wichtig ist, wie die Informationen, die Sie uns zur Verf�gung stellen, mit uns teilen, Reihe von Datenschutzrichtlinien entwickelt, um Sie �ber unsere Richtlinien bez�glich der Erfassung, Nutzung und Offenlegung von Informationen zu informieren, die wir von Benutzern unserer Website erhalten . Unsere Datenschutzrichtlinien sowie unsere Allgemeinen Gesch�ftsbedingungen regeln Ihre Nutzung dieser Website. Indem Sie unsere Website nutzen oder unsere Benutzervereinbarung akzeptieren (per Opt-in, Kontrollk�stchen,Pop-up oder Ausw�hlen eines E-Mail-Links, der dies best�tigt), erkl�ren Sie sich damit einverstanden, daran gebunden zu sein unsere Allgemeinen Gesch�ftsbedingungen und Datenschutzrichtlinien . Wenn Sie personalisierte, Zahlungsdaten oder andere optionale Informationen angegeben haben, k�nnen Sie diese ansehen , �berpr�fen und bearbeiten �ber die Richtlinien auf der Website oder indem Sie auf diese E-Mail antworten. Um Ihren Empfang von Marketing- und nicht-transaktionalen, indem Sie auf den Opt-out -Link unten klicken unten in jeder promotional-E-Mail. E-Mails im Zusammenhang mit Ihrem Kauf oder der Lieferung von Bestellungen werden automatisch bereitgestellt. \u2013 Kunden k�nnen transaktionsrelevante E-Mails nicht abbestellen. Wir werden uns bem�hen, alle diesbez�glichen Anforderungen zu erf�llen. zur Verwaltung personenbezogener Daten zeitnah. Allerdings ist es nicht immer wahrscheinlich , Informationen in unseren Datenbanken vollst�ndig zu entfernen oder zu �ndern (zum Beispiel, wenn wir eine rechtliche Verpflichtung), sie f�r bestimmte Zeitr�ume aufzubewahren, zum Beispiel Beispiel). Wenn Sie irgendwelche Abfragen haben, antworten einfach auf diese E-Mail oder erkunden Sie unsere Website, um unsere formelle Richtlinien.",1,381,2915,1.851852


### No subject / body

In [39]:
df = read_dataset(train_raw)
df = process_text_columns(df)

no_subject_count = (df['subject'] == '[NO_SUBJECT]').sum()
no_body_count = (df['body'] == '[NO_BODY]').sum()

print(f"Rows with [NO_SUBJECT]: {no_subject_count}")
print(f"Rows with [NO_BODY]: {no_body_count}")

Rows with [NO_SUBJECT]: 101
Rows with [NO_BODY]: 0


### Preprocess All

In [15]:
prepare_and_save_full_data(train_raw, train_preprocessed)

Char Count (Original): 36866391
Char Count (Processed): 31624838
Char Reduction (Overall): 14.22%
Char Reduction (Avg. per row): 11.43%
Processed saved to c:\Users\ilian\Documents\Projects\git_projects\university\phishing_detection\data\preprocessed\data_mail\train_processed_balanced.csv


In [40]:
prepare_and_save_full_data(test_raw, test_preprocessed)

Char Count (Original): 15347973
Char Count (Processed): 12392612
Char Reduction (Overall): 19.26%
Char Reduction (Avg. per row): 15.94%
Processed saved to c:\Users\ilian\Documents\Projects\git_projects\university\phishing_detection\data\preprocessed\data_mail\test_processed_de.csv
