# Pre-Processing

In [5]:
from google.colab import drive
from google.colab import files

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import pandas as pd
import re
import csv
import sys
import uuid
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # Bag of Words: Term Frequencey and TFIDF
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

# Regex Expressions
REGEX_PATTERNS = {
    'currency': r'[$€£¥]\s*\d+(?:[.,]\d+)?|\d+(?:[.,]\d+)?\s*(?:USD|EUR|GBP|JPY|CAD|AUD|CHF)',

    'time': r'\b(?:[01]?\d|2[0-3]):[0-5]\d(?::[0-5]\d)?(?:\s*[aApP][mM])?\b',

    'day' : r'(?i) (sun|mon|tue(s)?|wed(nesday)?|thu(r(s)?)?|fri)(day|\.)? ', # note the spaces at the beginning and end

    'date': r'(?i)\b(?:\d{1,2}[-\/\.]\d{1,2}[-\/\.]\d{2,4}|\d{4}[-\/\.]\d{1,2}[-\/\.]\d{1,2}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}(?:[a-z]{2})?,?\s+\d{2,4}|\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*,?\s+\d{2,4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}(?:\s?[—-]\s?\d{1,2})?)\b',

    'phone': r'^(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}$',

    'percentage': r'\b(?<!\.)(?!0+(?:\.0+)?%)(?:\d|[1-9]\d|100)(?:(?<!100)\.\d+)?%',

    'number': r'\b\d+(?:[.,]\d+)?\b',

    'email': r'(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])',

    'url' : r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
}

## Load Dataset(s)

In [7]:

pd.set_option('display.max_colwidth', None)
csv.field_size_limit(sys.maxsize)

file_paths = {'/content/drive/MyDrive/COS720 Project/Datasets/CEAS_08.csv',
              '/content/drive/MyDrive/COS720 Project/Datasets/Nigerian_Fraud.csv',
              '/content/drive/MyDrive/COS720 Project/Datasets/Ling.csv'}

def load_email_data(file_path):

  with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        reader = csv.reader(f, quotechar='"', escapechar='\\')
        headers = next(reader)
        data = []
        for row in reader:
            if len(row) == len(headers):
                data.append(row)
            else:
                print(f"Skipping malformed row: {row}")

        df = pd.DataFrame(data, columns=headers)
        print(f"Loaded data with fallback method: {len(df)} rows")
        return df

## Clean Data

### Extract emails from body

### Converting to lowercase

In [8]:
def to_lowercase(df, text_column='body'):
  df_copy = df.copy()

  df_copy[text_column] = df_copy[text_column].apply(
      lambda x: x.lower() if isinstance(x, str) else x
  )

  return df_copy

### Replacing text with classes

In [9]:
## we replace currency with <CUR>, time with <TIME>, date with <DATE>, phone numbers with <PHONE>, percentages with <PERC>, and other general numbers that do not fall into the previous categories with <NUM>

def replace_data_categories(text):
    modified_text = text

    # Replace each type in order (specific to general)
    modified_text = re.sub(REGEX_PATTERNS['currency'], '<cur>', modified_text) # currency
    modified_text = re.sub(REGEX_PATTERNS['time'], '<time>', modified_text) # times
    modified_text = re.sub(REGEX_PATTERNS['day'], '<day>', modified_text) # times
    modified_text = re.sub(REGEX_PATTERNS['date'], '<date>', modified_text) # dates
    modified_text = re.sub(REGEX_PATTERNS['phone'], '<phone>', modified_text) # phone numbers
    modified_text = re.sub(REGEX_PATTERNS['percentage'], '<perc>', modified_text) # percentages
    modified_text = re.sub(REGEX_PATTERNS['number'], '<num>', modified_text) # general numbers

    return modified_text

### Extract emails from body

In [10]:
def extract_urls_from_text(text):
    if not isinstance(text, str):
        return []

    # Regex pattern to match URLs
    url_pattern = re.compile(REGEX_PATTERNS['url'])
    return url_pattern.findall(text)

def replace_urls_with_tag(text):
    if not isinstance(text, str):
        return text

    url_pattern = re.compile(REGEX_PATTERNS['url'])
    return url_pattern.sub('<URL>', text) #  Replace all URLs in text with <URL> tag


def process_email_data(df):

    print(f"Original dataframe shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")

    url_data = []

    # ID to link URLs back to emails
    df['email_id'] = [str(uuid.uuid4()) for _ in range(len(df))]

    # Extract and replace URLs -- Subject to further processing
    for idx, row in df.iterrows():
        email_id = row['email_id']
        body = row['body']

        urls = extract_urls_from_text(body)

        for url in urls:
            url_data.append({
                'email_id': email_id,
                'url': url,
                'sender': row['sender'],
                'label': row['label']
            })

        # Replace urls
        df.at[idx, 'body'] = replace_urls_with_tag(body)

        # Replace data categories
        df.at[idx, 'body'] = replace_data_categories(row['body'])

    # Lowercase
    df = to_lowercase(df)

    # Create URL dataframe
    url_df = pd.DataFrame(url_data)

    print(f"Modified dataframe shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"URL dataframe shape: {url_df.shape}")

    return df, url_df

data_frames = []

for file_path in file_paths:
     data_frames.append(load_email_data(file_path))

combined_df = pd.concat(data_frames, join='outer', ignore_index=True)
combined_df, url_df = process_email_data(combined_df)

combined_df['label'] = combined_df['label'].astype(int)
print(combined_df.shape)
combined_df

Loaded data with fallback method: 2859 rows
Loaded data with fallback method: 39154 rows
Loaded data with fallback method: 3332 rows
Original dataframe shape: (45345, 7)
Columns: ['subject', 'body', 'label', 'sender', 'receiver', 'date', 'urls']
Modified dataframe shape: (45345, 8)
Columns: ['subject', 'body', 'label', 'sender', 'receiver', 'date', 'urls', 'email_id']
URL dataframe shape: (132701, 4)
(45345, 8)


Unnamed: 0,subject,body,label,sender,receiver,date,urls,email_id
0,job posting - apple-iss research center,"content - length : <num> apple-iss research center a us <cur> million joint venture between apple computer inc . and the institute of systems science of the national university of singapore , located in singapore , is looking for : a senior speech scientist - - - - - - - - - - - - - - - - - - - - - - - - - the successful candidate will have research expertise in computational linguistics , including natural language processing and * * english * * and * * chinese * * statistical language modeling . knowledge of state-of - the-art corpus-based n - gram language models , cache language models , and part-of - speech language models are required . a text - to - speech project leader - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - the successful candidate will have research expertise expertise in two or more of the following areas : computational linguistics , including natural language parsing , lexical database design , and statistical language modeling ; text tokenization and normalization ; prosodic analysis . substantial knowledge of the phonology , syntax , and semantics of chinese is required . knowledge of acoustic phonetics and / or speech signal processing is desirable . both candidates will have a phd with at least <num> to <num> years of relevant work experience , or a technical msc degree with at least <num> to <num> years of experienc e . very strong software engineering skills , including design and implementation , and productization are required in these positions . knowledge of c , c + + and unix are preferred . a unix & c programmer - - - - - - - - - - - - - - - - - - - - we are looking for an experienced unix & c programmer , preferably with good industry experience , to join us in breaking new frontiers . strong knowledge of unix tools ( compilers , linkers , make , x - windows , e - mac , . . . ) and experience in matlab required .<day>and silicon graphic experience is an advantage . programmers with less than two years industry experience need not apply . these positions include interaction with scientists in the national university of singapore , and with apple 's speech research and productization efforts located in cupertino , california . attendance and publication in international scientific / engineering conferences is encouraged . benefits include an internationally competitive salary , housing subsidy , and relocation expenses . _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ send a complete resume , enclosing personal particulars , qualifications , experience and contact telephone number to : mr jean - luc lebrun center manager apple - iss research center , institute of systems science heng mui keng terrace , singapore <num> tel : ( <num> ) <num>-<num> fax : ( <num> ) <num>-<num> email : jllebrun @ iss . nus . sg\n",0,,,,,dddb6946-f92f-47bf-93f6-d8b192fd79c4
1,,"lang classification grimes , joseph e . and barbara f . grimes ; ethnologue language family index ; pb . isbn : <num>-<num> - <num> - <num> ; vi , <num> pp . ; <cur> . <num> . summer institute of linguistics . this companion volume to ethnologue : languages of the world , twelfth edition lists language families of the world with sub-groups shown in a tree arrangement under the broadest classification of language family . the language family index facilitates locating language names in the ethnologue , making the data there more accessible . internet : academic . books @ sil . org languages , reference lang & culture gregerson , marilyn ; ritual , belief , and kinship in sulawesi ; pb . : isbn : <num>-<num> - <num> - <num> ; ix , <num> pp . ; <cur> . <num> . summer institute of linguistics . seven articles discuss five language groups in sulawesi , indonesia ; the primary focus is on cultural matters , with some linguistic content . topics include traditional religion and beliefs , certain ceremonies , and kinship . internet : academic . books @ sil . org language and society , indonesia computers & ling weber , david j . , stephen r . mcconnel , diana d . weber , and beth j . bryson ; primer : a tool for developing early reading materials ; pb . : isbn : <num>-<num> - <num> - <num> ; xvi , <num> pp . + ms-dos software ; <cur> . <num> . summer institute of linguistics . the authors present a computer program and instructions for developing reading materials in languages with little or no background in literacy . the book is structured as a how-to manual with step by step procedures to establish an appropriate primer sequence and to organize words , phrases , and sentences that correlate with the sequence . it presupposes a thorough knowledge of linguistics . internet : academic . books @ sil . org literacy , computer\n",0,,,,,61c51246-2293-433b-a3c5-c9300e2d456c
2,query : letter frequencies for text identification,"i am posting this inquiry for sergei atamas ( satamas @ umabnet . ab . umd . edu ) , a research associate at the university of maryland at baltimore . his field is molecular biology , and his work involves comparing dna strings using various algorithms . i do n't understand the details well enough to pass them along . at any rate , one such algorithm relies upon frequencies with which the letters g , a , t , and c occur in the dna strings . he would like to explore the analogous use of letter ( sound ) frequencies in natural language texts . hence this posting . specifically , sergei wonders if any linguist subscribers could help steer him to recent literature concerning text identification based on letter frequencies . any suggestions could be sent directly to him at the above address , or to me and i ' ll pass them along . he would also be interested in collaborative work if this research connects with the work of any linguists or text processing specialists . he observes that very often work in one field would actually help work in a far-removed field , if only people knew what was going on over there . george fowler george fowler gfowler @ indiana . edu [ email ] dept . of slavic languages * * <num>-<num> - <num>-<num> [ home ] * * [ try here first ! ] ballantine <num> <num>-<num> - <num>-<num> / - <num> / - <num> [ dept . ] indiana university <num>-<num> - <num>-<num> [ office ] bloomington , in <num> usa <num>-<num> - <num>-<num> [ dept . fax ]\n",0,,,,,2f035236-76ef-4107-b9cd-b33f651fa0aa
3,risk,a colleague and i are researching the differing degrees of risk perceived by our hong kong students in different contexts where spoken english is required . we would be interested to find out more about research in the area of risk-taking in language learning . so far we have n't come up with much . can anyone help here ?\n,0,,,,,887050f6-52f8-4401-95d4-f5789b1cf161
4,request book information,"earlier this morning i was on the phone with a friend of mine living in south america . as we were talking in spanish , he said : "" si voy a la liberi ' a , comprare ' el libro "" which can be rendered into english as "" if i go to the bookstore , i will purchase it "" . i found this expression a bit unusual so i asked him saying that he really meant to say "" si fuese a la libreri ' a , comprari ' a el libro "" or "" if i were to go to the bookstore , i would buy it "" to which he said to me , "" ah , the subjunctive is dead in spanish ! "" . weather this is a matter of subjunctive discussion or not , is something to be left for another time . nevertheless , he mentioned in the course of our conversation that there is a book ( a spanish translation of a french original ) titled something like "" la muerte del subjuntivo "" or "" the demise / death of the subjunctive "" . does any one know of this book ? or books which may deal with similar content ? any and all help will be appreciated . joseph m kozono < kozonoj @ gunet . georgetown . edu >",0,,,,,70d23962-b9ef-46aa-af5b-98d71da18fd7
...,...,...,...,...,...,...,...,...
45340,CONTACT GLOBAL MAX SHIPING COMPANY,"atten: my dear ,\n \ni have paid the fee for your cheque draft.because the manager of ecobank\nbenin told me that before the check will get to you that it willexpire.\nso i told him to cash <cur>.<num> however all the necessary arrangement\nof delivering the <cur>.<num> in cash was made with global max shiping\ncourier company. this is the information they need to delivery your package\nto you the only money you have to send to them is there security keeping\nfee which is <cur> us dollars to received your package.\n .\nattn: dr.james eze\n\ne-mail :(globalmaxshipingcompany08@yahoo.fr) \nphone number:+<num>-<date>-<num>\nplease, send them your contacts information to enable them locate youimmediately\nthey arrived in your country with your box.\nthis is what they need from you.\n\n<num>.your full name......\n<num>.home address..........\n<num>.current home telephone number......\n<num>.current office telephone..........\n<num>.a copy of your picture..........\n \nplease make sure you send this needed info's to the director general of\nglobal max shiping courier company dr.james eze with the address\ngiven to you.\n \nnote ;the global max courier company don't know the content of the\nbox. i registered it as a box of an africa cloths. they don't know its\ncontent is money. this is to avoid them delaying with the box. don't let\nthem know that is money that is in that box. i am waiting for your\nurgent response. \n \nthanks and remain blessed. \n \nmicheal agu. \n \n\n\n\n\n\n\n\n\n\n",1,michealagu0255@zipmail.com.br,,,0,cd0a405e-9e56-4c1b-818e-08d003213fe3
45341,TREAT AS URGENT.,"\nfrom: mr ali sherif. african development bank (adb)ouagadougou , burkina faso . \ndear friend, \ni am the manager of foreign remittance and exchange dept at african development bank (adb). burkina- faso, west africa .\nin my department we discover an abandoned sum of usd<cur>m dollars (twelve million, six hundred thousand us dollar). in an account that belongs to one of our foreign customer who died alongside with his entire family in a plane crash the year <num> that took the whole life of the passengers on board.\n \nsince we got information about his death, the bank have been expecting his next of kin to come over and claim his money because the bank cannot release it unless someone applies for it as next of kin or relation to the deceased as indicating in our banking guideline, but unfortunately we learnt that all his supposed next of kin or relation that knows about the account information died alongside with him at the plane crash leaving nobody behind for the claim.\n \nit is therefore upon this discovery that i now decided to make this business proposal to you and for the money to be release to you as the next of kin or relation to the deceased, since nobody will ever come for it, we do not want this money to go into the treasury of the bank as unclaimed bill or fund.\n \nthe bank law and guideline here stipulates that if such money remained unclaimed after six years, the money will be transferred into the bank treasury account as unclaimed fund. the request of foreigner as next of kin in this transaction is occasioned by the fact that the customer was a foreigner and a citizen cannot stand as next of kin to a foreigner.\nin subsequent disbursement of the money, i agree that <perc> of this money will be for you in respect of the provision you made by standing in the claim as next of kin,<perc> will be set aside for expenses in cure during the transaction on taxes that will arise and <perc> would be for me.\nas soon as the money hit into your account i will come over to your country for disbursement according to the percentages indicated above.\nto make the claim, you will first of all apply to the bank by sending a text of application which i will send to you for you to fill and send to the bank. in the application you will indicate your bank account, your private fax and your private telephone number for easy and effective communication.\n \nupon receipt of your reply, i will send to you by email the text of application which you will send to the bank. i will not fail to bring to your notice that this transaction is hitch free and you do not entertain any atom of fear as all required arrangements have been made for the success of the transfer. \n \nyou should contact me immediately as soon as you receive this letter.\nhoping to hear from you immediately. plane crash web site... http://news.bbc.co.uk/<num>/hi/world/europe/<num>.stm \nthanks.\n_________________________________________________________________\nnews, entertainment and everything you care about at live.com. get it now!\nhttp://www.live.com/getstarted.aspx",1,ali sherif <ali_sherif252@hotmail.fr>,,"Mon, 17 Sep 2007 22:28:11 +0000",1,a94e7070-e4ad-4ab1-98b8-d4d250e17430
45342,From Dr Usman Ibrahim / Mr Wahid Yoffe property.,"\nfrom dr usman ibrahim danko.auditing and accounting unit.foreign operations department.banque togolaise pour le commerce et l'industrie,lome- togo. \n \ndear friend, i do not intend to cause any grief to you. the reason for sending this mail is very fundamental to the doctrine of human privileges and right. my name is dr usman ibrahim danko i am quite convicned of the fact that this will come to you as a surprise.however i am writing based on the privlage information i garthered about you in internet during my extensive search for a repose foreign partner who is compitent of assisting me in below business proposal. i am the principal manager controlling the auditing and accounting section of banque togolaise pour le commerce et l'inustrie lome togo. in west africa with due respect and regard. \n \ni have decided to contact you on a business transaction that will be very beneficial to both of us at the end of the transaction. during our investigation and auditing in this bank, my department came across a very huge sum of money belonging to a deceased person mr wahid yoffe who died in the horrific asian tsunamis of dec. <num>, <num>, that has killed more than all 20th- century tsunamis combined that happened in sumatra island, indonesia. \n \n>from the information that our bank have gotten so far, the tsunami killed him with his entire family and no none relation have been identified. the person that he used as his next of kin was his first son of <num> years old who died along side with the entire family. although personally, i keep this information secret within myself and partner to enable the whole plans and idea be profitable and successful during the time of execution. the said amount was (us<cur>m). \n \nmeanwhile all the whole arrangement to put claim over this fund as the bonafide next of kin to the deceased, get the required approval and transfer this money to a foreign account has been put in place and directives and needed information will be relayed to you as soon as you indicate your interest and willingness to assist us and also benefit your self to this great business opportunity that comes once in life. in fact i could have done this deal alone but because of my position in this country as a civil servant, we are not allowed to perate a foreign account and would eventually raise an eye brow on my side during the time of transfer because i work in this bank. \n \nthis is the actual reason why it will require a second party or fellow who will forward claims as the next of kin with affidavit of trust of oath to the bank and also resent a foreign account where he will need the money to be re-transferred in to on his request as it may be after due verification and clarification by the correspondent branch of the bank where the whole money will be remitted from to your own designation bank account. \n \nmay i at this point emphasize that this transaction is <perc> risk free as i have made arrangements for a successful deal before contacting you. on smooth conclusion of this transaction, you will be entitled to <perc> of the total sum as gratification, while <perc> will be set aside to take care of expenses that may arise during the time of transfer processing and also telephone bills, while <perc> will be for me and my partner. \n \nplease, you have been adviced to keep top secret as we are still in service and intend to retire from service after we conclude this deal with you. i will be monitoring the whole situation here in this bank until you confirm the money in your account and ask us to come down to your country for subsequent sharing of the fund according to percentages previously indicated and further investment, either in your country or any country you advice us to invest in. \n \nall other necessary information will be sent to you when i hear from you. i suggest you get back to me as soon as possible stating your wish in this deal.\n yours sincerely, dr usman ibrahim danko.\n_________________________________________________________________\ninvite your mail contacts to join your friends list with windows live spaces. it's easy!\nhttp://spaces.live.com/spacesapi.aspx?wx_action=create&wx_url=/friends.aspx&mkt=en-us",1,Dr Usman Ibrahim Danko <drusmanibrahimtg08@hotmail.fr>,,"Tue, 18 Sep 2007 10:54:53 +0000",1,5ecdcc69-cf9e-4c55-b600-7e69654fe354
45343,My Beloved In Christ.,"\nbeloved in the lord jesus christ, please endeavour to use it for the\nchildren of god.\n\nmy name is mother doris killam 63years old woman from united states of\namerica. i am married to engineer pitt killam who till his death worked\nwith willbros, a u.s oil engineering firm here in nigeria, we were\nmarried for thirteen (<num>) years without a child.\n\nhe died on saturday, <date> after my late husband eng. pitt\nand eight (<num>) other foreign oil workers were abducted by militia groups\nactive in the niger delta region of nigeria, on the process of\nnegotiation by the nigerian government and the militant group\nunfortunately, my\nhusband eng. pitt killam died, before his death, we deposit a sum of\n<cur> million dollars which was proceed of a contract work he just\nconcluded with the nigerian ports authority.\n\nplease find fact on the information from the website below.\n\nhttp://news.bbc.co.uk/<num>/hi/africa/<num>.stm\n\nafter his painful death i decided not to re-marry or get a child\noutside my matrimonial home. my doctor told me that i would not last\nfor the\nnext three months due to cancer problem. though what disturbs me most\nis my partial stroke as a result of high blood pressure.\n\nhaving known my condition i decided to donate this funds to church or\nbetter still a christian individual that will utilize this money the\nway\ni am going to instruct herein. i want a church or individual that will\nuse this money to fund churches, orphanages and widows propagating the\nword of god and to ensure that the house of god is maintained.\n\nthe bible made us to understand that blessed is the hand that grivet. i\ntook this decision because i dont have any child that will\ninherit\nthis money and my husband's relatives are not christians and i\ndont\nwant my husband's hard earned money to be misused by unbelievers. i\ndont want a situation where this money will be used in an\nungodly\nmanner, hence the reason for taking this bold decision.\n\ni am not afraid of death hence i know where i am going. i know that i\nam going to be in the bosom of the lord. exodus <num> vs <num> says that the\nlord will fight my case and i shall hold my peace. with god all things\nare possible. as soon as i receive your reply i will want you to give\nme\nyour\n\nname:...........................\naddress:........................\nsex/age:........................\nphone:..........................\noccupation:.....................\n\ni will send a copy to the bank. for identification when you will\ncontact them. i want you and the church to always pray for me because\nthe\nlord is my shepherd and i shall not want.\n\nmy happiness is that i lived a life of a worthy christian. please\nassure me that you will act accordingly as i stated herein.\n\nnote: this must be kept confidentially from eyes and ears of my\nhusband's family. hoping to hear from you.\n\nreply me and remain blessed in the name of the lord.\n\nregards,\nmrs. doris killam.\n\nprivate email reply me here:(motherdorisk9@yahoo.com.hk)\n_________________________________________________________________\nconnect to the next generation of msn messenger \nhttp://imagine-msn.com/messenger/launch80/default.aspx?locale=en-us&source=wlmailtagline\n\n",1,Mother Doris Killam <motherdorisk61@hotmail.com>,,"Wed, 19 Sep 2007 00:52:16 +0100",1,ef1c6185-78ae-4af4-a628-42b40a9e31f1


### Splitting data

In [19]:
# Split data into test and train subsets
X_train, X_test, target_train, target_test = train_test_split(combined_df['body'], combined_df['label'], train_size=0.7, random_state=1)
print(target_test.head(10))

45258    1
22057    1
21931    0
43867    1
36030    1
10926    1
29301    1
22653    1
44468    1
44120    1
Name: label, dtype: int64


### Removing non-word and non special characters

### Tokenization and Vectorization

In [20]:
# Initialize vecotrizer
def initialise_tfidf_vectorizer(data, max_features = 2000):
    vectorizer_tfidf = TfidfVectorizer(stop_words='english', max_features=max_features) # remove stop words
    vectorizer_tfidf.fit(data)
    X = vectorizer_tfidf.transform(data)
    return X, vectorizer_tfidf

X_train_tfidf, vectorizer_tfidf = initialise_tfidf_vectorizer(X_train)

### Lemmatization

### Train/Import and Test Model(s)

In [21]:
from sklearn.neural_network import MLPClassifier
import joblib

MODEL_PATH = "/content/drive/MyDrive/COS720 Project/Models/body_classifier.joblib"

try:
    # Try loading existing model
    clf = joblib.load(MODEL_PATH)
    print("Loaded existing model from", MODEL_PATH)
except FileNotFoundError:
    print("Model file not found. Training a new model...")
    clf = MLPClassifier(hidden_layer_sizes=(100,),  # one hidden layer with 100 units
                        activation='logistic',
                        solver='adam',
                        max_iter=200,
                        random_state=0)

    # Train the model
    clf.fit(X_train_tfidf, target_train)

# scores = cross_val_score(
#     clf,
#     X_train_tfidf,
#     target_train,
#     cv=5,
#     scoring='f1'
# )

# print("5-fold Cross-Validation F1 score for Word Frequency: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Loaded existing model from /content/drive/MyDrive/COS720 Project/Models/body_classifier.joblib


In [22]:
# from sklearn.pipeline import make_pipeline
# import joblib

# # Build the pipeline (no SVD)
# pipeline = make_pipeline(
#     vectorizer_tfidf,  # fitted TfidfVectorizer
#     clf                # trained classifier
# )

# # Save the pipeline
# joblib.dump(pipeline, 'simple_text_classifier.joblib')

### Testing

In [34]:
test_file_path = "/content/drive/MyDrive/COS720 Project/Datasets/SpamAssasin.csv"

# Evaluate model using different data file from training
with open(test_file_path, 'r', encoding='utf-8', errors='replace') as f:
  reader = csv.reader(f, quotechar='"', escapechar='\\')
  headers = next(reader)
  data = []
  for row in reader:
      if len(row) == len(headers):
          data.append(row)
  test_df = pd.DataFrame(data, columns=headers)
  test_df['label'] = test_df['label'].astype(int)
  print(f"Loaded data with fallback method: {len(test_df)} rows")

test_df, test_url_df = process_email_data(test_df)

# y_pred = clf.predict(X_test)
# y_test = target_test

y_test = test_df['label']
y_pred = clf.predict(test_df['body'])

f1 = f1_score(y_test, y_pred)  # by default, pos_label=1
print(f"F1 Score: {f1:.4f}")

Loaded data with fallback method: 5805 rows
Original dataframe shape: (5805, 7)
Columns: ['sender', 'receiver', 'date', 'subject', 'body', 'label', 'urls']
Modified dataframe shape: (5805, 8)
Columns: ['sender', 'receiver', 'date', 'subject', 'body', 'label', 'urls', 'email_id']
URL dataframe shape: (23462, 4)
F1 Score: 0.8091
