In [157]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# data manipulation and text 
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Morsi Store
[nltk_data]     DZ\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [158]:
#file paths
ham = "ham"
spam = "spam_2/spam_2"



In [159]:

#Loads text data from files located in the specified path.

def get_data(path):
    # setting an empty list to store the data
    data = []
    files = os.listdir(path) # getting the files
    
    for file in files:
        # getting the file path
        file_path = os.path.join(path, file)
        
        # opening the file and loading it in memory
        with open(file_path, encoding="ISO-8859-1") as processed_file:
            # reading the file
            words_list = processed_file.read()
            # appending the processed data
            data.append(words_list)
    
    # returning the final result
    return data

In [160]:

ham_data = get_data(ham)
spam_data = get_data(spam)


In [161]:
# cleaning the emails
#turning emails list into dictionaries with keys : sender, recipient, subject, date, content

def clean_emails(emails):
    # a list of cleaned emails to store them
    cleaned_emails = []
    
    for email in emails:
        # Split the email into lines
        lines = email.split('\n')

        # Get the content of the email
        content = ''
        for line in lines:
            if line.startswith('Subject:'):
                subject = line.replace('Subject:', '').strip()
            elif line.startswith('From:'):
                sender = line.replace('From:', '').strip()
            elif line.startswith('To:'):
                recipient = line.replace('To:', '').strip()
            elif line.startswith('Date:'):date = line.replace('Date:', '').strip()
            elif line.startswith('X-'):
                continue
            else:
                content += line.strip()

        # Append the cleaned email to the list of cleaned emails
        cleaned_emails.append({'sender': sender,
                               'recipient': recipient,
                               'subject': subject,
                               'date': date,
                               'content': content})

    return cleaned_emails

In [162]:
cleaned_ham = clean_emails(ham_data)
cleaned_spam = clean_emails(spam_data)

In [163]:
legitimate = pd.DataFrame(cleaned_ham)
spam = pd.DataFrame(cleaned_spam) 

In [164]:
legitimate.head()


Unnamed: 0,sender,recipient,subject,date,content
0,Robert Elz <kre@munnari.OZ.AU>,Chris Garrigues <cwg-dated-1030377287.06fa6d@D...,Re: New Sequences Window,"Thu, 22 Aug 2002 18:26:25 +0700",From exmh-workers-admin@redhat.com Thu Aug 22...
1,"""CNET News.com Daily Dispatch"" <Online#3.19578...",qqqqqqqqqq-cnet-newsletters@example.com,CNET NEWS.COM: Cable companies cracking down o...,"Tue, 9 Jul 2002 15:54:30 -0700 (PDT)",Return-Path: <Online#3.19578.34-UgGTgZFN19NAr9...
2,CNET Shopper Newsletter Alerts <Online#3.19584...,qqqqqqqqqq-cnet-newsletters@example.com,Save an extra $50 on the iPaq 3835 PDA (CNET S...,"Tue, 9 Jul 2002 16:06:08 -0700 (PDT)",Return-Path: <Online#3.19584.83-p1SYlJ1blFvQjR...
3,Steve Burt <Steve_Burt@cursor-system.com>,"""'zzzzteana@yahoogroups.com'"" <zzzzteana@yahoo...",[zzzzteana] RE: Alexander,"Thu, 22 Aug 2002 12:46:18 +0100",From Steve_Burt@cursor-system.com Thu Aug 22 ...
4,"""CNET Download.com for Mac"" <Online#3.19586.b5...",qqqqqqqqqq-zdnet@example.com,"This week: Deck, Tex-Edit Plus, Boom","Tue, 9 Jul 2002 18:53:01 -0700 (PDT)",Return-Path: <Online#3.19586.b5-9w0blztbvHPdZd...


In [165]:
spam.head()

Unnamed: 0,sender,recipient,subject,date,content
0,"""Start Now"" <startnow2002@hotmail.com>",ilug@linux.ie,[ILUG] STOP THE MLM INSANITY,"Fri, 02 Aug 2002 23:37:59 0530",From ilug-admin@linux.ie Tue Aug 6 11:51:02 ...
1,lmrn@mailexcite.com,ranmoore@cybertime.net,"Real Protection, Stun Guns! Free Shipping! Ti...","Mon, 28 Jul 1980 14:01:35",From lmrn@mailexcite.com Mon Jun 24 17:03:24 ...
2,amknight@mailexcite.com,cbmark@cbmark.com,"New Improved Fat Burners, Now With TV Fat Abso...","Wed, 30 Jul 1980 18:25:49",From amknight@mailexcite.com Mon Jun 24 17:03...
3,jordan23@mailexcite.com,ranmoore@swbell.net,"New Improved Fat Burners, Now With TV Fat Abso...","Thu, 31 Jul 1980 07:20:54",From jordan23@mailexcite.com Mon Jun 24 17:04...
4,yyyy@pluriproj.pt,yyyy@pluriproj.pt,"Never Repay Cash Grants, $500 - $50,000, Secre...","Sun, 19 Oct 1980 10:55:16",From merchantsworld2001@juno.com Tue Aug 6 1...


In [166]:
# function for data (emails) processing /
def preprocess_text(text):
    # function to check if a word is a valid English word
    def is_english_word(word):
        #synsets = synonyms sets
        synsets = wordnet.synsets(word)
        #check if the given word is equal to the noune of lemmatized version of synonyms sets
        return len(synsets) > 0 and synsets[0].lemmas()[0].name().lower() == word.lower()
   
    # remove non-word characters and numbers using regular expressions
    #sub = substituion of non characters with  nothing  
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'<[^>]+>', '', text)

    # tokenize the text
    tokens = word_tokenize(text.lower())

    # remove English stop words
    english_stopwords = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in english_stopwords]
     # filter out non-English words
    english_words = [token for token in filtered_tokens if is_english_word(token)]
    
    return english_words


In [167]:
# applying the function to the legit emails content
legitimate["content"] = legitimate["content"].apply(preprocess_text)

In [168]:
legitimate.head()

Unnamed: 0,sender,recipient,subject,date,content
0,Robert Elz <kre@munnari.OZ.AU>,Chris Garrigues <cwg-dated-1030377287.06fa6d@D...,Re: New Sequences Window,"Thu, 22 Aug 2002 18:26:25 +0700","[phobos, discussion, list, reproduce, repeatab..."
1,"""CNET News.com Daily Dispatch"" <Online#3.19578...",qqqqqqqqqq-cnet-newsletters@example.com,CNET NEWS.COM: Cable companies cracking down o...,"Tue, 9 Jul 2002 15:54:30 -0700 (PDT)","[logo, ad, banner, table, width, width, width,..."
2,CNET Shopper Newsletter Alerts <Online#3.19584...,qqqqqqqqqq-cnet-newsletters@example.com,Save an extra $50 on the iPaq 3835 PDA (CNET S...,"Tue, 9 Jul 2002 16:06:08 -0700 (PDT)","[newsletter, table, width, width, width, heigh..."
3,Steve Burt <Steve_Burt@cursor-system.com>,"""'zzzzteana@yahoogroups.com'"" <zzzzteana@yahoo...",[zzzzteana] RE: Alexander,"Thu, 22 Aug 2002 12:46:18 +0100","[phobos, mail, network, unknown, unknown, unve..."
4,"""CNET Download.com for Mac"" <Online#3.19586.b5...",qqqqqqqqqq-zdnet@example.com,"This week: Deck, Tex-Edit Plus, Boom","Tue, 9 Jul 2002 18:53:01 -0700 (PDT)","[download, music, software, record, need, expe..."


In [169]:
spam['content'] = spam['content'].apply(preprocess_text)

In [170]:
spam.head()

Unnamed: 0,sender,recipient,subject,date,content
0,"""Start Now"" <startnow2002@hotmail.com>",ilug@linux.ie,[ILUG] STOP THE MLM INSANITY,"Fri, 02 Aug 2002 23:37:59 0530","[phobos, irish, linux, group, letter, interest..."
1,lmrn@mailexcite.com,ranmoore@cybertime.net,"Real Protection, Stun Guns! Free Shipping! Ti...","Mon, 28 Jul 1980 14:01:35","[may, may, may, unverified, may, need, safety,..."
2,amknight@mailexcite.com,cbmark@cbmark.com,"New Improved Fat Burners, Now With TV Fat Abso...","Wed, 30 Jul 1980 18:25:49","[may, may, may, may, fat, free, purchase, bott..."
3,jordan23@mailexcite.com,ranmoore@swbell.net,"New Improved Fat Burners, Now With TV Fat Abso...","Thu, 31 Jul 1980 07:20:54","[may, may, may, baa, may, may, fat, free, purc..."
4,yyyy@pluriproj.pt,yyyy@pluriproj.pt,"Never Repay Cash Grants, $500 - $50,000, Secre...","Sun, 19 Oct 1980 10:55:16","[sun, release, width, color, edition, summer, ..."


In [171]:
spam["category"] = 1
spam.head()

Unnamed: 0,sender,recipient,subject,date,content,category
0,"""Start Now"" <startnow2002@hotmail.com>",ilug@linux.ie,[ILUG] STOP THE MLM INSANITY,"Fri, 02 Aug 2002 23:37:59 0530","[phobos, irish, linux, group, letter, interest...",1
1,lmrn@mailexcite.com,ranmoore@cybertime.net,"Real Protection, Stun Guns! Free Shipping! Ti...","Mon, 28 Jul 1980 14:01:35","[may, may, may, unverified, may, need, safety,...",1
2,amknight@mailexcite.com,cbmark@cbmark.com,"New Improved Fat Burners, Now With TV Fat Abso...","Wed, 30 Jul 1980 18:25:49","[may, may, may, may, fat, free, purchase, bott...",1
3,jordan23@mailexcite.com,ranmoore@swbell.net,"New Improved Fat Burners, Now With TV Fat Abso...","Thu, 31 Jul 1980 07:20:54","[may, may, may, baa, may, may, fat, free, purc...",1
4,yyyy@pluriproj.pt,yyyy@pluriproj.pt,"Never Repay Cash Grants, $500 - $50,000, Secre...","Sun, 19 Oct 1980 10:55:16","[sun, release, width, color, edition, summer, ...",1


In [172]:
legitimate["category"] = 0
legitimate.head()

Unnamed: 0,sender,recipient,subject,date,content,category
0,Robert Elz <kre@munnari.OZ.AU>,Chris Garrigues <cwg-dated-1030377287.06fa6d@D...,Re: New Sequences Window,"Thu, 22 Aug 2002 18:26:25 +0700","[phobos, discussion, list, reproduce, repeatab...",0
1,"""CNET News.com Daily Dispatch"" <Online#3.19578...",qqqqqqqqqq-cnet-newsletters@example.com,CNET NEWS.COM: Cable companies cracking down o...,"Tue, 9 Jul 2002 15:54:30 -0700 (PDT)","[logo, ad, banner, table, width, width, width,...",0
2,CNET Shopper Newsletter Alerts <Online#3.19584...,qqqqqqqqqq-cnet-newsletters@example.com,Save an extra $50 on the iPaq 3835 PDA (CNET S...,"Tue, 9 Jul 2002 16:06:08 -0700 (PDT)","[newsletter, table, width, width, width, heigh...",0
3,Steve Burt <Steve_Burt@cursor-system.com>,"""'zzzzteana@yahoogroups.com'"" <zzzzteana@yahoo...",[zzzzteana] RE: Alexander,"Thu, 22 Aug 2002 12:46:18 +0100","[phobos, mail, network, unknown, unknown, unve...",0
4,"""CNET Download.com for Mac"" <Online#3.19586.b5...",qqqqqqqqqq-zdnet@example.com,"This week: Deck, Tex-Edit Plus, Boom","Tue, 9 Jul 2002 18:53:01 -0700 (PDT)","[download, music, software, record, need, expe...",0


In [173]:
emails = pd.concat([spam, legitimate], ignore_index=True)
emails


Unnamed: 0,sender,recipient,subject,date,content,category
0,"""Start Now"" <startnow2002@hotmail.com>",ilug@linux.ie,[ILUG] STOP THE MLM INSANITY,"Fri, 02 Aug 2002 23:37:59 0530","[phobos, irish, linux, group, letter, interest...",1
1,lmrn@mailexcite.com,ranmoore@cybertime.net,"Real Protection, Stun Guns! Free Shipping! Ti...","Mon, 28 Jul 1980 14:01:35","[may, may, may, unverified, may, need, safety,...",1
2,amknight@mailexcite.com,cbmark@cbmark.com,"New Improved Fat Burners, Now With TV Fat Abso...","Wed, 30 Jul 1980 18:25:49","[may, may, may, may, fat, free, purchase, bott...",1
3,jordan23@mailexcite.com,ranmoore@swbell.net,"New Improved Fat Burners, Now With TV Fat Abso...","Thu, 31 Jul 1980 07:20:54","[may, may, may, baa, may, may, fat, free, purc...",1
4,yyyy@pluriproj.pt,yyyy@pluriproj.pt,"Never Repay Cash Grants, $500 - $50,000, Secre...","Sun, 19 Oct 1980 10:55:16","[sun, release, width, color, edition, summer, ...",1
...,...,...,...,...,...,...
4193,newscientist <rssfeeds@example.com>,yyyy@example.com,Gene technique reveals human evolution,Not supplied,"[method, probe, first]",0
4194,guardian <rssfeeds@example.com>,yyyy@example.com,Go-ahead for new-style hospitals,2002-10-10T03:26:52+01:00,"[blair, whitehall, deal, borrowing, private, c...",0
4195,newscientist <rssfeeds@example.com>,yyyy@example.com,Malicious code hidden in email software,Not supplied,"[new, software, dummy, code, trojan, horse]",0
4196,guardian <rssfeeds@example.com>,yyyy@example.com,'Flexible' retirement gains ground,2002-10-10T03:26:51+01:00,"[government, may, work, beyond]",0


In [174]:
emails["category"].value_counts()

category
0    2801
1    1397
Name: count, dtype: int64

In [175]:
total_tokens = sum(emails['content'].apply(len))
total_tokens

305483

In [176]:
def count_token_occurrences(emails):
    """Counts the occurrences of each token in the email content.

    Args:
        emails: A pandas DataFrame with a 'content' column containing email text.

    Returns:
        A dictionary where keys are tokens and values are their counts.
    """
    token_counter = {}

    for content_list in emails["content"]:
        for content in content_list:
            # Check if the content is a string, if not, continue to the next content
            if not isinstance(content, str):
                continue

            # Tokenize the content using appropriate methods (e.g., split on whitespace)
            tokens = content.lower().split()  # Lowercase tokens for case-insensitivity

            for token in tokens:
                if token not in token_counter:
                    token_counter[token] = 1  # Initialize count to 1 for new tokens
                else:
                    token_counter[token] += 1  # Increment count for existing tokens

    return token_counter


In [177]:
token_counter = count_token_occurrences(emails)
token_counter

{'phobos': 770,
 'irish': 158,
 'linux': 886,
 'group': 746,
 'letter': 198,
 'interest': 279,
 'information': 1192,
 'business': 1233,
 'please': 1180,
 'accept': 133,
 'sincere': 16,
 'apology': 12,
 'removal': 98,
 'multilevel': 17,
 'read': 613,
 'important': 244,
 'one': 2437,
 'ever': 389,
 'huge': 132,
 'mistake': 49,
 'deliver': 78,
 'past': 269,
 'dream': 91,
 'cost': 365,
 'people': 1601,
 'sacred': 7,
 'fact': 255,
 'meaning': 62,
 'work': 984,
 'money': 1027,
 'finally': 117,
 'courage': 16,
 'tell': 313,
 'truth': 66,
 'good': 834,
 'need': 922,
 'see': 1066,
 'kind': 225,
 'permission': 69,
 'like': 1829,
 'send': 1043,
 'brief': 46,
 'introduce': 49,
 'new': 2597,
 'wonder': 75,
 'promise': 58,
 'unwanted': 38,
 'follow': 141,
 'pitch': 12,
 'call': 466,
 'address': 1071,
 'receive': 915,
 'free': 2184,
 'subject': 602,
 'box': 280,
 'hit': 229,
 'get': 2505,
 'hours': 300,
 'words': 191,
 'wall': 133,
 'shame': 9,
 'recently': 172,
 'sent': 541,
 'financially': 30,
 'be

In [178]:
# checking the frequency of a token

''' This function checks whether a preprocessed token occurs more frequently than a given threshold 
    in a corpus of token counts.'''

'''Args:
        preprocessed_token (str): The preprocessed token to be evaluated.
        threshold (int): The minimum frequency threshold for keeping the token.

    Returns:
        bool: True if the token occurs more frequently than the threshold, False otherwise.'''


def keep_token(preprocessed_token,threshold):
    if preprocessed_token not in token_counter:
        return  False
    else:
        return token_counter[preprocessed_token]> threshold
    
'''example : ''' 
keep_token("random",50)


True

In [179]:
features = set()

for token in token_counter:
    if keep_token(token,2000):
        features.add(token)

features

{'free', 'get', 'height', 'list', 'may', 'new', 'one', 'size', 'width'}

In [180]:
features=list(features)
features

['free', 'may', 'one', 'size', 'get', 'new', 'width', 'height', 'list']

In [181]:
#zip fucntion : combines iterables(lists, tuples..)
 
token_to_index_mapping =  {t:i for t, i in zip(features, range(len(features)))}
token_to_index_mapping

{'free': 0,
 'may': 1,
 'one': 2,
 'size': 3,
 'get': 4,
 'new': 5,
 'width': 6,
 'height': 7,
 'list': 8}

In [182]:
# Initialize NLTK components
tokenizer = nltk.tokenize.TweetTokenizer()
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')
nltk.download('stopwords')  # Download NLTK stopwords corpus

# Initialize stopwords using NLTK corpus
stopwords = set(stopwords.words('english'))

def remove_html_tags(text):
    # Use regular expression to remove HTML tags
    cleaned_text = re.sub(r'<[^>]+>', '', text)
    return cleaned_text

def message_to_token_list(s):
    # Remove HTML tags from the input text
    #nrmlm nastaeml remove html.. but drtli bug :') 
    cleaned_s = remove_html_tags(s).lower().strip()
    
    # Tokenize, lowercase, lemmatize, and remove stopwords
    tokens = tokenizer.tokenize(cleaned_s)
    lowercased_tokens = [t.lower() for t in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(t) for t in lowercased_tokens]
    useful_tokens = [t for t in lemmatized_tokens if t not in stopwords]

    return useful_tokens

# Test the function
string = "3d b <br> .com bad font con randoms"
tokens = message_to_token_list(string)
print(tokens)

['3d', 'b', '.', 'com', 'bad', 'font', 'con', 'randoms']


[nltk_data] Downloading package wordnet to C:\Users\Morsi Store
[nltk_data]     DZ\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Morsi Store
[nltk_data]     DZ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


BAG OF WORDS (counts vector)

In [188]:
#using numpy
#takes a list
def message_to_count_vect(msg):
    count_vect = np.zeros(len(features))
  #  processed_list_of_tokens = message_to_token_list(msg)
    for token in msg:
        if token not in features:
            continue
        else:
            index = token_to_index_mapping[token]
            count_vect[index] += 1
    return  count_vect



In [197]:
emails.iloc[3000]

sender                Brian Fahrlander <kilroy@kamakiriad.com>
recipient                            rpm-zzzlist@freshrpms.net
subject                                  Re: apt-get problem ?
date                            Mon, 7 Oct 2002 01:27:40 -0500
content      [discussion, list, dayan, use, install, versio...
category                                                     0
Name: 3000, dtype: object

In [192]:
message_to_count_vect(emails["content"].iloc[95])


array([0., 0., 1., 0., 0., 0., 0., 0., 1.])

In [202]:
def df_to_X_y(dff):
  y = dff['category'].to_numpy().astype(int)

  message_col = dff['content']
  count_vectors = []

  for message in message_col:
    count_vector = message_to_count_vect(message)
    count_vectors.append(count_vector)

  X = np.array(count_vectors).astype(int)

  return X, y

i guess ghlet w mdrtch train_df and test_df


In [203]:
X_train, y_train = df_to_X_y(emails)

X_test, y_test = df_to_X_y(emails)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4198, 9), (4198,), (4198, 9), (4198,))