# Importing necessary  libraries


In [11]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# data manipulation and text 
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Morsi Store
[nltk_data]     DZ\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
#file paths
ham = "ham"
spam = "spam_2/spam_2"



# Created a function to load email data from files and convert them into a list.

In [13]:

#Loads text data from files located in the specified path.

def get_data(path):
    # setting an empty list to store the data
    data = []
    files = os.listdir(path) # getting the files
    
    for file in files:
        # getting the file path
        file_path = os.path.join(path, file)
        
        # opening the file and loading it in memory
        with open(file_path, encoding="ISO-8859-1") as processed_file:
            # reading the file
            words_list = processed_file.read()
            # appending the processed data
            data.append(words_list)
    
    # returning the final result
    return data

In [14]:
# Loaded both spam and legitimate (ham) emails.
ham_data = get_data(ham)
spam_data = get_data(spam)


# Extraction important information

In [15]:
# cleaning the emails
#turning emails list into dictionaries with keys : sender, recipient, subject, date, content

def clean_emails(emails):
    # a list of cleaned emails to store them
    cleaned_emails = []
    
    for email in emails:
        # Split the email into lines
        lines = email.split('\n')

        # Get the content of the email
        content = ''
        for line in lines:
            if line.startswith('Subject:'):
                subject = line.replace('Subject:', '').strip()
            elif line.startswith('From:'):
                sender = line.replace('From:', '').strip()
            elif line.startswith('To:'):
                recipient = line.replace('To:', '').strip()
            elif line.startswith('Date:'):date = line.replace('Date:', '').strip()
            elif line.startswith('X-'):
                continue
            else:
                content += line.strip()

        # Append the cleaned email to the list of cleaned emails
        cleaned_emails.append({'sender': sender,
                               'recipient': recipient,
                               'subject': subject,
                               'date': date,
                               'content': content})

    return cleaned_emails

In [16]:
cleaned_ham = clean_emails(ham_data)
cleaned_spam = clean_emails(spam_data)

# Transformed data into a DataFrame using Pandas.

In [17]:
legitimate = pd.DataFrame(cleaned_ham) 
spam = pd.DataFrame(cleaned_spam) 

In [18]:
legitimate.head()


Unnamed: 0,sender,recipient,subject,date,content
0,Robert Elz <kre@munnari.OZ.AU>,Chris Garrigues <cwg-dated-1030377287.06fa6d@D...,Re: New Sequences Window,"Thu, 22 Aug 2002 18:26:25 +0700",From exmh-workers-admin@redhat.com Thu Aug 22...
1,"""CNET News.com Daily Dispatch"" <Online#3.19578...",qqqqqqqqqq-cnet-newsletters@example.com,CNET NEWS.COM: Cable companies cracking down o...,"Tue, 9 Jul 2002 15:54:30 -0700 (PDT)",Return-Path: <Online#3.19578.34-UgGTgZFN19NAr9...
2,CNET Shopper Newsletter Alerts <Online#3.19584...,qqqqqqqqqq-cnet-newsletters@example.com,Save an extra $50 on the iPaq 3835 PDA (CNET S...,"Tue, 9 Jul 2002 16:06:08 -0700 (PDT)",Return-Path: <Online#3.19584.83-p1SYlJ1blFvQjR...
3,Steve Burt <Steve_Burt@cursor-system.com>,"""'zzzzteana@yahoogroups.com'"" <zzzzteana@yahoo...",[zzzzteana] RE: Alexander,"Thu, 22 Aug 2002 12:46:18 +0100",From Steve_Burt@cursor-system.com Thu Aug 22 ...
4,"""CNET Download.com for Mac"" <Online#3.19586.b5...",qqqqqqqqqq-zdnet@example.com,"This week: Deck, Tex-Edit Plus, Boom","Tue, 9 Jul 2002 18:53:01 -0700 (PDT)",Return-Path: <Online#3.19586.b5-9w0blztbvHPdZd...


In [19]:
spam.head()

Unnamed: 0,sender,recipient,subject,date,content
0,"""Start Now"" <startnow2002@hotmail.com>",ilug@linux.ie,[ILUG] STOP THE MLM INSANITY,"Fri, 02 Aug 2002 23:37:59 0530",From ilug-admin@linux.ie Tue Aug 6 11:51:02 ...
1,lmrn@mailexcite.com,ranmoore@cybertime.net,"Real Protection, Stun Guns! Free Shipping! Ti...","Mon, 28 Jul 1980 14:01:35",From lmrn@mailexcite.com Mon Jun 24 17:03:24 ...
2,amknight@mailexcite.com,cbmark@cbmark.com,"New Improved Fat Burners, Now With TV Fat Abso...","Wed, 30 Jul 1980 18:25:49",From amknight@mailexcite.com Mon Jun 24 17:03...
3,jordan23@mailexcite.com,ranmoore@swbell.net,"New Improved Fat Burners, Now With TV Fat Abso...","Thu, 31 Jul 1980 07:20:54",From jordan23@mailexcite.com Mon Jun 24 17:04...
4,yyyy@pluriproj.pt,yyyy@pluriproj.pt,"Never Repay Cash Grants, $500 - $50,000, Secre...","Sun, 19 Oct 1980 10:55:16",From merchantsworld2001@juno.com Tue Aug 6 1...


# ### Data Preprocessing
Utilized NLTK for tokenization, lemmatization, and removal of stopwords, punctuation, and non-sense words.


In [20]:
# function for data (emails) processing 
def preprocess_text(text):
    # function to check if a word is a valid English word
    def is_english_word(word):
        #synsets = synonyms sets
        synsets = wordnet.synsets(word)
        #check if the given word is equal to the noune of lemmatized version of synonyms sets
        return len(synsets) > 0 and synsets[0].lemmas()[0].name().lower() == word.lower()
   
    # remove non-word characters and numbers using regular expressions
    #sub = substituion of non characters with  nothing  
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'<[^>]+>', '', text)

    # tokenize the text
    tokens = word_tokenize(text.lower())

    # remove English stop words
    english_stopwords = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in english_stopwords]
     # filter out non-English words
    english_words = [token for token in filtered_tokens if is_english_word(token)]
    
    return english_words


In [21]:
# applying the function to the legit emails content
legitimate["content"] = legitimate["content"].apply(preprocess_text)

In [22]:
legitimate.head()

Unnamed: 0,sender,recipient,subject,date,content
0,Robert Elz <kre@munnari.OZ.AU>,Chris Garrigues <cwg-dated-1030377287.06fa6d@D...,Re: New Sequences Window,"Thu, 22 Aug 2002 18:26:25 +0700","[phobos, discussion, list, reproduce, repeatab..."
1,"""CNET News.com Daily Dispatch"" <Online#3.19578...",qqqqqqqqqq-cnet-newsletters@example.com,CNET NEWS.COM: Cable companies cracking down o...,"Tue, 9 Jul 2002 15:54:30 -0700 (PDT)","[logo, ad, banner, table, width, width, width,..."
2,CNET Shopper Newsletter Alerts <Online#3.19584...,qqqqqqqqqq-cnet-newsletters@example.com,Save an extra $50 on the iPaq 3835 PDA (CNET S...,"Tue, 9 Jul 2002 16:06:08 -0700 (PDT)","[newsletter, table, width, width, width, heigh..."
3,Steve Burt <Steve_Burt@cursor-system.com>,"""'zzzzteana@yahoogroups.com'"" <zzzzteana@yahoo...",[zzzzteana] RE: Alexander,"Thu, 22 Aug 2002 12:46:18 +0100","[phobos, mail, network, unknown, unknown, unve..."
4,"""CNET Download.com for Mac"" <Online#3.19586.b5...",qqqqqqqqqq-zdnet@example.com,"This week: Deck, Tex-Edit Plus, Boom","Tue, 9 Jul 2002 18:53:01 -0700 (PDT)","[download, music, software, record, need, expe..."


In [23]:
spam['content'] = spam['content'].apply(preprocess_text)

In [24]:
spam.head()

Unnamed: 0,sender,recipient,subject,date,content
0,"""Start Now"" <startnow2002@hotmail.com>",ilug@linux.ie,[ILUG] STOP THE MLM INSANITY,"Fri, 02 Aug 2002 23:37:59 0530","[phobos, irish, linux, group, letter, interest..."
1,lmrn@mailexcite.com,ranmoore@cybertime.net,"Real Protection, Stun Guns! Free Shipping! Ti...","Mon, 28 Jul 1980 14:01:35","[may, may, may, unverified, may, need, safety,..."
2,amknight@mailexcite.com,cbmark@cbmark.com,"New Improved Fat Burners, Now With TV Fat Abso...","Wed, 30 Jul 1980 18:25:49","[may, may, may, may, fat, free, purchase, bott..."
3,jordan23@mailexcite.com,ranmoore@swbell.net,"New Improved Fat Burners, Now With TV Fat Abso...","Thu, 31 Jul 1980 07:20:54","[may, may, may, baa, may, may, fat, free, purc..."
4,yyyy@pluriproj.pt,yyyy@pluriproj.pt,"Never Repay Cash Grants, $500 - $50,000, Secre...","Sun, 19 Oct 1980 10:55:16","[sun, release, width, color, edition, summer, ..."


# Data labeling (1 : spam, 0 : legitimate)

In [25]:
spam["category"] = 1
spam.head()

Unnamed: 0,sender,recipient,subject,date,content,category
0,"""Start Now"" <startnow2002@hotmail.com>",ilug@linux.ie,[ILUG] STOP THE MLM INSANITY,"Fri, 02 Aug 2002 23:37:59 0530","[phobos, irish, linux, group, letter, interest...",1
1,lmrn@mailexcite.com,ranmoore@cybertime.net,"Real Protection, Stun Guns! Free Shipping! Ti...","Mon, 28 Jul 1980 14:01:35","[may, may, may, unverified, may, need, safety,...",1
2,amknight@mailexcite.com,cbmark@cbmark.com,"New Improved Fat Burners, Now With TV Fat Abso...","Wed, 30 Jul 1980 18:25:49","[may, may, may, may, fat, free, purchase, bott...",1
3,jordan23@mailexcite.com,ranmoore@swbell.net,"New Improved Fat Burners, Now With TV Fat Abso...","Thu, 31 Jul 1980 07:20:54","[may, may, may, baa, may, may, fat, free, purc...",1
4,yyyy@pluriproj.pt,yyyy@pluriproj.pt,"Never Repay Cash Grants, $500 - $50,000, Secre...","Sun, 19 Oct 1980 10:55:16","[sun, release, width, color, edition, summer, ...",1


In [26]:
legitimate["category"] = 0
legitimate.head()

Unnamed: 0,sender,recipient,subject,date,content,category
0,Robert Elz <kre@munnari.OZ.AU>,Chris Garrigues <cwg-dated-1030377287.06fa6d@D...,Re: New Sequences Window,"Thu, 22 Aug 2002 18:26:25 +0700","[phobos, discussion, list, reproduce, repeatab...",0
1,"""CNET News.com Daily Dispatch"" <Online#3.19578...",qqqqqqqqqq-cnet-newsletters@example.com,CNET NEWS.COM: Cable companies cracking down o...,"Tue, 9 Jul 2002 15:54:30 -0700 (PDT)","[logo, ad, banner, table, width, width, width,...",0
2,CNET Shopper Newsletter Alerts <Online#3.19584...,qqqqqqqqqq-cnet-newsletters@example.com,Save an extra $50 on the iPaq 3835 PDA (CNET S...,"Tue, 9 Jul 2002 16:06:08 -0700 (PDT)","[newsletter, table, width, width, width, heigh...",0
3,Steve Burt <Steve_Burt@cursor-system.com>,"""'zzzzteana@yahoogroups.com'"" <zzzzteana@yahoo...",[zzzzteana] RE: Alexander,"Thu, 22 Aug 2002 12:46:18 +0100","[phobos, mail, network, unknown, unknown, unve...",0
4,"""CNET Download.com for Mac"" <Online#3.19586.b5...",qqqqqqqqqq-zdnet@example.com,"This week: Deck, Tex-Edit Plus, Boom","Tue, 9 Jul 2002 18:53:01 -0700 (PDT)","[download, music, software, record, need, expe...",0


# Merging data into one DataFrame.

In [27]:
emails = pd.concat([spam, legitimate], ignore_index=True)
emails


Unnamed: 0,sender,recipient,subject,date,content,category
0,"""Start Now"" <startnow2002@hotmail.com>",ilug@linux.ie,[ILUG] STOP THE MLM INSANITY,"Fri, 02 Aug 2002 23:37:59 0530","[phobos, irish, linux, group, letter, interest...",1
1,lmrn@mailexcite.com,ranmoore@cybertime.net,"Real Protection, Stun Guns! Free Shipping! Ti...","Mon, 28 Jul 1980 14:01:35","[may, may, may, unverified, may, need, safety,...",1
2,amknight@mailexcite.com,cbmark@cbmark.com,"New Improved Fat Burners, Now With TV Fat Abso...","Wed, 30 Jul 1980 18:25:49","[may, may, may, may, fat, free, purchase, bott...",1
3,jordan23@mailexcite.com,ranmoore@swbell.net,"New Improved Fat Burners, Now With TV Fat Abso...","Thu, 31 Jul 1980 07:20:54","[may, may, may, baa, may, may, fat, free, purc...",1
4,yyyy@pluriproj.pt,yyyy@pluriproj.pt,"Never Repay Cash Grants, $500 - $50,000, Secre...","Sun, 19 Oct 1980 10:55:16","[sun, release, width, color, edition, summer, ...",1
...,...,...,...,...,...,...
3623,newscientist <rssfeeds@example.com>,yyyy@example.com,Gene technique reveals human evolution,Not supplied,"[method, probe, first]",0
3624,guardian <rssfeeds@example.com>,yyyy@example.com,Go-ahead for new-style hospitals,2002-10-10T03:26:52+01:00,"[blair, whitehall, deal, borrowing, private, c...",0
3625,newscientist <rssfeeds@example.com>,yyyy@example.com,Malicious code hidden in email software,Not supplied,"[new, software, dummy, code, trojan, horse]",0
3626,guardian <rssfeeds@example.com>,yyyy@example.com,'Flexible' retirement gains ground,2002-10-10T03:26:51+01:00,"[government, may, work, beyond]",0


In [28]:
emails["category"].value_counts()

category
0    2801
1     827
Name: count, dtype: int64

In [29]:
#changing columns order to be more readable
emails = emails[['category','sender','recipient','subject','date','content']]
emails.head()

Unnamed: 0,category,sender,recipient,subject,date,content
0,1,"""Start Now"" <startnow2002@hotmail.com>",ilug@linux.ie,[ILUG] STOP THE MLM INSANITY,"Fri, 02 Aug 2002 23:37:59 0530","[phobos, irish, linux, group, letter, interest..."
1,1,lmrn@mailexcite.com,ranmoore@cybertime.net,"Real Protection, Stun Guns! Free Shipping! Ti...","Mon, 28 Jul 1980 14:01:35","[may, may, may, unverified, may, need, safety,..."
2,1,amknight@mailexcite.com,cbmark@cbmark.com,"New Improved Fat Burners, Now With TV Fat Abso...","Wed, 30 Jul 1980 18:25:49","[may, may, may, may, fat, free, purchase, bott..."
3,1,jordan23@mailexcite.com,ranmoore@swbell.net,"New Improved Fat Burners, Now With TV Fat Abso...","Thu, 31 Jul 1980 07:20:54","[may, may, may, baa, may, may, fat, free, purc..."
4,1,yyyy@pluriproj.pt,yyyy@pluriproj.pt,"Never Repay Cash Grants, $500 - $50,000, Secre...","Sun, 19 Oct 1980 10:55:16","[sun, release, width, color, edition, summer, ..."


# Split the DataFrame into 80% for training and 20% for testing.

In [30]:
#spliting data frame into  train and test sets

emails = emails.sample(frac=1,random_state=1)
emails = emails.reset_index(drop=True)

split_index = int(len(emails)*0.8) # 80% for training data

train_data = emails[:split_index]
test_data = emails[split_index:]

train_data,test_data 

(      category                                             sender  \
 0            1                                 sat@niederhasli.ch   
 1            1                      <jjc7y7676668t04@hotmail.com>   
 2            0                  Tom Reingold <noglider@pobox.com>   
 3            0            Chris Garrigues <cwg-exmh@DeepEddy.Com>   
 4            0                   Terry Yapt <yapt@technovell.com>   
 ...        ...                                                ...   
 2897         0              "Adam L. Beberg" <beberg@mithral.com>   
 2898         0  "James C. McMaster (Jim)" <mcmasjc@tatanka.sto...   
 2899         1                                              "" <>   
 2900         1                        "sexygirl" <gbest@mail.com>   
 2901         0                Michael <mogmios@mlug.missouri.edu>   
 
                                      recipient  \
 0      <Undisclosed Recipients@netnoteinc.com>   
 1                                                
 2   

In [31]:
total_tokens = sum(emails['content'].apply(len))
total_tokens

261379

# Token Frequency Analysis

In [32]:
def count_token_occurrences(emails):
    """Counts the occurrences of each token in the email content.

    Args:
        emails: A pandas DataFrame with a 'content' column containing email text.

    Returns:
        A dictionary where keys are tokens and values are their counts.
    """
    token_counter = {}

    for content_list in emails["content"]:
        for content in content_list:
            # Check if the content is a string, if not, continue to the next content
            if not isinstance(content, str):
                continue

            # Tokenize the content using appropriate methods (e.g., split on whitespace)
            tokens = content.lower().split()  # Lowercase tokens for case-insensitivity

            for token in tokens:
                if token not in token_counter:
                    token_counter[token] = 1  # Initialize count to 1 for new tokens
                else:
                    token_counter[token] += 1  # Increment count for existing tokens

    return token_counter


In [33]:
token_counter = count_token_occurrences(emails)
token_counter

{'unknown': 639,
 'apparently': 59,
 'free': 1643,
 'satellite': 81,
 'channels': 24,
 'system': 615,
 'get': 2006,
 'include': 283,
 'direct': 98,
 'need': 775,
 'software': 909,
 'ease': 29,
 'use': 1566,
 'minute': 82,
 'company': 554,
 'charge': 92,
 'card': 401,
 'reset': 22,
 'order': 591,
 'may': 2417,
 'importance': 53,
 'fa': 30,
 'message': 1325,
 'james': 44,
 'gibbon': 50,
 'discussion': 524,
 'list': 2141,
 'tom': 61,
 'organization': 70,
 'technological': 18,
 'equivalent': 25,
 'blood': 77,
 'random': 58,
 'street': 167,
 'short': 141,
 'virus': 51,
 'well': 665,
 'mailing': 1373,
 'phobos': 612,
 'network': 749,
 'date': 288,
 'work': 839,
 'startup': 49,
 'attempt': 65,
 'change': 551,
 'fail': 32,
 'sigh': 13,
 'already': 296,
 'found': 488,
 'sent': 437,
 'fast': 228,
 'enough': 264,
 'see': 943,
 'like': 1646,
 'box': 232,
 'however': 270,
 'testing': 81,
 'unread': 11,
 'congress': 83,
 'suite': 84,
 'austin': 40,
 'war': 225,
 'version': 533,
 'revision': 70,
 'ru

In [34]:
# checking the frequency of a token

''' This function checks whether a preprocessed token occurs more frequently than a given threshold 
    in a corpus of token counts.'''

'''Args:
        preprocessed_token (str): The preprocessed token to be evaluated.
        threshold (int): The minimum frequency threshold for keeping the token.

    Returns:
        bool: True if the token occurs more frequently than the threshold, False otherwise.'''


def keep_token(preprocessed_token,threshold):
    if preprocessed_token not in token_counter:
        return  False
    else:
        return token_counter[preprocessed_token]> threshold
    
'''example : ''' 
keep_token("random",50)


True

# Selected some features based on token frequency.

In [35]:
features = set()

for token in token_counter:
    if keep_token(token,2000):
        features.add(token)

features

{'get', 'height', 'list', 'may', 'new', 'one', 'size', 'width'}

In [36]:
features=list(features)
features

['may', 'size', 'get', 'new', 'height', 'width', 'list', 'one']

In [37]:
#zip fucntion : combines iterables(lists, tuples..)
 
token_to_index_mapping =  {t:i for t, i in zip(features, range(len(features)))}
token_to_index_mapping

{'may': 0,
 'size': 1,
 'get': 2,
 'new': 3,
 'height': 4,
 'width': 5,
 'list': 6,
 'one': 7}

In [38]:
# Initialize NLTK components
tokenizer = nltk.tokenize.TweetTokenizer()
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')
nltk.download('stopwords')  # Download NLTK stopwords corpus

# Initialize stopwords using NLTK corpus
stopwords = set(stopwords.words('english'))

def remove_html_tags(text):
    # Use regular expression to remove HTML tags
    cleaned_text = re.sub(r'<[^>]+>', '', text)
    return cleaned_text

def message_to_token_list(s):
    # Remove HTML tags from the input text
    #nrmlm nastaeml remove html.. but drtli bug :') 
    cleaned_s = remove_html_tags(s).lower().strip()
    
    # Tokenize, lowercase, lemmatize, and remove stopwords
    tokens = tokenizer.tokenize(cleaned_s)
    lowercased_tokens = [t.lower() for t in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(t) for t in lowercased_tokens]
    useful_tokens = [t for t in lemmatized_tokens if t not in stopwords]

    return useful_tokens

# Test the function
string = "3d b <br> .com bad font con randoms"
tokens = message_to_token_list(string)
print(tokens)

['3d', 'b', '.', 'com', 'bad', 'font', 'con', 'randoms']


[nltk_data] Downloading package wordnet to C:\Users\Morsi Store
[nltk_data]     DZ\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Morsi Store
[nltk_data]     DZ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


BAG OF WORDS (counts vector)

In [39]:
#using numpy
#takes a list
def message_to_count_vect(msg):
    count_vect = np.zeros(len(features))
  #  processed_list_of_tokens = message_to_token_list(msg)
    for token in msg:
        if token not in features:
            continue
        else:
            # If the token is a feature, find its index in the features list
            index = token_to_index_mapping[token]
            # Increment the count of the corresponding index in the count vector
            count_vect[index] += 1
    return  count_vect



In [40]:
# .iloc function in pandas is used to select rows and columns by integer position
emails.iloc[3000]

category                                                     0
sender                          Hal DeVore <haldevore@acm.org>
recipient    Chris Garrigues <cwg-dated-1033999132.9ba1d6@D...
subject              Re: Unseen window versus Sequences Window
date                           Wed, 02 Oct 2002 09:34:08 -0500
content      [discussion, list, get, time, pretty, good, id...
Name: 3000, dtype: object

In [41]:
#iloc : access a row in Dataframe
message_to_count_vect(emails["content"].iloc[95])


array([0., 0., 0., 0., 0., 0., 0., 1.])

In [42]:
#convert df into feautures matrix X and target array Y for ML task
def df_to_X_y(dff):
  # Extract the target array Y
  y = dff['category'].to_numpy().astype(int)

    # Extract the content column from the DataFrame
  message_col = dff['content']
  count_vectors = []

  # Iterate through each message in the content column
  for message in message_col:
    # Convert the message into a count vector
    count_vector = message_to_count_vect(message)
    # Add the count vector to the list of vectors
    count_vectors.append(count_vector)

  # Convert the list of count vectors into a numpy array representing the features matrix X
  X = np.array(count_vectors).astype(int)
  # Return the features matrix X and the target array Y
  return X, y

#  Convert the training data DataFrame to features matrix X_train and target array y_train


In [43]:

X_train, y_train = df_to_X_y(train_data)

X_test, y_test = df_to_X_y(test_data)

# Display the shapes of the resulting arrays
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((2902, 8), (2902,), (726, 8), (726,))

# Data scaling

In [44]:
from sklearn.preprocessing import MinMaxScaler
# Create an instance of MinMaxScaler and fit it to the training data
scaler = MinMaxScaler().fit(X_train) 

# Use the fitted scaler to transform the training and test data
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Display the scaled training data
X_train


array([[0.        , 0.        , 0.05263158, ..., 0.        , 0.        ,
        0.        ],
       [0.18181818, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.02777778,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.27272727, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

# Model Training:

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
#apply logistic regression to data & report
lf = LogisticRegression().fit(X_train,y_train)
print(classification_report(y_test, lf.predict(X_test)))


              precision    recall  f1-score   support

           0       0.85      0.99      0.91       558
           1       0.91      0.40      0.55       168

    accuracy                           0.85       726
   macro avg       0.88      0.69      0.73       726
weighted avg       0.86      0.85      0.83       726



# Comparing with random forest instance 


In [46]:
# compare logistic regression to random forest

from sklearn.ensemble import RandomForestClassifier
#It creates an instance of the random forest classifier (rf) and fits it to the training data (X_train, y_train).
rf=RandomForestClassifier().fit(X_train,y_train)
print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92       558
           1       0.87      0.55      0.67       168

    accuracy                           0.88       726
   macro avg       0.87      0.76      0.80       726
weighted avg       0.88      0.88      0.87       726

