In [3]:
!pip install tld
!pip install tldextract



In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.parse import urlparse
from collections import Counter
import math

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import time

# Added imports for new preprocessing methods
from tld import get_tld
import tldextract
import os.path
import re
import ipaddress

In [5]:
from google.colab import files
uploaded = files.upload()
# Load the dataset using the path from the preprocessing notebook
urldata = pd.read_csv('malicious_phish.csv')

# Rename the DataFrame to 'df' for consistency with the rest of the Final Project notebook
df = urldata.copy()

# Remove the 'Unnamed: 0' column if it exists, as seen in the preprocessing notebook
if 'Unnamed: 0' in df.columns:
    df = df.drop("Unnamed: 0", axis=1)

df.head()

Saving malicious_phish.csv to malicious_phish.csv


Unnamed: 0,url,label,result
0,https://www.google.com,benign,0
1,https://www.youtube.com,benign,0
2,https://www.facebook.com,benign,0
3,https://www.baidu.com,benign,0
4,https://www.wikipedia.org,benign,0


In [6]:
# Check for missing values and data info (from preprocessing.ipynb)
print("Missing values:")
print(df.isnull().sum())
print("\nData Info:")
df.info()

Missing values:
url       0
label     0
result    0
dtype: int64

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450176 entries, 0 to 450175
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     450176 non-null  object
 1   label   450176 non-null  object
 2   result  450176 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 10.3+ MB


In [7]:
urldata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450176 entries, 0 to 450175
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  450176 non-null  int64 
 1   url         450176 non-null  object
 2   label       450176 non-null  object
 3   result      450176 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 13.7+ MB


In [8]:
# Normalizing URLs before parsing
from urllib.parse import urlparse
def normalize_url(url):
    if not isinstance(url, str):
        return ''
    url = url.strip()
    url = url.replace("[.]", ".") # Replace obfuscated dots
    if not url.startswith(('http://', 'https://')): # Add scheme if missing
        url = 'http://' + url
    return url

# Data Preprocessing
#### 1. Lengths
- URL Length
- Hostname Length
- Path Length
- Fist Directory Length
- Top Level Domain Length
- Query Length

In [9]:
from urllib.parse import urlparse
from tld import get_tld
import os.path

# URL Length
urldata["url_length"] = urldata["url"].apply(lambda x: len(str(x)))

# Hostname Length
def hostname_length(url):
    url = normalize_url(url)
    parsed = urlparse(url)
    return len(parsed.hostname or '')
urldata["hostname_length"] = urldata["url"].apply(hostname_length)

# Path Length
def path_length(url):
    url = normalize_url(url)
    parsed = urlparse(url)
    return len(parsed.path)
urldata["path_length"] = urldata["url"].apply(path_length)

# First Directory Length
def first_directory_length(url):
    url = normalize_url(url)
    path = urlparse(url).path
    first_directory = path.split('/')[1] if len(path.split('/')) > 1 else ''
    return len(first_directory)
urldata["first_directory_length"] = urldata["url"].apply(first_directory_length)

# Top Level Domain Length
def tld_length(url):
    url = normalize_url(url)
    try:
        tld = get_tld(url, fail_silently=True)
        return len(tld) if tld else 0
    except:
        return 0
urldata["tld_length"] = urldata["url"].apply(tld_length)

# Query Length
def query_length(url):
    url = normalize_url(url)
    parsed = urlparse(url)
    return len(parsed.query)
urldata["query_length"] = urldata["url"].apply(query_length)

urldata.head()

Unnamed: 0.1,Unnamed: 0,url,label,result,url_length,hostname_length,path_length,first_directory_length,tld_length,query_length
0,0,https://www.google.com,benign,0,22,14,0,0,3,0
1,1,https://www.youtube.com,benign,0,23,15,0,0,3,0
2,2,https://www.facebook.com,benign,0,24,16,0,0,3,0
3,3,https://www.baidu.com,benign,0,21,13,0,0,3,0
4,4,https://www.wikipedia.org,benign,0,25,17,0,0,3,0


#### 2. Counts
- Special Characters
    - !@#$%^&*()-_=+[]{}|;:'\",.<>?/\\`~
- Count Of 'www'
- Count Of Digits
- Count Of Letters
- Count Of Number Of Directories
- Count Of Number Of Subdomains
- Count Of Tokens In URL

In [10]:
import tldextract

def special_char_count(url):
    url = str(url)
    special_chars = "!@#$%^&*()-_=+[]{}|;:'\",.<>?/\\`~"
    count = sum(1 for c in url if c in special_chars)
    return count
urldata["special_char_count"] = urldata["url"].apply(special_char_count)

urldata["count-www"] = urldata["url"].apply(lambda x: str(x).count('www'))

# Digit Count
def digit_count(url):
    url = str(url)
    digits = sum(c.isdigit() for c in url)
    return digits
urldata["digit_count"] = urldata["url"].apply(digit_count)

# Letter Count
def letter_count(url):
    url = str(url)
    letters = sum(c.isalpha() for c in url)
    return letters
urldata["letter_count"] = urldata["url"].apply(letter_count)

# Directory Count
def dir_count(url):
    url = normalize_url(url)
    dir = urlparse(url).path
    return dir.count('/')
urldata["dir_count"] = urldata["url"].apply(dir_count)

# Subdomain Count
def subdomain_count(url):
    url = normalize_url(url)
    hostname = urlparse(url).hostname or ''
    ext = tldextract.extract(hostname)
    if not ext.subdomain:
        return 0
    return len(ext.subdomain.split('.'))
urldata["subdomain_count"] = urldata["url"].apply(subdomain_count)

# Query Parameter Count
def query_param_count(url):
    url = normalize_url(url)
    parsed = urlparse(url)
    query = parsed.query
    if query:
        return query.count('&') + 1
    return 0
urldata["query_param_count"] = urldata["url"].apply(query_param_count)

urldata.head()

Unnamed: 0.1,Unnamed: 0,url,label,result,url_length,hostname_length,path_length,first_directory_length,tld_length,query_length,special_char_count,count-www,digit_count,letter_count,dir_count,subdomain_count,query_param_count
0,0,https://www.google.com,benign,0,22,14,0,0,3,0,5,1,0,17,0,1,0
1,1,https://www.youtube.com,benign,0,23,15,0,0,3,0,5,1,0,18,0,1,0
2,2,https://www.facebook.com,benign,0,24,16,0,0,3,0,5,1,0,19,0,1,0
3,3,https://www.baidu.com,benign,0,21,13,0,0,3,0,5,1,0,16,0,1,0
4,4,https://www.wikipedia.org,benign,0,25,17,0,0,3,0,5,1,0,20,0,1,0


#### 3. Binary Features
- Has IP
- Uses HTTPS
- Has Suspicious Extension
- Uses Shortener

In [11]:
import re
import ipaddress

# Has IP Address
def has_ip_address(url):
    url = normalize_url(url)
    host = urlparse(url).hostname or ''
    try:
        ipaddress.ip_address(host)
        return 1
    except ValueError:
        return 0
urldata["has_ip_address"] = urldata["url"].apply(has_ip_address)

# Uses HTTPS
def uses_https(url):
    url = normalize_url(url)
    return 1 if urlparse(url).scheme == 'https' else 0
urldata["uses_https"] = urldata["url"].apply(uses_https)

# Has Suspicious Extension
def suspicious_extension(url):
    url = normalize_url(url)
    suspicious_exts = ['.exe', '.zip', '.rar', '.scr', '.pif', '.bat', '.cmd', '.js', '.vbs']
    path = urlparse(url).path.lower()
    for ext in suspicious_exts:
        if path.endswith(ext):
            return 1
    return 0
urldata["suspicious_extension"] = urldata["url"].apply(suspicious_extension)

# Uses Shortening
shortening_services = re.compile(
    r'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|ow\.ly|t\.co|tinyurl|is\.gd|cli\.gs|'
    r'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
    r'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
    r'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|lnkd\.in|'
    r'db\.tt|qr\.ae|adf\.ly|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|ity\.im|'
    r'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
    r'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
    r'tr\.im|link\.zip\.net'
    , re.IGNORECASE               # added this flag becuase we're casting to lower case in uses_shortening
)

def uses_shortening(url):
    url = normalize_url(url)
    hostname = (urlparse(url).hostname or '').lower()
    return 1 if shortening_services.search(hostname) else 0
urldata["uses_shortening"] = urldata["url"].apply(uses_shortening)

urldata.head()

Unnamed: 0.1,Unnamed: 0,url,label,result,url_length,hostname_length,path_length,first_directory_length,tld_length,query_length,...,count-www,digit_count,letter_count,dir_count,subdomain_count,query_param_count,has_ip_address,uses_https,suspicious_extension,uses_shortening
0,0,https://www.google.com,benign,0,22,14,0,0,3,0,...,1,0,17,0,1,0,0,1,0,0
1,1,https://www.youtube.com,benign,0,23,15,0,0,3,0,...,1,0,18,0,1,0,0,1,0,0
2,2,https://www.facebook.com,benign,0,24,16,0,0,3,0,...,1,0,19,0,1,0,0,1,0,0
3,3,https://www.baidu.com,benign,0,21,13,0,0,3,0,...,1,0,16,0,1,0,0,1,0,0
4,4,https://www.wikipedia.org,benign,0,25,17,0,0,3,0,...,1,0,20,0,1,0,0,1,0,0


#### 4. Entropy
- Shannon Entropy

In [12]:
# Shannon Entropy
import math
def shannon_entropy(url):
    url = str(url)
    if not url:
        return 0
    freq = {}
    for char in url:
        freq[char] = freq.get(char, 0) + 1
    entropy = 0.0
    length = len(url)
    for char, count in freq.items():
        p = count / length
        entropy -= p * math.log2(p)
    return entropy
urldata["shannon_entropy"] = urldata["url"].apply(shannon_entropy)

urldata.head()

Unnamed: 0.1,Unnamed: 0,url,label,result,url_length,hostname_length,path_length,first_directory_length,tld_length,query_length,...,digit_count,letter_count,dir_count,subdomain_count,query_param_count,has_ip_address,uses_https,suspicious_extension,uses_shortening,shannon_entropy
0,0,https://www.google.com,benign,0,22,14,0,0,3,0,...,0,17,0,1,0,0,1,0,0,3.663533
1,1,https://www.youtube.com,benign,0,23,15,0,0,3,0,...,0,18,0,1,0,0,1,0,0,3.762267
2,2,https://www.facebook.com,benign,0,24,16,0,0,3,0,...,0,19,0,1,0,0,1,0,0,3.855389
3,3,https://www.baidu.com,benign,0,21,13,0,0,3,0,...,0,16,0,1,0,0,1,0,0,3.88018
4,4,https://www.wikipedia.org,benign,0,25,17,0,0,3,0,...,0,20,0,1,0,0,1,0,0,3.813661


#### 5. Ratios
- Digit Ratio
- Letter Ratio
- Special Character Ratio

In [13]:
# Digit Ratio
def digit_ratio(url):
    url = str(url)
    digits = sum(c.isdigit() for c in url)
    length = len(url)
    return digits / length if length > 0 else 0
urldata["digit_ratio"] = urldata["url"].apply(digit_ratio)

# Letter Ratio
def letter_ratio(url):
    url = str(url)
    letters = sum(c.isalpha() for c in url)
    length = len(url)
    return letters / length if length > 0 else 0
urldata["letter_ratio"] = urldata["url"].apply(letter_ratio)

# Special Character Ratio
def special_char_ratio(url):
    url = str(url)
    special_chars = "!@#$%^&*()-_=+[]{}|;:'\",.<>?/\\`~"
    special_count = sum(1 for c in url if c in special_chars)
    length = len(url)
    return special_count / length if length > 0 else 0
urldata["special_char_ratio"] = urldata["url"].apply(special_char_ratio)

## Uppercase to length ratio
urldata['num_uppercase'] = urldata['url'].apply(lambda x: sum(1 for c in x if c.isupper()))
urldata['uppercase_ratio'] = urldata['num_uppercase'] / urldata['total_len']

urldata.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0,1,2,3,4
url,https://www.google.com,https://www.youtube.com,https://www.facebook.com,https://www.baidu.com,https://www.wikipedia.org
label,benign,benign,benign,benign,benign
result,0,0,0,0,0
url_length,22,23,24,21,25
hostname_length,14,15,16,13,17
path_length,0,0,0,0,0
first_directory_length,0,0,0,0,0
tld_length,3,3,3,3,3
query_length,0,0,0,0,0


## Handling Missing Values

In [14]:
df = urldata
print("Missing values per column after feature engineering:")
# Update feature_cols to include all new columns
feature_cols = ['url_length', 'hostname_length', 'path_length', 'first_directory_length', 'tld_length', 'query_length', 'special_char_count', 'count-www', 'digit_count', 'letter_count', 'dir_count', 'subdomain_count', 'query_param_count', 'has_ip_address', 'uses_https', 'suspicious_extension', 'uses_shortening', 'shannon_entropy', 'digit_ratio', 'letter_ratio', 'special_char_ratio']
missing = df[feature_cols].isnull().sum()
print(missing[missing > 0])

if df[feature_cols].isnull().sum().sum() == 0:
 print("\nNo missing values found in features!")
else:
 print(f"\nTotal missing values: {df[feature_cols].isnull().sum().sum()}")

Missing values per column after feature engineering:
Series([], dtype: int64)

No missing values found in features!


## Handling Duplicates

In [15]:
# Check if same URL has multiple different labels
# Note: The original project used 'type' column with multiple categories. The preprocessing notebook uses 'label' and 'result'.
# We will use the 'label' column for checking conflicts, as 'result' is the numerical target (0 or 1).
duplicate_urls = df[df['url'].duplicated(keep=False)]

if len(duplicate_urls) > 0:
 # Group by URL and check if they have different labels
 url_type_check = duplicate_urls.groupby('url')['label'].nunique()

 # URLs with multiple different labels
 conflicting_urls = url_type_check[url_type_check > 1]

 print(f"Total duplicate URL entries: {len(duplicate_urls)}")
 print(f"URLs with conflicting labels: {len(conflicting_urls)}")

 if len(conflicting_urls) > 0:
  print("\nExamples of URLs with multiple labels:")
  for url in conflicting_urls.index[:5]: # Show first 5
   print(f"\nURL: {url}")
   print(df[df['url'] == url][['url', 'label']])
 else:
  print("\nAll duplicate URLs have the same label - safe to remove duplicates")
else:
 print("No duplicate URLs found!")

No duplicate URLs found!


# Removing duplicate

In [16]:
# URLs with conflicting labels
duplicate_urls = df[df['url'].duplicated(keep=False)]
url_type_check = duplicate_urls.groupby('url')['label'].nunique()
conflicting_urls = url_type_check[url_type_check > 1].index.tolist()

print(f"URLs with conflicting labels: {len(conflicting_urls)}")
print("These URLs will be REMOVED due to unreliable labeling:")
for url in conflicting_urls:
 labels = df[df['url'] == url]['label'].unique()
 print(f" {url}: {labels}")

# Remove conflicting URLs: unreliable data
print(f"\nRemoving {len(df[df['url'].isin(conflicting_urls)])} rows with conflicting labels")
df_clean = df[~df['url'].isin(conflicting_urls)]

# Remove remaining duplicates: same URL, same label: reliable
print(f"Removing remaining duplicates (same URL, same label) - {df_clean.duplicated(subset=['url']).sum()} rows")
df_clean = df_clean.drop_duplicates(subset=['url'], keep='first')

print(f"\nFinal clean dataset shape: {df_clean.shape}")
df = df_clean

URLs with conflicting labels: 0
These URLs will be REMOVED due to unreliable labeling:

Removing 0 rows with conflicting labels
Removing remaining duplicates (same URL, same label) - 0 rows

Final clean dataset shape: (450176, 25)


# Splittting and Scaling

In [17]:
# The 'result' column is already in numerical form (0/1) from the preprocessing notebook.
y = df['result']

# Select the 21 features generated in the preprocessing step
feature_cols = ['url_length', 'hostname_length', 'path_length', 'first_directory_length', 'tld_length', 'query_length', 'special_char_count', 'count-www', 'digit_count', 'letter_count', 'dir_count', 'subdomain_count', 'query_param_count', 'has_ip_address', 'uses_https', 'suspicious_extension', 'uses_shortening', 'shannon_entropy', 'digit_ratio', 'letter_ratio', 'special_char_ratio']

X = df[feature_cols]

print(f"Feature DataFrame shape: {X.shape}")
print(f"Target Series shape: {y.shape}")
X.head()

Feature DataFrame shape: (450176, 21)
Target Series shape: (450176,)


Unnamed: 0,url_length,hostname_length,path_length,first_directory_length,tld_length,query_length,special_char_count,count-www,digit_count,letter_count,...,subdomain_count,query_param_count,has_ip_address,uses_https,suspicious_extension,uses_shortening,shannon_entropy,digit_ratio,letter_ratio,special_char_ratio
0,22,14,0,0,3,0,5,1,0,17,...,1,0,0,1,0,0,3.663533,0.0,0.772727,0.227273
1,23,15,0,0,3,0,5,1,0,18,...,1,0,0,1,0,0,3.762267,0.0,0.782609,0.217391
2,24,16,0,0,3,0,5,1,0,19,...,1,0,0,1,0,0,3.855389,0.0,0.791667,0.208333
3,21,13,0,0,3,0,5,1,0,16,...,1,0,0,1,0,0,3.88018,0.0,0.761905,0.238095
4,25,17,0,0,3,0,5,1,0,20,...,1,0,0,1,0,0,3.813661,0.0,0.8,0.2


In [18]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

print("\nClass distribution in training set:")
train_dist = pd.Series(y_train).value_counts()
print(f" Benign (0): {train_dist[0]:,} ({train_dist[0]/len(y_train)*100:.1f}%)")
print(f" Malicious (1): {train_dist[1]:,} ({train_dist[1]/len(y_train)*100:.1f}%)")

print("\nClass distribution in test set:")
test_dist = pd.Series(y_test).value_counts()
print(f" Benign (0): {test_dist[0]:,} ({test_dist[0]/len(y_test)*100:.1f}%)")
print(f" Malicious (1): {test_dist[1]:,} ({test_dist[1]/len(y_test)*100:.1f}%)")

X_train shape: (360140, 21)
X_test shape: (90036, 21)
y_train shape: (360140,)
y_test shape: (90036,)

Class distribution in training set:
 Benign (0): 276,590 (76.8%)
 Malicious (1): 83,550 (23.2%)

Class distribution in test set:
 Benign (0): 69,148 (76.8%)
 Malicious (1): 20,888 (23.2%)


In [20]:
from sklearn.preprocessing import StandardScaler

# Scale features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)

print("\nFeature scaling complete!")
print(f"Scaled training set shape: {X_train_scaled.shape}")
print(f"Scaled test set shape: {X_test_scaled.shape}")


Feature scaling complete!
Scaled training set shape: (360140, 21)
Scaled test set shape: (90036, 21)


# Model Development
## Logistic Regression

In [21]:
# Model Development
## Logistic Regression
print("Logistic Regression:")
start_time = time.time()
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train_scaled, y_train)
lr_train_time = time.time() - start_time

y_pred_lr = lr_model.predict(X_test_scaled)
y_pred_lr_proba = lr_model.predict_proba(X_test_scaled)[:, 1]

print(f"Training completed in {lr_train_time:.2f} seconds")

Logistic Regression:
Training completed in 1.69 seconds


## SVM

In [None]:
## SVM
from sklearn.svm import SVC

print("SVM")
start_time = time.time()
svm_model = SVC(kernel='rbf', random_state=42, probability=True)
svm_model.fit(X_train_scaled, y_train)
svm_train_time = time.time() - start_time

y_pred_svm = svm_model.predict(X_test_scaled)
y_pred_svm_proba = svm_model.predict_proba(X_test_scaled)[:, 1]

print(f"Training completed in {svm_train_time:.2f} seconds")

SVM


## XGBoost

In [22]:
from xgboost import XGBClassifier

print("XGBoost")
start_time = time.time()
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False)
xgb_model.fit(X_train_scaled, y_train)
xgb_train_time = time.time() - start_time

y_pred_xgb = xgb_model.predict(X_test_scaled)
y_pred_xgb_proba = xgb_model.predict_proba(X_test_scaled)[:, 1]

print(f"Training completed in {xgb_train_time:.2f} seconds")

XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training completed in 2.61 seconds


# Evaluation

In [23]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_curve, auc
)

def evaluate_model(y_true, y_pred, y_pred_proba, model_name):
    print(f"\n{'=' * 70}")
    print(f"{model_name}")
    print(f"{'=' * 70}")

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"\nPerformance Metrics:")
    print(f"  Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"  Precision: {precision:.4f} ({precision*100:.2f}%)")
    print(f"  Recall:    {recall:.4f} ({recall*100:.2f}%)")
    print(f"  F1-Score:  {f1:.4f}")

    cm = confusion_matrix(y_true, y_pred)
    print(f"\nConfusion Matrix:")
    print(f"                     Predicted")
    print(f"                Benign    Malicious")
    print(f"Actual Benign     {cm[0][0]:7,}   {cm[0][1]:7,}")
    print(f"       Malicious  {cm[1][0]:7,}   {cm[1][1]:7,}")

    # ROC-AUC
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    print(f"\nROC-AUC Score: {roc_auc:.4f}")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'confusion_matrix': cm,
        'fpr': fpr,
        'tpr': tpr
    }

In [25]:
print("Baseline Model Evaluation Results")
print("=" * 70)

lr_results = evaluate_model(y_test, y_pred_lr, y_pred_lr_proba, "Logistic Regression")
svm_results = evaluate_model(y_test, y_pred_svm, y_pred_svm_proba, "SVM")
xgb_results = evaluate_model(y_test, y_pred_xgb, y_pred_xgb_proba, "XGBoost")

Baseline Model Evaluation Results

Logistic Regression

Performance Metrics:
  Accuracy:  0.9964 (99.64%)
  Precision: 0.9930 (99.30%)
  Recall:    0.9915 (99.15%)
  F1-Score:  0.9923

Confusion Matrix:
                     Predicted
                Benign    Malicious
Actual Benign      69,003       145
       Malicious      177    20,711

ROC-AUC Score: 0.9971

XGBoost

Performance Metrics:
  Accuracy:  0.9976 (99.76%)
  Precision: 0.9976 (99.76%)
  Recall:    0.9922 (99.22%)
  F1-Score:  0.9949

Confusion Matrix:
                     Predicted
                Benign    Malicious
Actual Benign      69,098        50
       Malicious      162    20,726

ROC-AUC Score: 0.9992
