#Import Libraries

In [1]:
import pandas as pd
import sklearn
import numpy as np  # Added this import
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import warnings
import time

#Evaluation Function

In [2]:
def evaluate_model(model, X, y, fold_count=10):
    """
    Evaluate model using k-fold cross validation and return various metrics
    """
    kf = KFold(n_splits=fold_count, shuffle=True, random_state=42)
    scores = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}
    train_times = []
    test_times = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Measure training time
        train_start = time.time()
        model.fit(X_train, y_train)
        train_end = time.time()
        train_times.append(train_end - train_start)

        # Measure testing time
        test_start = time.time()
        y_pred = model.predict(X_test)
        test_end = time.time()
        test_times.append(test_end - test_start)

        # Calculate metrics
        scores['accuracy'].append(accuracy_score(y_test, y_pred))
        scores['precision'].append(precision_score(y_test, y_pred))
        scores['recall'].append(recall_score(y_test, y_pred))
        scores['f1'].append(f1_score(y_test, y_pred))

    return {
        'accuracy': np.mean(scores['accuracy']),
        'precision': np.mean(scores['precision']),
        'recall': np.mean(scores['recall']),
        'f1': np.mean(scores['f1']),
        'train_time': np.mean(train_times),
        'test_time': np.mean(test_times)
    }

In [3]:
def mean_score(scoring):
    return {i:j.mean() for i,j in scoring.items()}

#Loading Data

In [4]:
# Load and prepare data with visible index
df = pd.read_csv("dataset.csv", index_col=0)
df = sklearn.utils.shuffle(df, random_state=42)
df = df.reset_index()

# Transform labels from [-1, 1] to [0, 1] for XGBoost compatibility
df['Result'] = (df['Result'] + 1) // 2  # Convert -1 to 0, and 1 stays 1

X = df.drop("Result", axis=1).values
X = preprocessing.scale(X)
y = df['Result'].values
fold_count = 10
print("\n5 random rows of data:")
print(df.sample(n=5, random_state=42).to_string())


5 random rows of data:
       having_IP_Address  URL_Length  Shortining_Service  double_slash_redirecting  Prefix_Suffix  having_Sub_Domain  SSLfinal_State  suspicious_parameter  Favicon  HTTPS_token  Request_URL  URL_of_Anchor  Links_in_tags  SFH  Submitting_to_email  Abnormal_URL  Redirect  on_mouseover  RightClick  popUpWidnow  Iframe  age_of_domain  DNSRecord  web_traffic  Links_pointing_to_page  Statistical_report  Result
12093                  1           1                   1                         1              1                  0               1                     1        1            1            1             -1              0    0                   -1            -1         1             1          -1            1      -1              1          1            0                       0                   1       0
15253                  1           1                   1                         1              1                  0              -1                     1      

# Classication models for phishing detection:


Logistic Regression

In [5]:
print("\nLogistic Regression Classification Results:")
logistic_clf = LogisticRegression(random_state=42)
lr_scores = evaluate_model(logistic_clf, X, y)
print("Logistic Regression Metrics:")
print(f"Accuracy: {lr_scores['accuracy']:.4f}")
print(f"Recall: {lr_scores['recall']:.4f}")
print(f"Precision: {lr_scores['precision']:.4f}")
print(f"F1 Score: {lr_scores['f1']:.4f}")
print(f"Average Training Time: {lr_scores['train_time']:.4f}s")
print(f"Average Testing Time: {lr_scores['test_time']:.4f}s")


Logistic Regression Classification Results:
Logistic Regression Metrics:
Accuracy: 0.9338
Recall: 0.9046
Precision: 0.9261
F1 Score: 0.9152
Average Training Time: 0.1621s
Average Testing Time: 0.0021s


XGBoost

In [6]:
print("\nXGBoost Classification Results:")
XGB_clf = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_scores = evaluate_model(XGB_clf, X, y)

print("XGBoost Metrics:")
print(f"Accuracy: {xgb_scores['accuracy']:.4f}")
print(f"Recall: {xgb_scores['recall']:.4f}")
print(f"Precision: {xgb_scores['precision']:.4f}")
print(f"F1 Score: {xgb_scores['f1']:.4f}")
print(f"Average Training Time: {xgb_scores['train_time']:.4f}s")
print(f"Average Testing Time: {xgb_scores['test_time']:.4f}s")


XGBoost Classification Results:


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBoost Metrics:
Accuracy: 0.9677
Recall: 0.9628
Precision: 0.9559
F1 Score: 0.9593
Average Training Time: 0.4225s
Average Testing Time: 0.0109s
