## Benchmark Model

- Here, a very simple logistic regression model is trained using only `URLLength` as a feature
- This will serve as a benchmark result against which to compare the performance of the final model

## Import libraries

In [None]:
# Data Handling
import pandas as pd

# Model Selection
from sklearn.model_selection import train_test_split

# Models
from sklearn.linear_model import LogisticRegression

# Evaluation Metrics
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

## Prepare data

In [None]:
# Location of the raw dataset CSV in S3.
# NOTE(review): pandas reads s3:// URLs through fsspec, so s3fs presumably
# needs to be installed in this environment -- confirm.
bucket_name = 'bucket_name'
prefix = 'initial_dataset/PhiUSIIL_Phishing_URL_Dataset 3.csv'
s3_path = f's3://{bucket_name}/{prefix}'

# Load the full dataset into memory
df = pd.read_csv(s3_path)

# The benchmark deliberately uses a single predictor: the URL length
selected_features = ['URLLength']

# Target vector and (one-column) feature matrix
y = df['label']
X = df[selected_features]

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [None]:
# Sanity check: show the first five rows of the training features
print(X_train.head(n=5))

## Train Model

In [None]:
# Benchmark model: logistic regression on the single URLLength feature.
# 'lbfgs' is scikit-learn's default solver and works well here; max_iter is
# raised to 500 to give the optimizer more iterations to converge.
baseline_model = LogisticRegression(
    solver='lbfgs',
    max_iter=500,
)
baseline_model.fit(X_train, y_train)

## Evaluate Model

In [None]:
# Predict hard class labels for the held-out test set
y_pred = baseline_model.predict(X_test)

# Compute every metric up front, then report them together below.
# precision / recall / F1 are called with scikit-learn's defaults, so they
# score the class labelled 1.
# NOTE(review): confirm which class label 1 denotes in this dataset.
conf_matrix = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Report the confusion matrix followed by the scalar metrics
print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1 Score:", f1)

# Per-class breakdown of precision, recall, F1 and support
print("\nClassification Report:\n", classification_report(y_test, y_pred))