In [1]:
# ============================================================
# 1. Imports
# ============================================================
import pandas as pd
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix

# ============================================================
# 2. Load dataset
# ============================================================
# Path of the CSV file that is read into the working DataFrame.
file_path = "/content/feature_extracted_data.csv"

# Parse the whole file with pandas' default CSV settings.
df = pd.read_csv(filepath_or_buffer=file_path)

# Report (rows, columns) of the frame exactly as loaded, before any column is dropped.
print("Initial shape:", df.shape)

# ============================================================
# 3. Drop text column (URL is not used directly)
# ============================================================
# errors="ignore" makes this a no-op when the frame has no "url" column,
# so the cell can be re-run safely after the column is already gone.
df = df.drop(["url"], axis=1, errors="ignore")

# ============================================================
# 4. Prepare features and label
# ============================================================
# Target: the "label" column cast to integers.
y = df["label"].astype(int)

# Features: every remaining column once "label" is removed.
X = df.drop("label", axis=1)

print("Feature matrix:", X.shape)
print("Target vector:", y.shape)

# ============================================================
# 5. Train-test split
# ============================================================
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Report the shape of each feature partition.
for _caption, _part in (("Train shape:", X_train), ("Test shape:", X_test)):
    print(_caption, _part.shape)

# ============================================================
# 6. Compute class weight (important for recall)
# ============================================================
# Count each class in the TRAINING labels only, so the weight is not
# influenced by the held-out test rows.
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()

# Guard the ratio: with zero positives, NumPy integer division yields `inf`
# (with only a RuntimeWarning), which would then be handed to LightGBM as the
# positive-class weight. Fail early with an explicit message instead.
if pos == 0:
    raise ValueError(
        "y_train contains no positive samples (label == 1); "
        "cannot compute scale_pos_weight."
    )

# Ratio of negatives to positives; a value close to 1.0 means the training
# set is roughly balanced.
scale_pos_weight = neg / pos

print("scale_pos_weight:", scale_pos_weight)

# ============================================================
# 7. Train LightGBM model
# ============================================================
# Hyperparameters are collected in one mapping so they can be read (and
# tweaked) in a single place before the estimator is built.
lgbm_params = {
    "objective": "binary",
    "n_estimators": 100,
    "learning_rate": 0.1,
    "num_leaves": 15,
    "scale_pos_weight": scale_pos_weight,  # class-weight ratio computed from y_train
    "random_state": 42,
}
model = lgb.LGBMClassifier(**lgbm_params)

# Fit on the training partition only.
model.fit(X_train, y_train)

# ============================================================
# 8. Quick evaluation (DEFAULT threshold = 0.5)
# ============================================================
# Hard class predictions for the held-out rows, as returned by model.predict.
y_pred = model.predict(X_test)

# Each metric is computed and printed in turn: recall, precision, then F1.
for _caption, _metric in (
    ("\nRecall:", recall_score),
    ("Precision:", precision_score),
    ("F1-score:", f1_score),
):
    print(_caption, _metric(y_test, y_pred))

# Rows of the matrix are true classes, columns are predicted classes.
print("\nConfusion Matrix:")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))


Initial shape: (106494, 7)
Feature matrix: (106494, 5)
Target vector: (106494,)
Train shape: (74545, 5)
Test shape: (31949, 5)
scale_pos_weight: 0.9999731709280176
[LightGBM] [Info] Number of positive: 37273, number of negative: 37272
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007240 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 416
[LightGBM] [Info] Number of data points in the train set: 74545, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500007 -> initscore=0.000027
[LightGBM] [Info] Start training from score 0.000027

Recall: 0.9943658444973081
Precision: 0.9969871955812202
F1-score: 0.9956747947094591

Confusion Matrix:
[[15927    48]
 [   90 15884]]


In [2]:
import json

import joblib

# Save the trained LightGBM model
model_path = "url_phishing_lgbm_model.pkl"
joblib.dump(model, model_path)

print(f"Model saved to {model_path}")

# Save threshold separately (best practice).
# The threshold is written to a small JSON sidecar file so that code loading
# the pickled model can read the decision cut-off from disk instead of
# hard-coding it again.
# NOTE(review): 0.25 is not derived anywhere in the visible cells, and the
# evaluation cell reports metrics from model.predict (default cut-off), not at
# this threshold — confirm where the value comes from and evaluate at it.
# Presumably applied as `model.predict_proba(X)[:, 1] >= FINAL_THRESHOLD`.
FINAL_THRESHOLD = 0.25
threshold_path = "url_phishing_lgbm_threshold.json"
with open(threshold_path, "w", encoding="utf-8") as threshold_file:
    json.dump({"threshold": FINAL_THRESHOLD}, threshold_file)

print("Final decision threshold:", FINAL_THRESHOLD)


Model saved to url_phishing_lgbm_model.pkl
Final decision threshold: 0.25
