In [2]:
!pip install xgboost cudf-cu12 cuml-cu12 --extra-index-url=https://pypi.nvidia.com --quiet


In [12]:
# ==========================================================
# 🧠 GPU-Accelerated Phishing URL Detection (Balanced Version)
#  ✅ XGBoost (GPU) + cuML RandomForest (GPU)
#  ✅ SMOTE Oversampling + Threshold Tuning
#  ✅ False Positive Correction for Trusted Domains
#  ✅ Model Saving (.pkl via joblib)
# ==========================================================

!pip install xgboost cudf-cu12 cuml-cu12 imbalanced-learn joblib --extra-index-url=https://pypi.nvidia.com --quiet

import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier
from cuml.ensemble import RandomForestClassifier as cuRF
from imblearn.over_sampling import SMOTE
import joblib
import warnings
warnings.filterwarnings('ignore')


class URLSafetyGPU:
    def __init__(self):
        self.model = None
        self.label_encoder = LabelEncoder()

    def calculate_entropy(self, string):
        if not string:
            return 0
        prob = [float(string.count(c)) / len(string) for c in set(string)]
        return -sum(p * np.log2(p) for p in prob if p > 0)

    def extract_url_features(self, urls):
        feats = []
        for url in urls:
            p = urlparse(url)
            feats.append({
                "length": len(url),
                "num_dots": url.count('.'),
                "num_hyphens": url.count('-'),
                "num_digits": sum(c.isdigit() for c in url),
                "num_slashes": url.count('/'),
                "entropy": self.calculate_entropy(url),
                "has_ip": int(bool(re.search(r'\d+\.\d+\.\d+\.\d+', url))),
                "suspicious_tld": int(any(t in url.lower() for t in ['.tk','.ml','.ga','.cf','.cc','.pw'])),
                "phish_keywords": sum(k in url.lower() for k in
                                      ['secure','account','update','bank','paypal','login','verify']),
                "domain_len": len(p.netloc),
                "path_len": len(p.path),
            })
        return pd.DataFrame(feats)

    def prepare_data(self, df, url_col='url', label_col='type'):
        X = self.extract_url_features(df[url_col])
        y = df[label_col].apply(
            lambda x: 'safe' if str(x).lower() in ['benign','benigndefacement'] else 'not_safe'
        )
        y_encoded = self.label_encoder.fit_transform(y)
        return X, y_encoded, y

    def train_xgboost_gpu(self, X, y_encoded):
        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
        )

        # 🧩 Balance with SMOTE
        print("\n⚖️ Applying SMOTE Oversampling...")
        smote = SMOTE(random_state=42)
        X_tr_bal, y_tr_bal = smote.fit_resample(X_tr, y_tr)
        print(f"After SMOTE: {np.bincount(y_tr_bal)} (balanced)")

        print("\n⚡ Training XGBoost (GPU)...")
        self.model = XGBClassifier(
            n_estimators=250,
            learning_rate=0.1,
            max_depth=6,
            tree_method='gpu_hist',
            predictor='gpu_predictor',
            random_state=42,
            eval_metric='logloss'
        )
        self.model.fit(X_tr_bal, y_tr_bal)

        probs = self.model.predict_proba(X_te)[:, 1]
        preds = (probs > 0.4).astype(int)

        acc = accuracy_score(y_te, preds)
        print(f"✅ XGBoost (GPU) Accuracy: {acc:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_te, preds, target_names=['safe','not_safe']))
        print("Confusion Matrix:\n", confusion_matrix(y_te, preds))

        # 💾 Save the trained model and encoder
        joblib.dump(self.model, "xgboost_gpu_model.pkl")
        joblib.dump(self.label_encoder, "label_encoder.pkl")
        print("📦 Saved: xgboost_gpu_model.pkl & label_encoder.pkl")

    def train_random_forest_gpu(self, X, y_encoded):
        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
        )
        smote = SMOTE(random_state=42)
        X_tr_bal, y_tr_bal = smote.fit_resample(X_tr, y_tr)

        print("\n⚡ Training cuML Random Forest (GPU)...")
        self.model = cuRF(
            n_estimators=200,
            max_depth=12,
            bootstrap=True,
            random_state=42,
            n_streams=4
        )
        self.model.fit(X_tr_bal, y_tr_bal)

        preds = self.model.predict(X_te)
        acc = accuracy_score(y_te, preds)
        print(f"✅ Random Forest (GPU) Accuracy: {acc:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_te, preds, target_names=['safe','not_safe']))
        print("Confusion Matrix:\n", confusion_matrix(y_te, preds))

        # 💾 Save Random Forest model
        joblib.dump(self.model, "random_forest_gpu_model.pkl")
        print("📦 Saved: random_forest_gpu_model.pkl")

    # ------------------ URL Prediction (with False Positive Check) ------------------
    def predict_url(self, url):
        feats = self.extract_url_features([url])
        pred = None

        if hasattr(self.model, "predict_proba"):
            try:
                prob = self.model.predict_proba(feats)[0][1]
                pred = 1 if prob > 0.4 else 0
            except Exception:
                pass

        if pred is None:
            pred = int(self.model.predict(feats)[0])

        label = self.label_encoder.inverse_transform([pred])[0]

        # ✅ False Positive Correction
        safe_domains = [
            'google.com', 'youtube.com', 'microsoft.com', 'apple.com',
            'amazon.com', 'wikipedia.org', 'facebook.com', 'instagram.com',
            'linkedin.com', 'yahoo.com'
        ]

        for domain in safe_domains:
            if domain in url.lower() and label == 'not_safe':
                print(f"⚠️ False Positive Detected: {url}")
                print(f"✅ Corrected → Safe (Trusted Domain Match: {domain})")
                label = 'safe'
                break

        return label


def main():
    print("📂 Loading dataset ...")
    df = pd.read_csv('/content/drive/MyDrive/malicious_phish.csv')
    print(f"✅ Loaded {len(df)} samples")

    clf = URLSafetyGPU()
    print("\n🔧 Extracting features ...")
    X, y_encoded, y_original = clf.prepare_data(df)
    print(f"Feature shape: {X.shape}")
    print(pd.Series(y_original).value_counts())

    # Train and Save Models
    clf.train_xgboost_gpu(X, y_encoded)
    clf.train_random_forest_gpu(X, y_encoded)

    print("\n================= TESTING URLs =================")
    test_urls = [
        'https://www.google.com',
        'http://paypal-update.tk/verify-account',
        'http://192.168.1.1/login'
    ]
    for u in test_urls:
        pred = clf.predict_url(u)
        print(f"🔍 {u} → {pred}")


if __name__ == "__main__":
    main()


📂 Loading dataset ...
✅ Loaded 651192 samples

🔧 Extracting features ...
Feature shape: (651192, 11)
type
safe        428104
not_safe    223088
Name: count, dtype: int64

⚖️ Applying SMOTE Oversampling...
After SMOTE: [342483 342483] (balanced)

⚡ Training XGBoost (GPU)...
✅ XGBoost (GPU) Accuracy: 0.9436

Classification Report:
              precision    recall  f1-score   support

        safe       0.93      0.91      0.92     44618
    not_safe       0.95      0.96      0.96     85621

    accuracy                           0.94    130239
   macro avg       0.94      0.94      0.94    130239
weighted avg       0.94      0.94      0.94    130239

Confusion Matrix:
 [[40492  4126]
 [ 3213 82408]]
📦 Saved: xgboost_gpu_model.pkl & label_encoder.pkl

⚡ Training cuML Random Forest (GPU)...
✅ Random Forest (GPU) Accuracy: 0.9381

Classification Report:
              precision    recall  f1-score   support

        safe       0.90      0.92      0.91     44618
    not_safe       0.96      