In [None]:
# ============================================================================
# MODEL 1: LOGISTIC REGRESSION - COMPLETE TRAINING PIPELINE
# ============================================================================
# This cell contains the entire Model 1 workflow:
# - Data loading and preprocessing
# - Feature engineering 
# - Model training (Logistic Regression)
# - Evaluation and model saving
# Collapse this cell when working on other models
# ============================================================================

import pandas as pd
import urllib.parse
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Announce the start of the Model 1 run.
print("🚀 Starting Model 1: Logistic Regression Training Pipeline")
print("=" * 60)

# ========== Step 1: Load dataset ==========
print("📂 Loading dataset...")
file_path = "features_10000.csv"
df = pd.read_csv(file_path)

# Normalise the header (trim whitespace, lower-case) and rename a "type"
# column, if present, to "label".
df.columns = df.columns.str.strip().str.lower()
df = df.rename(columns={"type": "label"})

# Fixed-size sample: the first 2500 benign and the first 1500 malicious rows
# in file order, with labels compared case-insensitively. Note this is a
# 2500 / 1500 split, not an even one, and fewer rows are kept if the file
# does not contain that many of a class.
benign_df = df.loc[df["label"].str.lower().eq("benign")].iloc[:2500]
malicious_df = df.loc[df["label"].str.lower().eq("malicious")].iloc[:1500]
sample_df = pd.concat([benign_df, malicious_df], ignore_index=True)
print(f"✅ Dataset loaded: {len(sample_df)} samples ({len(benign_df)} benign, {len(malicious_df)} malicious)")

# ========== Step 2: Feature Engineering ==========
print("\n🔧 Extracting features...")
def extract_features(url):
    """Turn a URL string into the 15 numeric features used by Model 1.

    Args:
        url: The raw URL text.

    Returns:
        A list of 15 ints, in this order: url_length, num_dots, num_hyphens,
        num_at, num_digits, num_params, num_slashes, num_question,
        num_percent, num_special, domain_length, path_length, has_https,
        has_http, has_suspicious_kw. The three ``has_*`` entries are 0/1.

    Raises:
        TypeError: If ``url`` is not a ``str`` (for example a NaN coming
            from a CSV row with a missing URL).
    """
    if not isinstance(url, str):
        raise TypeError(f"url must be a str, got {type(url).__name__}")

    try:
        parsed = urllib.parse.urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. an unbalanced IPv6
        # bracket). Fall back to the purely string-based features.
        parsed = None

    # NOTE: urlparse only recognises a network location introduced by "//".
    # For scheme-less input such as "example.com/login" the hostname is
    # therefore empty (domain_length == 0) and the whole string is the path.
    hostname = (parsed.hostname if parsed is not None else None) or ""
    path = parsed.path if parsed is not None else ""
    scheme = parsed.scheme if parsed is not None else ""

    # Lower-case once instead of once per keyword.
    lowered = url.lower()
    keywords = ("login", "secure", "update", "free", "verify", "bank", "account", "paypal")
    special_chars = ";_?=&"

    return [
        len(url),                                   # url_length
        url.count("."),                             # num_dots
        url.count("-"),                             # num_hyphens
        url.count("@"),                             # num_at
        sum(c.isdigit() for c in url),              # num_digits
        url.count("="),                             # num_params
        url.count("/"),                             # num_slashes
        url.count("?"),                             # num_question
        url.count("%"),                             # num_percent
        sum(c in special_chars for c in url),       # num_special
        len(hostname),                              # domain_length
        len(path),                                  # path_length
        1 if scheme == "https" else 0,              # has_https
        1 if scheme == "http" else 0,               # has_http
        # int, not bool, so every feature has the same type.
        int(any(kw in lowered for kw in keywords)),  # has_suspicious_kw
    ]

# Apply feature extraction: one list of feature values per URL.
features = sample_df["url"].apply(extract_features)
# Column names, in the same order as the list returned by extract_features.
feature_names = [
    "url_length", "num_dots", "num_hyphens", "num_at", "num_digits",
    "num_params", "num_slashes", "num_question", "num_percent", "num_special",
    "domain_length", "path_length", "has_https", "has_http",
    "has_suspicious_kw"
]
features_df = pd.DataFrame(features.tolist(), columns=feature_names)

# Merge with URL + label (indices are reset so the pieces align by position)
final_df = pd.concat([sample_df["url"].reset_index(drop=True),
                      features_df,
                      sample_df["label"].reset_index(drop=True)], axis=1)
print(f"✅ Features extracted: {len(feature_names)} features")

# ========== Step 3: Prepare Data ==========
print("\n📊 Preparing training data...")
X = final_df.drop(columns=["url", "label"])
# Single definition of the class encoding; it is also pickled in Step 6.
label_mapping = {"benign": 0, "malicious": 1}
# Lower-case before mapping: the sampling step selects rows by comparing
# labels case-insensitively, so a row labelled e.g. "Benign" would otherwise
# map to NaN and break the stratified split.
y = final_df["label"].str.lower().map(label_mapping)

# Train/test split (80/20, stratified on the label, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale numeric features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(f"✅ Data prepared: {len(X_train)} training, {len(X_test)} test samples")

# ========== Step 4: Train Logistic Regression ==========
print("\n🤖 Training Logistic Regression model...")
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)
print("✅ Model training completed!")

# ========== Step 5: Evaluate ==========
print("\n📈 Evaluating model performance...")
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ Logistic Regression Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance: coefficients on the scaled features, sorted by signed
# value (largest positive first)
feature_importance = pd.DataFrame({
    "feature": X.columns,
    "coefficient": model.coef_[0]
}).sort_values(by="coefficient", ascending=False)

print("\n🔎 Feature Importance:")
print(feature_importance)

# ========== Step 6: Save Models ==========
print("\n💾 Saving trained models...")

# Save the trained logistic regression model
with open('logistic_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the scaler
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Save the label encoder mapping (for reference)
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_mapping, f)

print("✅ Models saved successfully!")
print("- logistic_model.pkl: Trained logistic regression model")
print("- scaler.pkl: Feature scaler")
print("- label_encoder.pkl: Label mapping")

# Display model summary
print("\n📊 Model 1 Summary:")
print("=" * 40)
print("Model Type: Logistic Regression")
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Features: {len(feature_names)}")
print(f"Test Accuracy: {accuracy:.4f}")
print("=" * 40)
print("🎉 Model 1 training pipeline completed!")

In [3]:
# ============================================================================
# MODEL 2: LOGISTIC REGRESSION WITHOUT HTTPS FEATURE - COMPLETE TRAINING PIPELINE
# ============================================================================
# This cell contains the entire Model 2 workflow (similar to Model 1 but drops has_https):
# - Data loading and preprocessing
# - Feature engineering (14 features instead of 15)
# - Model training (Logistic Regression)
# - Evaluation and model saving
# Collapse this cell when working on other models
# ============================================================================

import pandas as pd
import urllib.parse
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Announce the start of the Model 2 run.
print("🚀 Starting Model 2: Logistic Regression Training Pipeline (No HTTPS Feature)")
print("=" * 70)

# ========== Step 1: Load dataset ==========
print("📂 Loading dataset...")
file_path = "features_10000.csv"
df = pd.read_csv(file_path)

# Normalise the header (trim whitespace, lower-case) and rename a "type"
# column, if present, to "label".
df.columns = df.columns.str.strip().str.lower()
df = df.rename(columns={"type": "label"})

# Fixed-size sample: the first 2500 benign and the first 1500 malicious rows
# in file order, with labels compared case-insensitively. Note this is a
# 2500 / 1500 split, not an even one, and fewer rows are kept if the file
# does not contain that many of a class.
benign_df = df.loc[df["label"].str.lower().eq("benign")].iloc[:2500]
malicious_df = df.loc[df["label"].str.lower().eq("malicious")].iloc[:1500]
sample_df = pd.concat([benign_df, malicious_df], ignore_index=True)
print(f"✅ Dataset loaded: {len(sample_df)} samples ({len(benign_df)} benign, {len(malicious_df)} malicious)")

# ========== Step 2: Feature Engineering (WITHOUT has_https) ==========
print("\n🔧 Extracting features (excluding has_https)...")
def extract_features_no_https(url):
    """Turn a URL string into the 14 numeric features used by Model 2.

    Same feature set as Model 1 except that ``has_https`` is left out.

    Args:
        url: The raw URL text.

    Returns:
        A list of 14 ints, in this order: url_length, num_dots, num_hyphens,
        num_at, num_digits, num_params, num_slashes, num_question,
        num_percent, num_special, domain_length, path_length, has_http,
        has_suspicious_kw. The two ``has_*`` entries are 0/1.

    Raises:
        TypeError: If ``url`` is not a ``str`` (for example a NaN coming
            from a CSV row with a missing URL).
    """
    if not isinstance(url, str):
        raise TypeError(f"url must be a str, got {type(url).__name__}")

    try:
        parsed = urllib.parse.urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. an unbalanced IPv6
        # bracket). Fall back to the purely string-based features.
        parsed = None

    # NOTE: urlparse only recognises a network location introduced by "//".
    # For scheme-less input such as "example.com/login" the hostname is
    # therefore empty (domain_length == 0) and the whole string is the path.
    hostname = (parsed.hostname if parsed is not None else None) or ""
    path = parsed.path if parsed is not None else ""
    scheme = parsed.scheme if parsed is not None else ""

    # Lower-case once instead of once per keyword.
    lowered = url.lower()
    keywords = ("login", "secure", "update", "free", "verify", "bank", "account", "paypal")
    special_chars = ";_?=&"

    # NOTE: has_https is intentionally not part of this feature vector.
    return [
        len(url),                                   # url_length
        url.count("."),                             # num_dots
        url.count("-"),                             # num_hyphens
        url.count("@"),                             # num_at
        sum(c.isdigit() for c in url),              # num_digits
        url.count("="),                             # num_params
        url.count("/"),                             # num_slashes
        url.count("?"),                             # num_question
        url.count("%"),                             # num_percent
        sum(c in special_chars for c in url),       # num_special
        len(hostname),                              # domain_length
        len(path),                                  # path_length
        1 if scheme == "http" else 0,               # has_http
        # int, not bool, so every feature has the same type.
        int(any(kw in lowered for kw in keywords)),  # has_suspicious_kw
    ]

# Apply feature extraction: one list of feature values per URL.
features = sample_df["url"].apply(extract_features_no_https)
# Column names, in the same order as the list returned by
# extract_features_no_https.
feature_names = [
    "url_length", "num_dots", "num_hyphens", "num_at", "num_digits",
    "num_params", "num_slashes", "num_question", "num_percent", "num_special",
    "domain_length", "path_length", "has_http",
    "has_suspicious_kw"
]
features_df = pd.DataFrame(features.tolist(), columns=feature_names)

# Merge with URL + label (indices are reset so the pieces align by position)
final_df = pd.concat([sample_df["url"].reset_index(drop=True),
                      features_df,
                      sample_df["label"].reset_index(drop=True)], axis=1)
print(f"✅ Features extracted: {len(feature_names)} features (dropped has_https)")

# ========== Step 3: Prepare Data ==========
print("\n📊 Preparing training data...")
X = final_df.drop(columns=["url", "label"])
# Single definition of the class encoding; it is also pickled in Step 6.
label_mapping = {"benign": 0, "malicious": 1}
# Lower-case before mapping: the sampling step selects rows by comparing
# labels case-insensitively, so a row labelled e.g. "Benign" would otherwise
# map to NaN and break the stratified split.
y = final_df["label"].str.lower().map(label_mapping)

# Train/test split (80/20, stratified on the label, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale numeric features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(f"✅ Data prepared: {len(X_train)} training, {len(X_test)} test samples")

# ========== Step 4: Train Logistic Regression ==========
print("\n🤖 Training Logistic Regression model...")
model_v2 = LogisticRegression(max_iter=1000)
model_v2.fit(X_train_scaled, y_train)
print("✅ Model training completed!")

# ========== Step 5: Evaluate ==========
print("\n📈 Evaluating model performance...")
y_pred = model_v2.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ Logistic Regression Accuracy (No HTTPS): {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance: coefficients on the scaled features, sorted by signed
# value (largest positive first)
feature_importance = pd.DataFrame({
    "feature": X.columns,
    "coefficient": model_v2.coef_[0]
}).sort_values(by="coefficient", ascending=False)

print("\n🔎 Feature Importance:")
print(feature_importance)

# ========== Step 6: Save Models ==========
print("\n💾 Saving trained models...")

# Save the trained logistic regression model (v2)
with open('logistic_model_v2.pkl', 'wb') as f:
    pickle.dump(model_v2, f)

# Save the scaler (v2)
with open('scaler_v2.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Save the label encoder mapping (for reference)
with open('label_encoder_v2.pkl', 'wb') as f:
    pickle.dump(label_mapping, f)

print("✅ Models saved successfully!")
print("- logistic_model_v2.pkl: Trained logistic regression model (no HTTPS)")
print("- scaler_v2.pkl: Feature scaler")
print("- label_encoder_v2.pkl: Label mapping")

# Display model summary
print("\n📊 Model 2 Summary:")
print("=" * 40)
print("Model Type: Logistic Regression (No HTTPS)")
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Features: {len(feature_names)} (dropped has_https)")
print(f"Test Accuracy: {accuracy:.4f}")
print("=" * 40)
print("🎉 Model 2 training pipeline completed!")

🚀 Starting Model 2: Logistic Regression Training Pipeline (No HTTPS Feature)
📂 Loading dataset...
✅ Dataset loaded: 4000 samples (2500 benign, 1500 malicious)

🔧 Extracting features (excluding has_https)...
✅ Features extracted: 14 features (dropped has_https)

📊 Preparing training data...
✅ Data prepared: 3200 training, 800 test samples

🤖 Training Logistic Regression model...
✅ Model training completed!

📈 Evaluating model performance...
✅ Logistic Regression Accuracy (No HTTPS): 0.9263

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94       500
           1       0.94      0.86      0.90       300

    accuracy                           0.93       800
   macro avg       0.93      0.91      0.92       800
weighted avg       0.93      0.93      0.93       800


🔎 Feature Importance:
              feature  coefficient
10      domain_length     4.229212
12           has_http     2.956892
5          num_params     0

In [1]:
# ============================================================================
# MODEL 3: IMPROVED LOGISTIC REGRESSION WITH BALANCED TRAINING DATA
# ============================================================================
# This cell addresses the fundamental issue: the model is biased against legitimate domains
# Improvements:
# - Better feature engineering with domain reputation features
# - More balanced approach to suspicious keyword detection
# - Enhanced feature selection to reduce false positives
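# - Class re-weighting (class_weight='balanced') with stronger regularization (C=0.1);
#   note that no training data augmentation is performed in this cell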
# ============================================================================

import pandas as pd
import urllib.parse
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re

# Announce the start of the Model 3 run.
print("🚀 Starting Model 3: Improved Logistic Regression Training Pipeline")
print("=" * 70)

# ========== Step 1: Load dataset ==========
print("📂 Loading dataset...")
file_path = "features_10000.csv"
df = pd.read_csv(file_path)

# Normalise the header (trim whitespace, lower-case) and rename a "type"
# column, if present, to "label".
df.columns = df.columns.str.strip().str.lower()
df = df.rename(columns={"type": "label"})

# Fixed-size sample: the first 2500 benign and the first 1500 malicious rows
# in file order, with labels compared case-insensitively. Note this is a
# 2500 / 1500 split, not an even one, and fewer rows are kept if the file
# does not contain that many of a class.
benign_df = df.loc[df["label"].str.lower().eq("benign")].iloc[:2500]
malicious_df = df.loc[df["label"].str.lower().eq("malicious")].iloc[:1500]
sample_df = pd.concat([benign_df, malicious_df], ignore_index=True)
print(f"✅ Dataset loaded: {len(sample_df)} samples ({len(benign_df)} benign, {len(malicious_df)} malicious)")

# ========== Step 2: Enhanced Feature Engineering ==========
print("\n🔧 Extracting enhanced features...")
def extract_enhanced_features(url):
    """Turn a URL string into the 21 numeric features used by Model 3.

    Args:
        url: The raw URL text.

    Returns:
        A list of 21 ints, in this order: url_length, num_dots, num_hyphens,
        num_at, num_digits, num_params, num_slashes, num_question,
        num_percent, num_special, domain_length, path_length, has_http,
        has_phishing_kw, has_marketing_kw, has_suspicious_tld,
        has_legitimate_tld, domain_looks_established, is_url_shortener,
        num_subdomains, is_ip_address. All ``has_*`` / ``is_*`` /
        ``domain_looks_established`` entries are 0/1.

    Raises:
        TypeError: If ``url`` is not a ``str`` (for example a NaN coming
            from a CSV row with a missing URL).
    """
    if not isinstance(url, str):
        raise TypeError(f"url must be a str, got {type(url).__name__}")

    try:
        parsed = urllib.parse.urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. an unbalanced IPv6
        # bracket). Fall back to the purely string-based features.
        parsed = None

    # NOTE: urlparse only recognises a network location introduced by "//".
    # For scheme-less input such as "example.com/login" the hostname is
    # therefore empty and every hostname-based feature below sees "".
    hostname = (parsed.hostname if parsed is not None else None) or ""
    path = parsed.path if parsed is not None else ""
    scheme = parsed.scheme if parsed is not None else ""

    # Basic URL features
    url_length = len(url)
    num_dots = url.count(".")
    num_hyphens = url.count("-")
    num_at = url.count("@")
    num_digits = sum(c.isdigit() for c in url)
    num_params = url.count("=")
    num_slashes = url.count("/")
    num_question = url.count("?")
    num_percent = url.count("%")
    num_special = sum(c in ";_?=&" for c in url)
    domain_length = len(hostname)
    path_length = len(path)
    has_http = 1 if scheme == "http" else 0

    # Keyword flags, split into two categories. Lower-case once instead of
    # once per keyword.
    lowered = url.lower()
    phishing_keywords = ("login", "secure", "verify", "account", "bank", "paypal", "update")
    marketing_keywords = ("free", "win", "prize", "offer", "deal")
    has_phishing_kw = 1 if any(kw in lowered for kw in phishing_keywords) else 0
    has_marketing_kw = 1 if any(kw in lowered for kw in marketing_keywords) else 0

    # Hostname labels; "".split('.') is [''], so there is always a first label.
    domain_parts = hostname.split('.')
    # NOTE(review): this is the FIRST label, so for "www.example.com" it is
    # "www", not "example" — confirm whether the registrable domain was meant.
    main_domain = domain_parts[0]

    # TLD checks (str.endswith accepts a tuple of suffixes)
    suspicious_tlds = ('.tk', '.ml', '.ga', '.cf', '.pw')
    has_suspicious_tld = 1 if hostname.endswith(suspicious_tlds) else 0
    legitimate_tlds = ('.com', '.org', '.net', '.edu', '.gov', '.mil')
    has_legitimate_tld = 1 if hostname.endswith(legitimate_tlds) else 0

    # Simple structural heuristic: first label has >= 4 chars, is not purely
    # digits, and the hostname has at least two labels.
    domain_looks_established = 1 if (len(main_domain) >= 4 and
                                     not re.match(r'^[0-9]+$', main_domain) and
                                     len(domain_parts) >= 2) else 0

    # URL shortener detection: match the shortener host itself or one of its
    # subdomains. A plain substring test would flag every host ending in
    # "t.com" (e.g. "microsoft.com") because it contains "t.co".
    url_shorteners = ('bit.ly', 'tinyurl.com', 'ow.ly', 't.co', 'goo.gl')
    is_url_shortener = 1 if any(
        hostname == shortener or hostname.endswith('.' + shortener)
        for shortener in url_shorteners
    ) else 0

    # Number of labels beyond the last two
    num_subdomains = len(domain_parts) - 2 if len(domain_parts) > 2 else 0

    # Dotted-quad detection; fullmatch so that a host merely STARTING with
    # four numeric labels (e.g. "1.2.3.4.example.com") is not counted.
    is_ip_address = 1 if re.fullmatch(r'\d+\.\d+\.\d+\.\d+', hostname) else 0

    return [
        url_length, num_dots, num_hyphens, num_at, num_digits,
        num_params, num_slashes, num_question, num_percent, num_special,
        domain_length, path_length, has_http,
        has_phishing_kw, has_marketing_kw,  # Split suspicious keywords
        has_suspicious_tld, has_legitimate_tld, domain_looks_established,
        is_url_shortener, num_subdomains, is_ip_address
    ]

# Apply enhanced feature extraction: one list of feature values per URL.
features = sample_df["url"].apply(extract_enhanced_features)
# Column names, in the same order as the list returned by
# extract_enhanced_features.
feature_names = [
    "url_length", "num_dots", "num_hyphens", "num_at", "num_digits",
    "num_params", "num_slashes", "num_question", "num_percent", "num_special",
    "domain_length", "path_length", "has_http",
    "has_phishing_kw", "has_marketing_kw",
    "has_suspicious_tld", "has_legitimate_tld", "domain_looks_established",
    "is_url_shortener", "num_subdomains", "is_ip_address"
]
features_df = pd.DataFrame(features.tolist(), columns=feature_names)

# Merge with URL + label (indices are reset so the pieces align by position)
final_df = pd.concat([sample_df["url"].reset_index(drop=True),
                      features_df,
                      sample_df["label"].reset_index(drop=True)], axis=1)
print(f"✅ Features extracted: {len(feature_names)} enhanced features")

# ========== Step 3: Data Preparation with Class Balancing ==========
# (No resampling happens here; the class re-weighting is applied by the
# estimator in Step 4.)
print("\n📊 Preparing training data with enhanced balancing...")
X = final_df.drop(columns=["url", "label"])
# Single definition of the class encoding; it is also pickled in Step 6.
label_mapping = {"benign": 0, "malicious": 1}
# Lower-case before mapping: the sampling step selects rows by comparing
# labels case-insensitively, so a row labelled e.g. "Benign" would otherwise
# map to NaN and break the stratified split.
y = final_df["label"].str.lower().map(label_mapping)

# Train/test split (80/20, stratified on the label, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale numeric features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(f"✅ Data prepared: {len(X_train)} training, {len(X_test)} test samples")

# ========== Step 4: Train Improved Logistic Regression ==========
print("\n🤖 Training improved Logistic Regression model...")
# Use class_weight='balanced' to handle class imbalance better; C=0.1 applies
# stronger regularization than the estimator's default
model_v3 = LogisticRegression(max_iter=1000, class_weight='balanced', C=0.1)
model_v3.fit(X_train_scaled, y_train)
print("✅ Model training completed!")

# ========== Step 5: Comprehensive Evaluation ==========
print("\n📈 Evaluating model performance...")
y_pred = model_v3.predict(X_test_scaled)
# Class probabilities are computed but not used further in this cell.
y_pred_proba = model_v3.predict_proba(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ Improved Logistic Regression Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance analysis: rank by coefficient magnitude so strongly
# negative features are shown alongside strongly positive ones
feature_importance = pd.DataFrame({
    "feature": X.columns,
    "coefficient": model_v3.coef_[0],
    "abs_coefficient": np.abs(model_v3.coef_[0])
}).sort_values(by="abs_coefficient", ascending=False)

print("\n🔎 Feature Importance (Top 10):")
print(feature_importance.head(10))

# ========== Step 6: Save Enhanced Models ==========
print("\n💾 Saving enhanced models...")

# Save the trained logistic regression model (v3)
with open('logistic_model_v3.pkl', 'wb') as f:
    pickle.dump(model_v3, f)

# Save the scaler (v3)
with open('scaler_v3.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Save feature names for reference
with open('feature_names_v3.pkl', 'wb') as f:
    pickle.dump(feature_names, f)

# Save the label encoder mapping (for reference)
with open('label_encoder_v3.pkl', 'wb') as f:
    pickle.dump(label_mapping, f)

print("✅ Enhanced models saved successfully!")
print("- logistic_model_v3.pkl: Enhanced logistic regression model")
print("- scaler_v3.pkl: Feature scaler")
print("- feature_names_v3.pkl: Feature names for reference")
print("- label_encoder_v3.pkl: Label mapping")

# Display model summary
print("\n📊 Model 3 Summary:")
print("=" * 50)
print("Model Type: Enhanced Logistic Regression")
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Features: {len(feature_names)} (enhanced feature set)")
print(f"Test Accuracy: {accuracy:.4f}")
print("Class weighting: Balanced")
print("Regularization: C=0.1 (stronger regularization)")
print("=" * 50)
print("🎉 Model 3 training pipeline completed!")


🚀 Starting Model 3: Improved Logistic Regression Training Pipeline
📂 Loading dataset...
✅ Dataset loaded: 4000 samples (2500 benign, 1500 malicious)

🔧 Extracting enhanced features...
✅ Features extracted: 21 enhanced features

📊 Preparing training data with enhanced balancing...
✅ Data prepared: 3200 training, 800 test samples

🤖 Training improved Logistic Regression model...
✅ Model training completed!

📈 Evaluating model performance...
✅ Improved Logistic Regression Accuracy: 0.9463

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       500
           1       0.95      0.90      0.93       300

    accuracy                           0.95       800
   macro avg       0.95      0.94      0.94       800
weighted avg       0.95      0.95      0.95       800


🔎 Feature Importance (Top 10):
                     feature  coefficient  abs_coefficient
12                  has_http     2.841601         2.841601
10      