In [1]:
import pandas as pd
import urllib.parse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# ========== Step 1: Load dataset ==========
file_path = "features_10000.csv" 
df = pd.read_csv(file_path)

# Clean column names
df.columns = df.columns.str.strip().str.lower()
df = df.rename(columns={"type": "label"})

# Balance dataset: 2500 benign, 1500 malicious
benign_df = df[df["label"].str.lower() == "benign"].head(2500)
malicious_df = df[df["label"].str.lower() == "malicious"].head(1500)
sample_df = pd.concat([benign_df, malicious_df]).reset_index(drop=True)

# ========== Step 2: Feature Engineering ==========
def extract_features(url):
    try:
        parsed = urllib.parse.urlparse(url)
    except:
        parsed = None

    url_length = len(url)
    num_dots = url.count(".")
    num_hyphens = url.count("-")
    num_at = url.count("@")
    num_digits = sum(c.isdigit() for c in url)
    num_params = url.count("=")
    num_slashes = url.count("/")
    num_question = url.count("?")
    num_percent = url.count("%")
    num_special = sum(c in [';', '_', '?', '=', '&'] for c in url)

    hostname = parsed.hostname if parsed and parsed.hostname else ""
    domain_length = len(hostname)

    path_length = len(parsed.path) if parsed and parsed.path else 0
    has_https = 1 if parsed and parsed.scheme == "https" else 0
    has_http = 1 if parsed and parsed.scheme == "http" else 0

    keywords = ["login", "secure", "update", "free", "verify", "bank", "account", "paypal"]
    has_suspicious_kw = any(kw in url.lower() for kw in keywords)

    return [
        url_length, num_dots, num_hyphens, num_at, num_digits,
        num_params, num_slashes, num_question, num_percent, num_special,
        domain_length, path_length, has_https, has_http,
        has_suspicious_kw
    ]

# Apply feature extraction
features = sample_df["url"].apply(extract_features)
feature_names = [
    "url_length", "num_dots", "num_hyphens", "num_at", "num_digits",
    "num_params", "num_slashes", "num_question", "num_percent", "num_special",
    "domain_length", "path_length", "has_https", "has_http",
    "has_suspicious_kw"
]
features_df = pd.DataFrame(features.tolist(), columns=feature_names)

# Merge with URL + label
final_df = pd.concat([sample_df["url"].reset_index(drop=True),
                      features_df,
                      sample_df["label"].reset_index(drop=True)], axis=1)

# ========== Step 3: Prepare Data ==========
X = final_df.drop(columns=["url", "label"])
y = final_df["label"].map({"benign": 0, "malicious": 1})

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ========== Step 4: Train Logistic Regression ==========
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# ========== Step 5: Evaluate ==========
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print("✅ Logistic Regression Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Feature importance
feature_importance = pd.DataFrame({
    "feature": X.columns,
    "coefficient": model.coef_[0]
}).sort_values(by="coefficient", ascending=False)

print("\n🔎 Feature Importance:\n", feature_importance)


✅ Logistic Regression Accuracy: 0.9275

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.94       500
           1       0.94      0.86      0.90       300

    accuracy                           0.93       800
   macro avg       0.93      0.91      0.92       800
weighted avg       0.93      0.93      0.93       800


🔎 Feature Importance:
               feature  coefficient
10      domain_length     4.078839
13           has_http     3.099904
5          num_params     0.860129
1            num_dots     0.314823
14  has_suspicious_kw     0.266311
12          has_https     0.154435
8         num_percent    -0.083344
4          num_digits    -0.097778
3              num_at    -0.172047
11        path_length    -0.466237
7        num_question    -0.507050
0          url_length    -0.759001
2         num_hyphens    -0.975982
9         num_special    -1.287151
6         num_slashes    -2.278590


In [2]:
# ========== Step 6: Save Models ==========
import pickle

# Save the trained logistic regression model
with open('logistic_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the scaler
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Save the label encoder mapping (for reference)
label_mapping = {"benign": 0, "malicious": 1}
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_mapping, f)

print("✅ Models saved successfully!")
print("- logistic_model.pkl: Trained logistic regression model")
print("- scaler.pkl: Feature scaler")
print("- label_encoder.pkl: Label mapping")

# Display model summary
print(f"\n📊 Model Summary:")
print(f"- Training samples: {len(X_train)}")
print(f"- Test samples: {len(X_test)}")
print(f"- Features: {len(feature_names)}")
print(f"- Test Accuracy: {accuracy:.4f}")
print(f"- Benign precision: {0.92:.2f}")
print(f"- Malicious precision: {0.94:.2f}")

✅ Models saved successfully!
- logistic_model.pkl: Trained logistic regression model
- scaler.pkl: Feature scaler
- label_encoder.pkl: Label mapping

📊 Model Summary:
- Training samples: 3200
- Test samples: 800
- Features: 15
- Test Accuracy: 0.9275
- Benign precision: 0.92
- Malicious precision: 0.94
