In [None]:
import pandas as pd
import re
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Read the raw dataset from disk
df = pd.read_csv("phishing_url.csv")

# Print the column index so the schema can be inspected
print(df.columns)

# The rest of the script reads the URL text from a 'url' column and the
# target from a 'Label' column. When there is no 'url' column, the first
# column is renamed to 'url'.
if 'url' not in df.columns:
    first_column = df.columns[0]
    df = df.rename(columns={first_column: 'url'})

# Keep only the rows in which both the URL and the label are present
df = df.dropna(subset=['url', 'Label'])

# -------- Feature Extraction Function --------
def extract_url_features(url):
    """Derive simple lexical features from a URL string.

    Args:
        url: The URL as a string. A leading ``scheme://`` is optional, so
            bare domains and paths such as ``"example.com/a"`` are accepted.

    Returns:
        A dict of integer features, in this key order:
            url_length: number of characters in ``url``.
            num_dots: number of ``.`` characters.
            https_flag: 1 if the parsed scheme is ``https``, else 0.
            num_subdirs: number of ``/`` characters after the optional
                ``scheme://`` (or ``//``) prefix; never negative.
            num_special_chars: total count of ``@ - _ ? = &``.
            have_ip: 1 if a dotted-quad-looking run of digits occurs
                anywhere in ``url``, else 0.
    """
    parsed = urlparse(url)

    # Count slashes only after the optional "scheme://" (or protocol-relative
    # "//") prefix. The earlier ``url.count('/') - 2`` assumed that prefix was
    # always present and went negative for bare domains like "example.com".
    # For URLs that do carry the prefix the result is unchanged.
    prefix = re.match(r'(?:[A-Za-z][A-Za-z0-9+.\-]*:)?//', url)
    remainder = url[prefix.end():] if prefix else url

    features = {
        'url_length': len(url),
        'num_dots': url.count('.'),
        'https_flag': 1 if parsed.scheme == 'https' else 0,
        'num_subdirs': remainder.count('/'),
        'num_special_chars': sum(url.count(c) for c in ['@', '-', '_', '?', '=', '&']),
        'have_ip': 1 if re.search(r'\d{1,3}(\.\d{1,3}){3}', url) else 0
    }
    return features

# Apply feature extraction.
# Build the frame in one step from the list of per-URL dicts. Expanding each
# row with ``.apply(pd.Series)`` constructs one Series per row, which is much
# slower and yields the same columns. The original index is kept so the
# features stay aligned with ``df['Label']``.
feature_rows = df['url'].map(extract_url_features).tolist()
features_df = pd.DataFrame(feature_rows, index=df.index)

# Features and target
X = features_df
y = df['Label']

# -------- Train-Test Split --------
# Stratify on the label so the train and test partitions keep the same class
# proportions as the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------- Decision Tree Model --------
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# -------- Evaluation --------
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
import joblib

# Save the model
joblib.dump(model, "decision_tree_phishing_model.pkl")

# Save the feature column order so the Streamlit app can build its input
# in the same order the model was trained on
feature_list = list(X.columns)
joblib.dump(feature_list, "feature_list.pkl")


Index(['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth',
       'Redirection', 'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record',
       'Web_Traffic', 'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over',
       'Right_Click', 'Web_Forwards', 'Label'],
      dtype='object')
Confusion Matrix:
[[2790   58]
 [ 450  562]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.98      0.92      2848
           1       0.91      0.56      0.69      1012

    accuracy                           0.87      3860
   macro avg       0.88      0.77      0.80      3860
weighted avg       0.87      0.87      0.86      3860



['feature_list.pkl']