In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from joblib import dump, load
import tldextract

1. Load Dataset

In [16]:
# Read the labelled URL dataset (swap the path for your own CSV if needed)
# and preview its first rows.
df = pd.read_csv('phishing_site_urls_updated.csv')
print("Original Dataset:", df.head(), sep="\n")

Original Dataset:
                                                 URL  Label
0  nobell.it/70ffb52d079109dca5664cce6f317373782/...      0
1  www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...      0
2  serviciosbys.com/paypal.cgi.bin.get-into.herf....      0
3  mail.printakid.com/www.online.americanexpress....      0
4  thewhiskeydregs.com/wp-content/themes/widescre...      0


2. Extract Suffixes (TLDs) from URLs

In [18]:
def extract_suffix(url):
    """Return only the suffix part of `url` as parsed by tldextract.

    Examples of suffixes are "com" or "co.uk"; the subdomain and domain
    parts of the parse result are discarded.
    """
    return tldextract.extract(url).suffix

# Add a 'suffix' column holding the suffix of every URL.
# The source column is called "URL" here; adjust if your data uses "url".
df['suffix'] = df['URL'].apply(extract_suffix)

# Frequency of each suffix, most common first
suffix_counts = df['suffix'].value_counts()

print("\nFull Suffix Distribution:", suffix_counts, sep="\n")


Full Suffix Distribution:
suffix
com          362065
org           48771
net           23546
edu           10086
ca             9558
              ...  
world             1
gov.pg            1
pr.gov.br         1
kr.ua             1
ky                1
Name: count, Length: 819, dtype: int64


In [20]:
# Reuse `extract_suffix` and the `df['suffix']` column from the previous cell
# rather than redefining the function and re-running tldextract over every URL.
# The guard only recomputes the column if it is genuinely missing.
if 'suffix' not in df.columns:
    df['suffix'] = df['URL'].apply(extract_suffix)

# 1. Suffix frequency table, most common first, with a 1-based rank column.
#    rename_axis/reset_index(name=...) yields the columns ['suffix', 'count']
#    regardless of how the installed pandas names the value_counts() result.
suffix_counts = (
    df['suffix']
    .value_counts()
    .rename_axis('suffix')
    .reset_index(name='count')
)
suffix_counts.insert(0, 'index', range(1, 1 + len(suffix_counts)))  # Add index column

# Save suffix counts to CSV
suffix_counts.to_csv("suffix_counts.csv", index=False)
print("Saved suffix counts to suffix_counts.csv")

# 2. One row per URL with its suffix and original label.
#    The row position is written out as "url_id", preserving dataset order.
suffix_labels = df[['suffix', 'Label']].reset_index(drop=True)
suffix_labels.to_csv("suffix_labels.csv", index_label="url_id")
print("Saved suffixes with labels to suffix_labels.csv")

# Display counts
print("\nFull Suffix Distribution:")
print(suffix_counts.to_string(index=False))

Saved suffix counts to suffix_counts.csv
Saved suffixes with labels to suffix_labels.csv

Full Suffix Distribution:
 index             suffix  count
     1                com 362065
     2                org  48771
     3                net  23546
     4                edu  10086
     5                 ca   9558
     6                      8349
     7              co.uk   6978
     8                 ru   6197
     9               info   4432
    10             com.br   3798
    11             com.au   3258
    12                 de   3222
    13                gov   3038
    14                 pl   2426
    15                biz   2385
    16                 it   2089
    17                 in   1754
    18                 fr   1730
    19                 nl   1467
    20                 us   1297
    21                 ro   1279
    22                 cn   1254
    23                 cl   1194
    24                 eu   1156
    25                 mx   1121
    26                 tk 

In [30]:
# Load the per-suffix report used by the feature-engineering step.
# NOTE(review): the cells above write suffix_counts.csv and suffix_labels.csv,
# not suffix_report.csv - presumably this file is prepared separately; confirm.
suffix_report = pd.read_csv('suffix_report.csv')
print("Original Dataset:", suffix_report.head(), sep="\n")

Original Dataset:
   Index Suffix   Count  Label
0      1    com  362065      1
1      2    org   48771      1
2      3    net   23546      1
3      4    edu   10086      1
4      5     ca    9558      1


3. Feature Engineering

In [32]:
def extract_features(url):
    """Build the numeric feature dictionary for a single URL.

    Reads the module-level `suffix_report` DataFrame (columns 'Suffix',
    'Count' and 'Label') to look up statistics for the URL's suffix.

    Args:
        url: The URL string to featurise.

    Returns:
        dict with the keys, in this order: 'domain_length',
        'subdomain_count', 'has_https', 'num_hyphens', 'num_digits',
        'suffix_freq', 'suffix_label'. The two suffix features fall back
        to 0 when the suffix has no row in `suffix_report`.
    """
    extracted = tldextract.extract(url)

    # Get suffix stats from pre-generated report
    suffix_data = suffix_report[suffix_report['Suffix'] == extracted.suffix]
    has_suffix_stats = not suffix_data.empty

    # ''.split('.') returns [''], so count only non-empty labels; otherwise a
    # URL without any subdomain would be reported as having one.
    subdomain_labels = [label for label in extracted.subdomain.split('.') if label]

    return {
        # Lexical features of the parsed URL
        'domain_length': len(extracted.domain),
        'subdomain_count': len(subdomain_labels),
        # Require the full scheme prefix so that scheme-less hosts that merely
        # begin with the letters "https" are not flagged as HTTPS.
        'has_https': 1 if url.lower().startswith('https://') else 0,
        'num_hyphens': extracted.domain.count('-'),
        'num_digits': sum(c.isdigit() for c in extracted.domain),

        # Features from the suffix report (first matching row)
        'suffix_freq': suffix_data['Count'].values[0] if has_suffix_stats else 0,
        'suffix_label': suffix_data['Label'].values[0] if has_suffix_stats else 0
    }

# 2. Create feature matrix
# Collect one feature dict per URL and build the DataFrame once; constructing
# a pd.Series per row inside .apply is far slower on a dataset of this size.
# Passing index=df.index keeps the rows aligned with df, as .apply did.
features = pd.DataFrame(
    [extract_features(url) for url in df['URL']],
    index=df.index,
)
X = features.values
y = df['Label'].values


4. Split and Scale Data

In [33]:

# 3. Hold out 20% of the rows for testing (seeded split), then standardise.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The scaler is fitted on the training rows only and then applied to both
# partitions, so the test rows do not influence the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

5. Train SVM Model

In [None]:
# 4. Enhanced SVM Model
clf = svm.SVC(
    kernel='rbf',
    class_weight='balanced',  # Re-weights the classes in y_train by inverse frequency
    probability=True,         # Needed for predict_proba in the prediction step
    gamma='auto',
    random_state=42           # probability=True shuffles data internally; seed it for reproducible runs
)
clf.fit(X_train, y_train)

6. Save Model and Scaler

In [None]:
# Bundle everything needed at prediction time into one dict and persist it
# as a single joblib file.
pipeline_artifacts = {
    'model': clf,
    'scaler': scaler,
    'suffix_report': suffix_report,
}
dump(pipeline_artifacts, 'phishing_detector_pipeline.joblib')

 7. Evaluation

In [None]:
# Score the held-out rows and print the per-class metrics.
y_pred = clf.predict(X_test)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

8. Load and Test with New URLs

In [None]:
def predict_phishing(url, pipeline_path='phishing_detector_pipeline.joblib'):
    """Classify a single URL with the saved model pipeline.

    Args:
        url: The URL string to classify.
        pipeline_path: Path of the joblib file holding a dict with the keys
            'model' and 'scaler'. Defaults to the file name used when the
            pipeline is dumped earlier in this notebook.

    Returns:
        A tuple `(verdict, prob)`: `prob` is the second column of
        `predict_proba` for the URL, and `verdict` is "Phishing" when
        `prob > 0.5`, otherwise "Legitimate".
    """
    # Load saved components. The notebook persists one dict in a single file,
    # so read the model and scaler from it rather than from separate files.
    # NOTE: joblib files are pickles - only load files from a trusted source.
    pipeline = load(pipeline_path)
    clf = pipeline['model']
    scaler = pipeline['scaler']

    # Extract features as a 1 x n_features row and scale them like the training data
    features = pd.Series(extract_features(url)).values.reshape(1, -1)
    scaled_features = scaler.transform(features)

    # Predict
    # NOTE(review): column 1 is the probability of the second class in
    # clf.classes_ (label 1 for 0/1 labels). The dataset preview shows
    # phishing-looking URLs with Label 0 - confirm that label 1 really
    # means "phishing" before trusting this verdict.
    prob = clf.predict_proba(scaled_features)[0][1]
    return "Phishing" if prob > 0.5 else "Legitimate", prob

# Test with sample URLs
test_urls = [
    "https://www.paypal-login.scam/verify",
    "https://www.google.com/search",
    "http://secure-bank-update.com"
]

print("\nTest Predictions:")
for url in test_urls:
    # Build the display name from the parsed domain and suffix, skipping
    # whichever part tldextract returns empty.
    parts = tldextract.extract(url)
    domain = ".".join(part for part in (parts.domain, parts.suffix) if part)
    pred, prob = predict_phishing(url)
    print(f"URL: {domain.ljust(30)} | Prediction: {pred.ljust(10)} | Probability: {prob:.4f}")