# Phishing URL detection

In [33]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import re
from urllib.parse import urlparse, parse_qs
import time

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import joblib

## Dataset

In [7]:
# Load the phishing-URL dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('datasets/phishing_url_dataset/dataset_phishing.csv')
# Preview the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


In [11]:
# Hand-picked list of URL-based column names.
# NOTE(review): no later cell visible in this notebook reads this list (the model is
# trained on `selected_features`), and the name `features` is reused as a local
# variable inside extract_url_features — confirm whether this cell is still needed.
# NOTE(review): spellings such as 'nb_semicolumn' and 'suspecious_tld' presumably
# mirror the CSV headers — verify against the dataset before "correcting" them.
features = [
    'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq',
    'nb_underscore', 'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma', 'nb_semicolumn',
    'nb_dollar', 'nb_space', 'nb_www', 'nb_com', 'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url',
    'ratio_digits_host', 'punycode', 'shortening_service', 'path_extension', 'phish_hints', 'domain_in_brand',
    'brand_in_subdomain', 'brand_in_path', 'suspecious_tld'
]

In [12]:
# Target feature mapping: encode the label as 1 = phishing, 0 = legitimate.
# The dtype guard makes this cell safe to re-run: once the column is numeric,
# pushing it through the string-keyed dict again would replace every label with NaN.
if not pd.api.types.is_numeric_dtype(df['status']):
    df['status'] = df['status'].map({'phishing': 1, 'legitimate': 0})

In [13]:
# Count rows per encoded label to check the class balance.
df['status'].value_counts()

status
0    5715
1    5715
Name: count, dtype: int64

In [14]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
count,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,...,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0
mean,61.126684,21.090289,0.150569,2.480752,0.99755,0.022222,0.141207,0.162292,0.0,0.293176,...,0.775853,0.439545,0.072878,492.532196,4062.543745,856756.6,0.020122,0.533946,3.185739,0.5
std,55.297318,10.777171,0.357644,1.369686,2.087087,0.1555,0.364456,0.821337,0.0,0.998317,...,0.417038,0.496353,0.259948,814.769415,3107.7846,1995606.0,0.140425,0.498868,2.536955,0.500022
min,12.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.0,-12.0,0.0,0.0,0.0,0.0,0.0
25%,33.0,15.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,84.0,972.25,0.0,0.0,0.0,1.0,0.0
50%,47.0,19.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,242.0,3993.0,1651.0,0.0,1.0,3.0,0.5
75%,71.0,24.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,449.0,7026.75,373845.5,0.0,1.0,5.0,1.0
max,1641.0,214.0,1.0,24.0,43.0,4.0,3.0,19.0,0.0,19.0,...,1.0,1.0,1.0,29829.0,12874.0,10767990.0,1.0,1.0,10.0,1.0


In [15]:
# Keep only float64/int64 columns (this drops the textual `url` column) and
# compute their pairwise correlation matrix.
numerical_df = df.select_dtypes(include=['float64', 'int64'])
corr_matrix= numerical_df.corr()

In [16]:
# Correlation of every numeric column with the target (the target itself is included).
status_corr = corr_matrix['status']
# One entry per numeric column.
status_corr.shape

(88,)

In [21]:
def featureSelectorCorrelation(cmatrix, threshold):
    '''
    Keep the entries of a labelled correlation vector whose magnitude exceeds a threshold.

    Parameters
    ----------
    cmatrix : object that iterates over scores and exposes the matching labels
        positionally through ``cmatrix.index`` (e.g. one column of a pandas
        correlation matrix).
    threshold : number
        An entry is kept only when ``abs(score) > threshold`` (strict comparison,
        so NaN scores are never kept).

    Returns
    -------
    list of (label, [formatted_score]) tuples in the original order. The score
    is rendered with the '{:3f}' format spec (minimum width 3, default six
    decimals) and wrapped in a one-element list.
    '''
    return [
        (label, ['{:3f}'.format(value)])
        for label, value in zip(cmatrix.index, cmatrix)
        if abs(value) > threshold
    ]

In [22]:
# Keep columns whose absolute correlation with `status` is above 0.2.
# The result still contains `status` itself (correlation 1.0); it is filtered out later.
features_selected = featureSelectorCorrelation(status_corr, 0.2)
features_selected

[('length_url', ['0.248580']),
 ('length_hostname', ['0.238322']),
 ('ip', ['0.321698']),
 ('nb_dots', ['0.207029']),
 ('nb_qm', ['0.294319']),
 ('nb_eq', ['0.233386']),
 ('nb_slash', ['0.242270']),
 ('nb_www', ['-0.443468']),
 ('ratio_digits_url', ['0.356395']),
 ('ratio_digits_host', ['0.224335']),
 ('tld_in_subdomain', ['0.208884']),
 ('prefix_suffix', ['0.214681']),
 ('shortest_word_host', ['0.223084']),
 ('longest_words_raw', ['0.200147']),
 ('longest_word_path', ['0.212709']),
 ('phish_hints', ['0.335393']),
 ('nb_hyperlinks', ['-0.342628']),
 ('ratio_intHyperlinks', ['-0.243982']),
 ('empty_title', ['0.207043']),
 ('domain_in_title', ['0.342807']),
 ('domain_age', ['-0.331889']),
 ('google_index', ['0.731171']),
 ('page_rank', ['-0.511137']),
 ('status', ['1.000000'])]

In [23]:
# Column names that passed the correlation filter, excluding the target itself.
selected_features = [name for name, _ in features_selected if name != 'status']

In [24]:
# Display the final list of model input columns.
selected_features

['length_url',
 'length_hostname',
 'ip',
 'nb_dots',
 'nb_qm',
 'nb_eq',
 'nb_slash',
 'nb_www',
 'ratio_digits_url',
 'ratio_digits_host',
 'tld_in_subdomain',
 'prefix_suffix',
 'shortest_word_host',
 'longest_words_raw',
 'longest_word_path',
 'phish_hints',
 'nb_hyperlinks',
 'ratio_intHyperlinks',
 'empty_title',
 'domain_in_title',
 'domain_age',
 'google_index',
 'page_rank']

## Train Test Split, Feature Scaling

In [26]:
# Feature matrix: only the correlation-selected columns, in `selected_features` order.
X = df[selected_features]
# Target vector: the encoded `status` label.
y = df['status']

In [27]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [28]:
# Fit the scaler on the training split only, then apply the same transform to the
# test split so no test-set statistics leak into the scaling parameters.
# NOTE(review): the training cell visible in this notebook fits on the unscaled
# X_train, so these scaled arrays appear unused — confirm whether they are needed.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model Training

In [30]:
# Train the classifier on the raw (unscaled) training features; tree ensembles do not
# depend on feature scale, and fitting on raw values means the persisted model can be
# used without a separately saved scaler.
# random_state is pinned (same seed as the train/test split) so that a fresh
# Restart & Run All reproduces the same forest and therefore the same saved model.
model = RandomForestClassifier(max_depth=20, n_estimators=100, random_state=42)
# perf_counter is monotonic, so the elapsed time cannot be skewed by wall-clock adjustments.
t0 = time.perf_counter()
model.fit(X_train, y_train)
print(f'Model Training took {time.perf_counter()-t0} seconds')

Model Training took 0.5434350967407227 seconds


In [32]:
# Persist the fitted model to the working directory. Only the estimator is written;
# the StandardScaler fitted earlier is not saved by this cell.
joblib.dump(model, 'phishing_url_model.joblib')

['phishing_url_model.joblib']

## Model Testing

### Feature Extraction Function

In [None]:

def extract_url_features(url):
    """Build the per-URL feature dictionary from the URL string alone.

    The keys are emitted in a fixed order, one per model input column. Features
    that cannot be derived from the URL text are left as ``None`` placeholders
    (``phish_hints`` is a constant ``0`` placeholder), so the result is not yet a
    complete model input.

    Parameters
    ----------
    url : str
        The URL to analyse. An empty string is accepted.

    Returns
    -------
    dict
        Mapping of feature name to its value (int, float, or ``None``).

    Notes
    -----
    URLs whose path has no non-empty segment (e.g. ``http://example.com/``) or
    whose hostname has no non-empty label now yield ``0`` for the word-length
    features instead of raising ``ValueError`` from ``max()``/``min()`` on an
    empty sequence.
    """
    parsed_url = urlparse(url)
    hostname = parsed_url.hostname or ''
    path = parsed_url.path

    # Word lengths; empty segments (leading/trailing/double separators) are ignored.
    host_word_lengths = [len(word) for word in hostname.split('.') if word]
    path_word_lengths = [len(word) for word in path.split('/') if word]
    # default=0 covers inputs with no usable segments, such as a bare "/" path.
    shortest_host_word = min(host_word_lengths, default=0)
    longest_path_word = max(path_word_lengths, default=0)

    # Extracting features
    features = {
        'length_url': len(url),
        'length_hostname': len(hostname),
        'ip': None,  # Placeholder for IP resolution (requires additional implementation)
        'nb_dots': url.count('.'),
        'nb_qm': url.count('?'),
        'nb_eq': url.count('='),
        'nb_slash': url.count('/'),
        'nb_www': url.count('www.'),
        'ratio_digits_url': sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0,
        'ratio_digits_host': sum(c.isdigit() for c in hostname) / len(hostname) if hostname else 0,
        # NOTE(review): this tests whether the hostname ENDS with a TLD-like suffix,
        # which looks different from "TLD inside the subdomain" — confirm against
        # the dataset's definition of this column.
        'tld_in_subdomain': 1 if hostname and re.search(r'\.[a-z]{2,}$', hostname) else 0,
        # NOTE(review): flags paths that start with '/' or end with '.html';
        # confirm this matches how the training column was computed.
        'prefix_suffix': 1 if (path.startswith('/') or path.endswith('.html')) else 0,
        'shortest_word_host': shortest_host_word,
        # NOTE(review): computed from the path only, identical to 'longest_word_path';
        # presumably the "raw" variant should consider the whole URL — verify.
        'longest_words_raw': longest_path_word,
        'longest_word_path': longest_path_word,
        'phish_hints': 0,  # Placeholder for phishing detection logic
        'nb_hyperlinks': None,  # Requires HTML parsing (additional implementation needed)
        'ratio_intHyperlinks': None,  # Requires HTML parsing (additional implementation needed)
        'empty_title': None,  # Requires HTML fetching (additional implementation needed)
        'domain_in_title': None,  # Requires HTML fetching (additional implementation needed)
        'domain_age': None,  # Requires domain registration lookup (additional implementation needed)
        'google_index': None,  # Requires Google search API check (additional implementation needed)
        'page_rank': None   # Requires external service/API to get page rank
    }

    return features