# Phishing URL detection

In [59]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import re
from urllib.parse import urlparse, parse_qs, quote
import time
from datetime import datetime
import whois
import requests
from bs4 import BeautifulSoup

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import joblib

## Dataset

In [3]:
df = pd.read_csv('datasets/phishing_url_dataset/dataset_phishing.csv')
df.head()

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


In [39]:
print(df['empty_title'].value_counts())

empty_title
0    10004
1     1426
Name: count, dtype: int64


In [40]:
print(df['domain_in_title'].value_counts())

domain_in_title
1    8868
0    2562
Name: count, dtype: int64


In [4]:
features = [
    'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq',
    'nb_underscore', 'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma', 'nb_semicolumn',
    'nb_dollar', 'nb_space', 'nb_www', 'nb_com', 'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url',
    'ratio_digits_host', 'punycode', 'shortening_service', 'path_extension', 'phish_hints', 'domain_in_brand',
    'brand_in_subdomain', 'brand_in_path', 'suspecious_tld'
]

In [5]:
# Target feature mapping
df['status'] = df['status'].map({'phishing': 1, 'legitimate':0})

In [47]:
df['google_index'].value_counts()

google_index
1    6103
0    5327
Name: count, dtype: int64

In [7]:
df.describe()

Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
count,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,...,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0
mean,61.126684,21.090289,0.150569,2.480752,0.99755,0.022222,0.141207,0.162292,0.0,0.293176,...,0.775853,0.439545,0.072878,492.532196,4062.543745,856756.6,0.020122,0.533946,3.185739,0.5
std,55.297318,10.777171,0.357644,1.369686,2.087087,0.1555,0.364456,0.821337,0.0,0.998317,...,0.417038,0.496353,0.259948,814.769415,3107.7846,1995606.0,0.140425,0.498868,2.536955,0.500022
min,12.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.0,-12.0,0.0,0.0,0.0,0.0,0.0
25%,33.0,15.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,84.0,972.25,0.0,0.0,0.0,1.0,0.0
50%,47.0,19.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,242.0,3993.0,1651.0,0.0,1.0,3.0,0.5
75%,71.0,24.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,449.0,7026.75,373845.5,0.0,1.0,5.0,1.0
max,1641.0,214.0,1.0,24.0,43.0,4.0,3.0,19.0,0.0,19.0,...,1.0,1.0,1.0,29829.0,12874.0,10767990.0,1.0,1.0,10.0,1.0


In [8]:
numerical_df = df.select_dtypes(include=['float64', 'int64'])
corr_matrix= numerical_df.corr()

In [9]:
status_corr = corr_matrix['status']
status_corr.shape

(88,)

In [10]:
def featureSelectorCorrelation(cmatrix, threshold):
    '''
    A function for selecting features that are above a certain threshold value
    '''
    selected_features = []
    feature_score = []
    i=0
    for score in cmatrix:
        if abs(score)>threshold:
            selected_features.append(cmatrix.index[i])
            feature_score.append(['{:3f}'.format(score)])
        i+=1
    result = list(zip(selected_features, feature_score))
    return result

In [11]:
features_selected = featureSelectorCorrelation(status_corr, 0.2)
features_selected

[('length_url', ['0.248580']),
 ('length_hostname', ['0.238322']),
 ('ip', ['0.321698']),
 ('nb_dots', ['0.207029']),
 ('nb_qm', ['0.294319']),
 ('nb_eq', ['0.233386']),
 ('nb_slash', ['0.242270']),
 ('nb_www', ['-0.443468']),
 ('ratio_digits_url', ['0.356395']),
 ('ratio_digits_host', ['0.224335']),
 ('tld_in_subdomain', ['0.208884']),
 ('prefix_suffix', ['0.214681']),
 ('shortest_word_host', ['0.223084']),
 ('longest_words_raw', ['0.200147']),
 ('longest_word_path', ['0.212709']),
 ('phish_hints', ['0.335393']),
 ('nb_hyperlinks', ['-0.342628']),
 ('ratio_intHyperlinks', ['-0.243982']),
 ('empty_title', ['0.207043']),
 ('domain_in_title', ['0.342807']),
 ('domain_age', ['-0.331889']),
 ('google_index', ['0.731171']),
 ('page_rank', ['-0.511137']),
 ('status', ['1.000000'])]

In [12]:
selected_features = []
for feature, score in features_selected:
    if feature != 'status':
        selected_features.append(feature)

In [13]:
selected_features

['length_url',
 'length_hostname',
 'ip',
 'nb_dots',
 'nb_qm',
 'nb_eq',
 'nb_slash',
 'nb_www',
 'ratio_digits_url',
 'ratio_digits_host',
 'tld_in_subdomain',
 'prefix_suffix',
 'shortest_word_host',
 'longest_words_raw',
 'longest_word_path',
 'phish_hints',
 'nb_hyperlinks',
 'ratio_intHyperlinks',
 'empty_title',
 'domain_in_title',
 'domain_age',
 'google_index',
 'page_rank']

In [33]:
df['empty_title']

0        0
1        0
2        0
3        0
4        0
        ..
11425    0
11426    0
11427    0
11428    0
11429    0
Name: empty_title, Length: 11430, dtype: int64

## Train Test Split, Feature Scaling

In [14]:
X = df[selected_features]
y = df['status']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [16]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model Training

In [17]:
model = RandomForestClassifier(max_depth=20, n_estimators=100)
t0 = time.time()
model.fit(X_train, y_train)
print(f'Model Training took {time.time()-t0} seconds')

Model Training took 0.5384204387664795 seconds


In [18]:
joblib.dump(model, 'phishing_url_model.joblib')

['phishing_url_model.joblib']

## Model Testing

### Feature Extraction Function

In [27]:
def count_hyperlinks(url):
    try:
        # Step 2: Fetch the webpage
        response = requests.get(url)
        
        # Check for successful request
        if response.status_code != 200:
            print(f"Failed to retrieve webpage: Status code {response.status_code}")
            return 0
        
        # Step 3: Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Step 4: Extract all <a> tags
        links = soup.find_all('a')
        
        # Step 5: Return the count of hyperlinks
        return len(links)
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return 0

In [30]:
def link_ratio(url):
    try:
        # Fetch the webpage
        response = requests.get(url)
        
        # Check for successful request
        if response.status_code != 200:
            print(f"Failed to retrieve webpage: Status code {response.status_code}")
            return None
        
        # Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract all <a> tags
        links = soup.find_all('a')
        
        # Initialize counters
        internal_count = 0
        external_count = 0
        
        # Get the base domain of the URL
        base_domain = urlparse(url).netloc
        
        # Classify links as internal or external
        for link in links:
            href = link.get('href')
            if href:
                parsed_href = urlparse(href)
                # Check if it's an internal link
                if parsed_href.netloc == base_domain or not parsed_href.netloc:
                    internal_count += 1
                else:
                    external_count += 1
        
        # Calculate ratio
        total_links = internal_count + external_count
        if total_links == 0:
            return (0, 0)  # Avoid division by zero
        
        ratio = internal_count / external_count if external_count > 0 else float('inf')
        
        return ratio
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [36]:
def has_empty_title(url):
    try:
        # Step 2: Fetch the webpage
        response = requests.get(url)
        
        # Check for successful request
        if response.status_code != 200:
            print(f"Failed to retrieve webpage: Status code {response.status_code}")
            return False
        
        # Step 3: Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Step 4: Extract title
        title = soup.title.string if soup.title else None
        
        # Check if title is empty or None
        if title is None or title.strip() == "":
            return 1  # Title is empty
        
        return 0  # Title is not empty
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return False

In [41]:
def title_contains_domain(url):
    try:
        # Step 1: Fetch the webpage
        response = requests.get(url)
        
        # Check for successful request
        if response.status_code != 200:
            print(f"Failed to retrieve webpage: Status code {response.status_code}")
            return False
        
        # Step 2: Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract title
        title = soup.title.string if soup.title else ""
        
        # Step 3: Get the domain from the URL
        domain = urlparse(url).netloc
        
        # Check if the domain is in the title (case insensitive)
        if domain.lower() in title.lower():
            return 0
        
        return 1
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return False

In [45]:
def get_domain_age(url):
    try:
        # Step 2: Extract domain from URL
        domain_name = urlparse(url).netloc
        
        # Step 3: Query WHOIS information for the domain
        domain_info = whois.whois(domain_name)
        
        # Extract creation date
        creation_date = domain_info.creation_date
        
        # Handle cases where creation_date is a list (some domains may have multiple creation dates)
        if isinstance(creation_date, list):
            creation_date = creation_date[0]  # Take the first date if multiple
        
        # Step 4: Calculate age of the domain
        if isinstance(creation_date, datetime):
            today = datetime.now()
            age = today - creation_date
            return age.days  # Return age in days
        else:
            print("Creation date is not valid.")
            return None
            
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [48]:
def check_indexing_with_site_command(url):
    query = f"site:{url}"
    response = requests.get(f"https://www.google.com/search?q={quote(query)}")
    
    # Check if the response contains any results
    if "did not match any documents" in response.text:
        return 0  # Not indexed
    else:
        return 1  # Indexed

In [50]:
api_key = 'wgk8sgo0ckkcgw84kw8w4w0w4gwc00cgk08kk4s0'

In [61]:
def get_page_rank(url, api_key):
    # Extract the domain from the URL
    domain = url.split("//")[-1].split("/")[0]  # Get domain part
    
    # Define the API endpoint
    api_url = "https://openpagerank.com/api/v1.0/getPageRank"
    
    # Set up headers with your API key
    headers = {
        "API-OPR": api_key
    }
    
    # Prepare parameters for the request
    params = {
        "domains[]": domain  # Send the domain as a parameter
    }
    
    # Make the GET request to the Open PageRank API
    response = requests.get(api_url, headers=headers, params=params)
    
    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()
        if data['response']:
            page_rank_info = data['response'][0]
            return page_rank_info['page_rank_integer']
        else:
            return None  # No data found for this domain
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None

In [62]:
def extract_url_features(url, api_key):
    parsed_url = urlparse(url)
    
    # Extracting features
    features = {
        'length_url': len(url),
        'length_hostname': len(parsed_url.hostname) if parsed_url.hostname else 0,
        'ip': None,  # Placeholder for IP resolution (requires additional implementation)
        'nb_dots': url.count('.'),
        'nb_qm': url.count('?'),
        'nb_eq': url.count('='),
        'nb_slash': url.count('/'),
        'nb_www': url.count('www.'),
        'ratio_digits_url': sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0,
        'ratio_digits_host': sum(c.isdigit() for c in parsed_url.hostname) / len(parsed_url.hostname) if parsed_url.hostname else 0,
        'tld_in_subdomain': 1 if parsed_url.hostname and re.search(r'\.[a-z]{2,}$', parsed_url.hostname) else 0,
        'prefix_suffix': 1 if (parsed_url.path.startswith('/') or parsed_url.path.endswith('.html')) else 0,
        'shortest_word_host': min(len(word) for word in parsed_url.hostname.split('.') if word) if parsed_url.hostname else 0,
        'longest_words_raw': max((len(word) for word in parsed_url.path.split('/') if word), default=0),
        'longest_word_path': max((len(word) for word in parsed_url.path.split('/') if word), default=0),
        'phish_hints': 0,  # Placeholder for phishing detection logic
        'nb_hyperlinks': count_hyperlinks(url),  # Requires HTML parsing (additional implementation needed)
        'ratio_intHyperlinks': link_ratio(url),  # Requires HTML parsing (additional implementation needed)
        'empty_title': has_empty_title(url),  # Requires HTML fetching (additional implementation needed)
        'domain_in_title': title_contains_domain(url),  # Requires HTML fetching (additional implementation needed)
        'domain_age': get_domain_age(url),  # Requires domain registration lookup (additional implementation needed)
        'google_index': check_indexing_with_site_command(url),  # Requires Google search API check (additional implementation needed)
        'page_rank': get_page_rank(url, api_key)   # Requires external service/API to get page rank
    }
    
    return features

In [63]:
extract_url_features(url='http://www.enkiquotes.com/', api_key=api_key)

{'length_url': 26,
 'length_hostname': 18,
 'ip': None,
 'nb_dots': 2,
 'nb_qm': 0,
 'nb_eq': 0,
 'nb_slash': 3,
 'nb_www': 1,
 'ratio_digits_url': 0.0,
 'ratio_digits_host': 0.0,
 'tld_in_subdomain': 1,
 'prefix_suffix': 1,
 'shortest_word_host': 3,
 'longest_words_raw': 0,
 'longest_word_path': 0,
 'phish_hints': 0,
 'nb_hyperlinks': 47,
 'ratio_intHyperlinks': 10.5,
 'empty_title': 0,
 'domain_in_title': 1,
 'domain_age': 2756,
 'google_index': 1,
 'page_rank': 4}

In [82]:
test_url_1 = 'https://www.lukeptaylor.com/blog/sigma-fp-a-tiny-full-frame-photography-powerhouse'
test_url='http://vamoaestudiarmedicina.blogspot.com/'

In [83]:
features_dict = extract_url_features(test_url, api_key)
features_df = pd.DataFrame([features_dict])
features_df = features_df[selected_features]
features_scaled = scaler.transform(features_df)

prediction = model.predict(features_scaled)

# Output the result
result = 'phishing' if prediction[0] == 1 else 'legitimate'
print(f'The URL "{test_url}" is classified as: {result}')

An error occurred: No match for "VAMOAESTUDIARMEDICINA.BLOGSPOT.COM".
>>> Last update of whois database: 2024-11-26T19:40:11Z <<<

NOTICE: The expiration date displayed in this record is the date the
registrar's sponsorship of the domain name registration in the registry is
currently set to expire. This date does not necessarily reflect the expiration
date of the domain name registrant's agreement with the sponsoring
registrar.  Users may consult the sponsoring registrar's Whois database to
view the registrar's reported date of expiration for this registration.

TERMS OF USE: You are not authorized to access or query our Whois
database through the use of electronic processes that are high-volume and
automated except as reasonably necessary to register domain names or
modify existing registrations; the Data in VeriSign Global Registry
Services' ("VeriSign") Whois database is provided by VeriSign for
information purposes only, and to assist persons in obtaining information
about or relat



In [75]:
prediction

array([1])