In [None]:
!pip install requests python-whois dnspython pandas scikit-learn joblib

In [1]:
pip install ipwhois

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install numpy

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install tldextract

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import whois
import os
import dns.resolver
import requests
import time
import socket
import ssl
import urllib.parse
from urllib.parse import urlparse
import tldextract
from datetime import datetime
import joblib
import numpy as np
import pandas as pd

def extract_features(url):
    features = []
    
    # Feature 1: directory_length
    try:
        parsed_url = urlparse(url)
        url_path = parsed_url.path.rsplit('/', 1)[0]
        features.append(len(url_path) if url_path else 0)
    except:
        features.append(0)

    # Feature 2: time_domain_activation
    try:
        domain_info = whois.whois(url)
        creation_date = domain_info.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        features.append((datetime.now() - creation_date).days)
    except:
        features.append(-1)

    # Feature 3: asn_ip
    try:
        ip = socket.gethostbyname(urlparse(url).netloc)
        asn = requests.get(f"https://ipapi.co/{ip}/asn/").text.strip()
        features.append(asn.strip('AS')) if asn else features.append(0)
    except:
        features.append(0)

    # Feature 4: time_response
    try:
        start = time.time()
        response = requests.get(url, timeout=5)
        end = time.time()
        features.append(end - start)
    except:
        features.append(0.207 if not None else 0.207)

    # Feature 5: length_url
    try:
        features.append(len(url))
    except:
        features.append(0)
    
    # Feature 6: ttl_hostname
    try:
        domain = tldextract.extract(url).registered_domain
        ttl = dns.resolver.resolve(domain, 'NS').rrset.ttl
        features.append(ttl)
    except:
        features.append(0)

    # Feature 7: qty_dot_domain
    try:
        domain = urlparse(url).netloc
        features.append(domain.count('.') if '.' in domain else 0)
    except:
        features.append(0)

    # Feature 8: time_domain_expiration
    try:
        expiration_date = whois.whois(url).expiration_date
        today = datetime.now()
        features.append((expiration_date - today).days)
    except:
        features.append(-1)

    # Feature 9: qty_nameservers
    try:
        domain = tldextract.extract(url).registered_domain
        ns_list = dns.resolver.resolve(domain, 'NS')
        features.append(len(ns_list))
    except:
        features.append(0)

    # Feature 10: domain_length
    try:
        domain = urlparse(url).netloc
        features.append(len(domain) if domain else 0)
    except:
        features.append(0)

    # Feature 11: qty_slash_url
    try:
        features.append(url.count('/') if '/' in url else 0)
    except:
        features.append(0)

    # Feature 12: qty_mx_servers
    try:
        domain = tldextract.extract(url).registered_domain
        mx_list = dns.resolver.resolve(domain, 'MX')
        features.append(len(mx_list))
    except:
        features.append(0)

    # Feature 13: qty_hyphen_directory
    try:
        parsed_url = urlparse(url)
        url_path = parsed_url.path.rsplit('/', 1)[0]
        features.append(url_path.count('-') if '-' in url_path else 0)
    except:
        features.append(0)

    # Feature 14: qty_vowels_domain
    try:
        domain = urlparse(url).netloc
        vowels = set(['a', 'e', 'i', 'o', 'u', 'A','E','I','O','U'])
        qty_vowels = sum(1 for c in domain if c in vowels)
        features.append(qty_vowels)
    except:
        features.append(0)

    # Feature 15: qty_ip_resolved
    try:
        ip_list = socket.getaddrinfo(urlparse(url).netloc, None)
        features.append(len(ip_list))
    except:
        features.append(-1)

    # Feature 16: file_length
    try:
        file_name = os.path.basename(urlparse(url).path)
        features.append(len(file_name) if file_name else 0)
    except:
        features.append(0)

    # Feature 17: qty_redirects
    try:
        response = requests.get(url)
        features.append(len(response.history))
    except:
        features.append(-1)

    # Feature 18: qty_slash_directory
    try:
        parsed_url = urlparse(url)
        url_path = parsed_url.path.rsplit('/', 1)[0]
        features.append(url_path.count('/') if '/' in url_path else 0)
    except:
        features.append(0)

    # Feature 19: qty_dot_url
    try:
        features.append(url.count('.') if '.' in url else 0)
    except:
        features.append(0)

    # Feature 20: qty_dot_file
    try:
        file_name = os.path.basename(urlparse(url).path)
        features.append(file_name.count('.') if '.' in file_name else 0)
    except:
        features.append(0)

    return features

# Load the saved model and feature names
with open('../Model/random_forest_model.pkl', 'rb') as model_file:
    model = joblib.load(model_file)

with open('../Model/feature_names.pkl', 'rb') as feature_names_file:
    feature_names = joblib.load(feature_names_file)

# Test the function with a sample URL
sample_url = "https://st-join.com/"
extracted_features = extract_features(sample_url)

# Create a dictionary of feature names and their values
features_dict = dict(zip(feature_names, extracted_features))

# Print the extracted features with their names
print("Extracted Features with Names:", features_dict)

# Convert the extracted features to a DataFrame with the correct feature names
features_df = pd.DataFrame([extracted_features], columns=feature_names)

# Predict if the URL is legitimate or phishing
prediction = model.predict(features_df)

if prediction == 1:
    print(f"\nThe URL: {sample_url} is Phishing!!")
else:
    print(f"\nThe URL: {sample_url} is Legitimate")


Extracted Features with Names: {'directory_length': 0, 'time_domain_activation': 6, 'asn_ip': 0, 'time_response': 0.207, 'length_url': 20, 'ttl_hostname': 0, 'qty_dot_domain': 1, 'time_domain_expiration': -1, 'qty_nameservers': 0, 'domain_length': 11, 'qty_slash_url': 3, 'qty_mx_servers': 0, 'qty_ip_resolved': 0, 'qty_vowels_domain': 3, 'qty_hyphen_directory': -1, 'qty_redirects': 0, 'file_length': -1, 'qty_dot_url': 0, 'qty_slash_directory': 1, 'tls_ssl_certificate': 0}

The URL: https://st-join.com/ is Phishing!!
