In [73]:
!pip install tldextract python-whois requests beautifulsoup4




In [81]:
import re
import requests
import tldextract
import whois
from datetime import datetime
from urllib.parse import urlparse

# Known free hosting domains
FREE_HOSTING_DOMAINS = [
    "weebly.com", "wixsite.com", "ukit.me",
    "000webhostapp.com", "wordpress.com",
    "blogspot.com", "yolasite.com", "tripod.com"
]

# Suspicious keywords in URL path
SUSPICIOUS_KEYWORDS = [
    "login", "signin", "secure", "verify", "account",
    "update", "password", "outlook", "office", "paypal",
    "banking", "webmail", "inbox", "light.aspx"
]

SUSPICIOUS_EXTENSIONS = [".php", ".asp", ".aspx", ".cgi", ".exe"]
SUSPICIOUS_QUERY_TOKENS = ["id=", "rand=", "login", "session"]

# Checking if URL is reachable
def is_url_alive(url):
    try:
        resp = requests.head(url, timeout=5)
        return resp.status_code < 400
    except:
        return False

# Extracting features from URL
def extract_features(url):
    features = {}
    parsed = urlparse(url)
    extracted = tldextract.extract(url)
    domain = extracted.registered_domain
    subdomain = extracted.subdomain
    path = parsed.path.lower()
    query = parsed.query.lower()

    # Hard rules
    ip_pattern = re.compile(r"(\d{1,3}\.){3}\d{1,3}")
    features['has_ip_in_url'] = bool(ip_pattern.search(url))
    features['url_length'] = len(url)
    features['has_at_symbol'] = "@" in url
    features['has_dash_in_domain'] = "-" in extracted.domain
    features['subdomain_count'] = 0 if not subdomain else subdomain.count(".") + 1
    features['https_token_in_domain'] = "https" in extracted.domain.lower()

    # WHOIS info
    try:
        domain_info = whois.whois(domain)
        creation_date = domain_info.creation_date
        expiration_date = domain_info.expiration_date
        now = datetime.utcnow()

        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        if isinstance(expiration_date, list):
            expiration_date = expiration_date[0]

        features['domain_age_months'] = (now - creation_date).days // 30 if creation_date else 0
        features['registration_length_months'] = (expiration_date - creation_date).days // 30 if creation_date and expiration_date else 0
        features['dns_record_found'] = True
        features['months_to_expire'] = max((expiration_date - now).days // 30, 0) if expiration_date else 0
    except:
        features['domain_age_months'] = 0
        features['registration_length_months'] = 0
        features['dns_record_found'] = False
        features['months_to_expire'] = 0

    if features['has_ip_in_url']:
        features['dns_record_found'] = False

    # Path features
    features['path_depth'] = path.count("/") if path else 0
    features['suspicious_path_keyword'] = any(k in path for k in SUSPICIOUS_KEYWORDS)
    features['suspicious_extension'] = any(path.endswith(ext) for ext in SUSPICIOUS_EXTENSIONS)
    features['suspicious_query_token'] = any(tok in query for tok in SUSPICIOUS_QUERY_TOKENS)

    # Free hosting
    features['is_free_hosting'] = domain in FREE_HOSTING_DOMAINS

    # URL reachability
    features['url_alive'] = is_url_alive(url)

    return features

# Decision tree classification
def decision_tree_classify(features, url):
    # Hard phishing rules
    if features['has_ip_in_url']:
        return "phishing", "Contains IP in URL"
    if features['has_at_symbol']:
        return "phishing", "Contains '@' in URL"
    if not features['dns_record_found']:
        return "phishing", "No DNS record found"
    if features['https_token_in_domain']:
        return "phishing", "Domain contains 'https' token"
    if features['domain_age_months'] == 0:
        return "phishing", "Domain age missing or 0 months"
    if features['months_to_expire'] <= 3:
        return "phishing", f"Domain expiring in {features['months_to_expire']} months"

    # URL reachability check
    if not features['url_alive']:
        return "suspicious", "URL unreachable or dead"

    # Strong suspicious signals
    suspicious_reasons = []
    if features['is_free_hosting']:
        suspicious_reasons.append("Free hosting domain")
    if features['suspicious_path_keyword']:
        suspicious_reasons.append("Suspicious keyword in path")
    if features['has_dash_in_domain']:
        suspicious_reasons.append("Dash in domain")

    # Weak signals
    weak_signals = 0
    if features['suspicious_extension']:
        weak_signals += 1
    if features['suspicious_query_token']:
        weak_signals += 1
    if features['path_depth'] >= 3:
        weak_signals += 1
    if weak_signals >= 2:
        suspicious_reasons.append("Multiple weak suspicious signals (extension/query/path depth)")

    if suspicious_reasons:
        return "suspicious", ", ".join(suspicious_reasons)

    return "legitimate", "No strong or weak suspicious patterns detected"

# Runner
def classify_url(url):
    features = extract_features(url)
    label, reason = decision_tree_classify(features, url)

    activity_info = (f" | Domain age: {features['domain_age_months']} months"
                     f" | Months to expire: {features['months_to_expire']}")

    if label == "legitimate":
        print(f"\nURL: {url}\nDecision Tree → LEGITIMATE ({reason}){activity_info}")
    elif label == "phishing":
        print(f"\nURL: {url}\nDecision Tree → PHISHING ({reason}){activity_info}")
    else:
        print(f"\nURL: {url}\nDecision Tree → SUSPICIOUS ({reason}){activity_info}")

# Test URLs
test_urls = [
    "https://google.com",
    "http://facebook.com",
    "http://writeassociate.com/test/Portal/inicio/IO8Hc30w_Eq8DfVjyJGvwEO4GhAnH48CqLwGx-uH4XXCpAPCJlRkBsaGmGQ6QgAIyLKwQ/www.Bancasaleon.com.do/bhdi/", # Phishing site
    "http://acornpresscanada.com/x487kjfdsd9274r98yuofiwo5uodjld2/chase-home/verification-card.php?https://chaseonline.chase.com/Logon.aspx?LOB=RBGLogon", #Phising site
    "https://tubuh-syarikat.com/plugins/fields/files/",# Phising Site
    "http://webmasteradmin.ukit.me/",
    "https://youtube.com"
]

for u in test_urls:
    print("\n" + "="*60)
    classify_url(u)





  domain = extracted.registered_domain
  now = datetime.utcnow()



URL: https://google.com
Decision Tree → LEGITIMATE (No strong or weak suspicious patterns detected) | Domain age: 340 months | Months to expire: 36


URL: http://facebook.com
Decision Tree → LEGITIMATE (No strong or weak suspicious patterns detected) | Domain age: 346 months | Months to expire: 104


URL: http://writeassociate.com/test/Portal/inicio/IO8Hc30w_Eq8DfVjyJGvwEO4GhAnH48CqLwGx-uH4XXCpAPCJlRkBsaGmGQ6QgAIyLKwQ/www.Bancasaleon.com.do/bhdi/
Decision Tree → SUSPICIOUS (URL unreachable or dead) | Domain age: 5 months | Months to expire: 6


URL: http://acornpresscanada.com/x487kjfdsd9274r98yuofiwo5uodjld2/chase-home/verification-card.php?https://chaseonline.chase.com/Logon.aspx?LOB=RBGLogon
Decision Tree → SUSPICIOUS (URL unreachable or dead) | Domain age: 293 months | Months to expire: 34


URL: https://tubuh-syarikat.com/plugins/fields/files/
Decision Tree → PHISHING (Domain expiring in 0 months) | Domain age: 11 months | Months to expire: 0


URL: http://webmasteradmin.ukit.me/