# **Rule-Based Detection of Malicious URLs**
* The system performs URL validation, detects potential domain misspellings, flags subdomain abuse, identifies uncommon top-level domains (TLDs), checks for suspicious URL encoding, and evaluates URL length.
* It also includes a mechanism to detect shortened URLs by comparing them to a list of known URL shorteners. 
* The implementation avoids reliance on external libraries, ensuring compatibility in environments like Kaggle.
* This approach provides a straightforward method to detect potentially malicious URLs for research purposes.

In [11]:
import re
from urllib.parse import urlparse

# Reference data for the TLD and URL-shortener checks.
# Kept as plain lists; membership is tested with `in`.
common_tlds = [
    'com',
    'org',
    'net',
    'edu',
    'gov',
    'ac',
]
url_shorteners = [
    'bit.ly',
    'goo.gl',
    'tinyurl.com',
    't.co',
    'is.gd',
    'buff.ly',
]

# Manual URL validation using regex
def is_valid_url(url):
    """Return True when *url* looks like an http(s) URL, False otherwise.

    Accepted shape: an optional ``http://`` or ``https://`` prefix, then a
    dotted domain name, ``localhost`` or a dotted-quad IPv4 address, then an
    optional ``:port`` and an optional path, query or fragment.

    Limitations: IPv4 octets are only checked to be 1-3 digits (no 0-255
    range check), and the scheme must be lower-case.

    Args:
        url: Candidate URL. Non-string values are reported as invalid
            instead of raising.

    Returns:
        bool: whether the whole string matches the accepted shape.
    """
    if not isinstance(url, str):
        return False

    url_regex = re.compile(
        r'(https?://)?'  # http:// or https:// (optional)
        r'((([A-Za-z0-9-]+\.)+[A-Za-z]{2,})|'  # Domain name
        r'localhost|'  # localhost
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # IPv4 address
        r'(:\d+)?'  # Port number (optional)
        r'([/?#].*)?'  # Path, query or fragment (optional)
    )
    # fullmatch instead of a `$` anchor: `$` also matches just before a
    # trailing newline, which would let "http://example.com\n" through.
    return url_regex.fullmatch(url) is not None

# Extract domain name and subdomain using urlparse
def extract_domain_and_subdomain(url):
    """Split the host of *url* into ``(subdomain, domain, suffix)``.

    The host is split on dots: the last label is the suffix, the one before
    it is the domain, and everything earlier is the subdomain. No public
    suffix list is consulted, so multi-label suffixes are not recognised
    (``ucam.uiu.ac.bd`` yields domain ``ac`` and suffix ``bd``), and IPv4
    hosts are split the same way as names.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        tuple[str, str, str]: ``(subdomain, domain, suffix)``; parts that
        are absent are returned as empty strings.
    """
    parsed_url = urlparse(url)
    if not parsed_url.netloc:
        # Without a "scheme://" prefix urlparse puts the host into the path
        # (or mistakes "host:port" for a scheme). Re-parse as a
        # network-path reference so the host lands in netloc.
        parsed_url = urlparse('//' + url)

    # hostname drops userinfo and port and is lower-cased, so
    # "user@Example.COM:8080" yields "example.com" rather than leaking
    # ":8080" into the suffix.
    host = parsed_url.hostname or ""
    domain_parts = host.split('.')

    if len(domain_parts) > 2:
        subdomain = ".".join(domain_parts[:-2])
        domain = domain_parts[-2]
        suffix = domain_parts[-1]
    else:
        subdomain = ""
        domain = domain_parts[0]
        suffix = domain_parts[1] if len(domain_parts) > 1 else ""

    return subdomain, domain, suffix

# Check for subdomain abuse
def check_subdomain_abuse(subdomain):
    """Flag a subdomain made of more than two dot-separated labels.

    Args:
        subdomain: Subdomain portion of a host (may be empty).

    Returns:
        str: a warning naming the subdomain, or a "no abuse" message.
    """
    clean_message = "No subdomain abuse detected"

    # Nothing to inspect when the host has no subdomain at all.
    if not subdomain:
        return clean_message

    labels = subdomain.split('.')
    if len(labels) <= 2:
        return clean_message

    return f"Suspicious subdomain detected: {subdomain}"

# Check for uncommon TLDs
def check_uncommon_tlds(tld):
    """Report whether *tld* is absent from the module-level ``common_tlds`` list.

    Args:
        tld: Top-level domain label without the leading dot, e.g. ``"com"``.

    Returns:
        str: ``"Common TLD used"`` when the label is in ``common_tlds``,
        otherwise a message naming the label exactly as it was passed in.
    """
    # Host names are case-insensitive, so "COM" must match the lower-case
    # entries in common_tlds instead of being reported as uncommon.
    if tld.lower() not in common_tlds:
        return f"Uncommon TLD detected: {tld}"
    return "Common TLD used"

# Check for suspicious URL encoding
def check_suspicious_encoding(url):
    """Report whether *url* contains a percent-encoded byte.

    A percent-encoded byte is a ``%`` followed by exactly two hex digits.

    Args:
        url: URL string to scan.

    Returns:
        str: a warning when at least one such sequence is present,
        otherwise a "no suspicious encoding" message.
    """
    has_escape = re.search(r'%[0-9A-Fa-f]{2}', url) is not None
    return "Suspicious URL encoding detected" if has_escape else "No suspicious encoding"

# Check URL length
def check_url_length(url, max_length=100):
    """Report whether *url* is longer than *max_length* characters.

    Args:
        url: URL string to measure.
        max_length: Largest length still treated as safe. The default of
            100 is an arbitrary heuristic threshold; pass another value to
            tune the check.

    Returns:
        str: a warning that includes the actual length when the URL is
        longer than ``max_length``, otherwise a "safe range" message.
    """
    url_length = len(url)
    if url_length > max_length:
        return f"URL length suspicious: {url_length} characters"
    return "URL length within safe range"

# Detect shortened URLs
def check_shortened_url(parsed_url):
    """Report whether the parsed URL's host is a known URL shortener.

    Args:
        parsed_url: Result of ``urllib.parse.urlparse`` for the URL.

    Returns:
        str: a message naming the shortener host when it appears in the
        module-level ``url_shorteners`` list, otherwise a "no shortened
        URL" message.
    """
    # Compare on hostname rather than netloc: hostname is lower-cased and
    # excludes userinfo and port, so "BIT.LY", "bit.ly:443" and
    # "user@bit.ly" cannot slip past the exact-match lookup.
    host = parsed_url.hostname or ""
    if host in url_shorteners:
        return f"Shortened URL detected: {host}"
    return "No shortened URL detected"

# Function to run all checks on a given URL
def check_url(url):
    """Run every rule-based check against *url* and collect the messages.

    Args:
        url: URL string to analyse.

    Returns:
        str | dict: the string ``"Invalid URL format"`` when ``is_valid_url``
        rejects the input; otherwise a dict mapping each check's display
        name to the message that check returned.
    """
    # Validate the URL manually
    if not is_valid_url(url):
        return "Invalid URL format"

    # is_valid_url accepts scheme-less input such as "example.com/path",
    # but urlparse only fills in netloc when the host follows "scheme://".
    # Parse a copy with a scheme so the host-based checks see the real host.
    # The encoding and length checks below still run on the URL as given.
    if url.startswith(('http://', 'https://')):
        parse_target = url
    else:
        parse_target = 'http://' + url

    # Parse and extract domain components
    parsed_url = urlparse(parse_target)
    subdomain, domain, suffix = extract_domain_and_subdomain(parse_target)

    # Running all checks
    # NOTE(review): check_misspelled_domain is defined outside this cell;
    # it is assumed to take the domain label and return a message string.
    results = {
        "Misspelled Domain": check_misspelled_domain(domain),
        "Subdomain Abuse": check_subdomain_abuse(subdomain),
        "Uncommon TLD": check_uncommon_tlds(suffix),
        "Suspicious URL Encoding": check_suspicious_encoding(url),
        "URL Length": check_url_length(url),
        "Shortened URL Detection": check_shortened_url(parsed_url)
    }

    return results

# Example Usage
test_url = "https://ucam.uiu.ac.bd/Security/LogIn.aspx"
result = check_url(test_url)

# Print results
# check_url returns a plain message string instead of a dict when the URL
# fails validation, so only iterate when a dict of check results came back.
if isinstance(result, dict):
    for check, outcome in result.items():
        print(f"{check}: {outcome}")
else:
    print(result)

Misspelled Domain: Potential misspelling: ac (similar to amazon)
Subdomain Abuse: No subdomain abuse detected
Uncommon TLD: Uncommon TLD detected: bd
Suspicious URL Encoding: No suspicious encoding
URL Length: URL length within safe range
Shortened URL Detection: No shortened URL detected
