# Feature Extraction

In [2]:
import pandas as pd

# URL Based
"URL-based" typically refers to operations, processes, or functionality that specifically work with Uniform Resource Locators (URLs), which are the web addresses used to identify resources on the internet. When you work with URLs in your code, you are often performing tasks like parsing, validating, extracting, or manipulating parts of the URL (e.g., protocol, host, path, query parameters).

In [3]:
# Importing urlparse module from urllib
from urllib.parse import urlparse

# Importing the regular expression module
import re


### Checking IP
`havingIP()` checks whether a given URL uses an IP address instead of a domain name and returns 1 (malicious) if it does, or 0 (legitimate) otherwise.

In [4]:
import ipaddress

# Function to check if the URL contains an IP address
def havingIP(url):
    """Return 1 if the URL's host is a literal IP address, else 0.

    The host is extracted with ``urlparse`` so that the scheme, credentials,
    port, path and query do not prevent the address from being recognised.
    A bare address with no scheme (e.g. ``"192.168.0.1"``) is still accepted.

    Parameters
    ----------
    url : str
        The URL (or bare address) to inspect.

    Returns
    -------
    int
        1 when the host is a valid IPv4/IPv6 address (treated as malicious),
        0 otherwise (treated as legitimate).
    """
    host = url
    if isinstance(url, str):
        host = url.strip()
        try:
            parsed_host = urlparse(host).hostname
        except ValueError:
            # Malformed netloc (e.g. unbalanced IPv6 brackets).
            parsed_host = None
        # Without a scheme urlparse yields no hostname; keep the raw string
        # so that a bare address is still validated below.
        if parsed_host:
            host = parsed_host

    try:
        ipaddress.ip_address(host)
    except (ValueError, TypeError):
        return 0  # Not an IP address -> legitimate
    return 1  # Host is an IP address -> malicious


### Checking @
haveAt() checks whether the provided URL contains the "@" symbol

In [5]:
# Function to check if the URL contains the "@" symbol
def haveAt(url):
    """Return 1 if the URL contains an "@" character anywhere, else 0.

    1 is treated as malicious and 0 as legitimate.
    """
    return int("@" in url)


### Finding URL Length
url_length() checks the length of a given URL and assigns a value based on whether the URL length is greater than 55 characters

In [6]:
# Function to check if the URL length is greater than 55 characters
def url_length(url):
    """Return 1 if the URL is longer than 55 characters, else 0.

    Long URLs are treated as malicious (1); URLs of 55 characters or fewer
    are treated as legitimate (0).
    """
    return 1 if len(url) > 55 else 0


### Finding URL Depth
url_depth() is designed to check the "depth" of a URL based on how many slashes (/) are present in the URL's path. This can be used to indicate how deeply nested a URL is, which can sometimes be a feature of malicious URLs

In [7]:
# Function to check the URL depth (count of slashes in the path)
def url_depth(url):
    """Return the number of "/" characters in the path component of the URL.

    Only the path is inspected; slashes in the scheme separator, query string
    or fragment are not counted.
    """
    return urlparse(url).path.count('/')


### Checking Redirects
The `Redirection()` function checks whether a URL contains a potential redirection by looking at the position of the last `//` in the URL. In a normal URL the only `//` is the one right after the scheme (index 5 for `http://`, index 6 for `https://`), so a `//` found beyond index 7 suggests that the URL embeds a redirect; in that case the function returns 1, otherwise 0.

In [8]:
def Redirection(url):
    """Return 1 if the last "//" in the URL starts beyond index 7, else 0.

    The "//" that follows "http:" or "https:" sits at index 5 or 6, so a later
    occurrence is taken as a sign of an embedded redirection (malicious).
    A URL with no "//" at all (``rfind`` gives -1) yields 0.
    """
    last_double_slash = url.rfind('//')
    return 1 if last_double_slash > 7 else 0


### Checking for HTTP and HTTPS
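The `https_domain()` function checks whether the domain part of a URL contains the string `https`. Phishers sometimes add an "https" token to the domain itself to make a link look secure, so the function returns 1 (malicious) when the token is present and 0 (legitimate) otherwise. Note that it does not check whether the URL actually uses the HTTPS scheme.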

In [9]:
def https_domain(url):
    """Return 1 if the text "https" occurs inside the URL's netloc, else 0.

    This looks at the domain part only, not the scheme: a URL served over
    HTTPS whose domain does not contain the token still yields 0.
    """
    netloc = urlparse(url).netloc
    return int('https' in netloc)


### Checking for URL Shorteners


In [10]:
# Function for checking if the URL used link shorteners.
# If the URL has used link shortener it is assigned with 1 and 0 if it didn't use.
# 1 indicates that the URL is malicious, whereas 0 indicates a legitimate URL.

# Known link-shortener domains (duplicates from the original list removed).
_SHORTENER_DOMAINS = frozenset({
    'bit.ly', 'goo.gl', 't.co', 'ow.ly', 'tinyurl.com', 'is.gd', 'buff.ly',
    'adobe.ly', 'shorturl.at', 'tiny.cc',
    's.coop', 'rebrand.ly', 'soo.gd', 'v.gd', 'prettylinks.com', 'bc.vc',
    'cutt.ly', 'qr.ae', 'u.nu', 'shorl.com',
    'mcaf.ee', 'tr.im', 'fur.ly', 'cli.gs', 'yourls.org', 'tiny.pl', 'vzturl.com', 'adcrun.ch', 'x.co', 'zz.gd',
    'qr.net', 'shorte.st', '9.bb', 'ity.im', 'adf.ly', 'flyt.it', 'lin.ks', 'adflav.com', 'amzn.to',
    '0rz.tw', 'crisco.com', 'snipurl.com', 'memurl.com', 'dft.ba', 'clicky.me', '7.ly',
    'budurl.com', 'u.to', 'pnt.me', 'viralurl.com', '2.gp', 'xlinkz.info', '3.ly',
    '10.go', 'ergo.pp.ua', 'golinks.co', 'tiny.ie', 'adcraft.co', 'sk.gy',
    'xzb.cc', 'lin.io', 'go.9nl.com', 'u.bb', 'hiderefer.com', 'tu2.ru', 'x.vu', 'lnk.co',
    'su.pr', 'shar.as', 'notlong.com', 'zpag.es', 'u6e.de', '2ya.com', 'viralurl.biz', '4ms.me',
    'rofl.my', 'lurl.no', 'url.ie', 'ff.im', 'hit.my', 'korta.nu', 'x.se', 'ref.so', 'durl.me',
    'fwib.net', 'zii.bz', '1url.com', 'tinylinks.co', 'vb.ly',
    'qr.cx', 'go.2link.me', 'tweetburner.com', 'pic.gd', '2u.pw', 'ls.gd', '2pl.us',
    'urlx.ie', 'utrack.me', 'yi.tl', 'ref.li', 'zipmyurl.com', 'qicute.com', 'cx6.co', 'x90.es', 'urlcorta.es',
    'pw2.ro', 'cort.as', 'minilien.com', 'yourls.com', 'hurl.me', 'tgr.me', 'shout.to',
    'x2c.eu', 'shrten.com', 'dwarfurl.com', 'lnkd.in', 'dai.ly', 'nyti.ms', 'aje.me',
    'huff.to', 'slate.me', 'trib.al', 'pco.lt', 'thetim.es', 'n.pr', 'reut.rs', 'on.wsj.com', 'usat.ly', 'nbcnews.to',
    'abcn.ws', 'cbsn.ws', 'tcrn.ch', 'engt.co', 'bzfd.it', 'ti.me', 'natgeo.org',
    'ars.to', 'for.tn', 't.ted.com',
    'hbr.org', 'huffp.st',
})


def TinyUrl(url):
    """Return 1 if the URL's host belongs to a known link shortener, else 0.

    The host is compared against the shortener list by domain suffix: it
    matches when it equals a listed domain or is a subdomain of one
    (``www.tinyurl.com`` matches ``tinyurl.com``). Plain substring matching is
    deliberately avoided because short entries such as ``t.co`` or ``x.co``
    occur inside unrelated domains (``microsoft.com``, ``dropbox.com``).

    Parameters
    ----------
    url : str
        URL to inspect. As before, a URL without a scheme has no netloc and
        therefore yields 0.

    Returns
    -------
    int
        1 (malicious) when a shortener is used, 0 (legitimate) otherwise.
    """
    # .hostname strips credentials and port and lower-cases the host.
    host = (urlparse(url).hostname or '').rstrip('.')
    if not host:
        return 0

    # Test every dot-aligned suffix: "a.b.bit.ly" -> "a.b.bit.ly", "b.bit.ly",
    # "bit.ly", "ly".
    labels = host.split('.')
    for start in range(len(labels)):
        if '.'.join(labels[start:]) in _SHORTENER_DOMAINS:
            return 1  # Malicious URL
    return 0  # Legitimate URL if no shortener domain found

### Checking Prefix and Suffix
The PSFix() function you've written checks whether the domain part of the URL contains a hyphen (-). If it does, the function returns 1, indicating that the URL might be suspicious (malicious). Otherwise, it returns 0, indicating that the URL is legitimate.

In [11]:
def PSFix(url):
    """Return 1 if the URL's netloc contains a hyphen ("-"), else 0.

    A hyphen in the domain part is treated as a phishing indicator (1);
    its absence is treated as legitimate (0).
    """
    return int('-' in urlparse(url).netloc)


# HTML and JavaScript based Features

In [12]:
# The requests module is a popular Python library used for making HTTP requests. It simplifies the process of sending HTTP requests and handling responses, making it easier to interact with web APIs and web scraping tasks
import requests

### iFrame Redirections 
The function iFrame(response) is designed to check the response from a web request to determine if an iframe is present within the HTML content.

In [13]:
def iFrame(response):
    """Encode whether the fetched page contains an ``<iframe ...>`` tag.

    Parameters
    ----------
    response : object or ""
        Either the empty string (no response available) or an object whose
        ``.text`` attribute holds the page HTML.

    Returns
    -------
    int
        1 when ``response`` is the empty string or no iframe tag is found,
        0 when at least one iframe tag is found.

    NOTE(review): returning 0 when an iframe IS present is the opposite of the
    "1 = suspicious" encoding the URL-based checks use - confirm this is the
    encoding the downstream model expects.
    """
    # No page content to inspect.
    if response == "":
        return 1

    iframe_tags = re.findall(r"<iframe.*?>", response.text)
    return 0 if iframe_tags else 1


### Mouse Over
The function `Mouse_Over(response)` checks a webpage's HTML for `onmouseover` event handlers, i.e. script that runs when the user hovers the mouse over an element on the page. Such handlers can be used to change what the user sees (for example, the link shown in the status bar).

In [14]:
def Mouse_Over(response):
    """Return 1 if the page has an ``onmouseover="..."`` attribute, else 0.

    Parameters
    ----------
    response : object or ""
        Either the empty string (no response available) or an object whose
        ``.text`` attribute holds the page HTML.

    Returns
    -------
    int
        1 when ``response`` is the empty string or a double-quoted
        ``onmouseover`` attribute is found in the HTML; 0 otherwise.
    """
    # Short-circuit keeps .text from being accessed on the empty string.
    if response == "" or re.findall(r'onmouseover=".*?"', response.text):
        return 1
    return 0


### Right Click
The `Right_Click()` function looks for JavaScript that inspects the right mouse button (`event.button == 2`), which is how pages typically disable right-clicking. Disabling the context menu is a common tactic on phishing or malicious websites to stop users from viewing the page source.

In [15]:
def Right_Click(response):
    """Encode whether the page contains an ``event.button == 2`` check.

    Parameters
    ----------
    response : object
        A falsy value (e.g. ``""`` or ``None``) when no response is
        available, otherwise an object whose ``.text`` holds the page HTML.

    Returns
    -------
    int
        1 when ``response`` is falsy or the pattern is absent,
        0 when the pattern is present (right-click handling detected).

    NOTE(review): 0 for "right-click disabled" is the opposite of the
    "1 = suspicious" encoding the URL-based checks use - confirm this is the
    encoding the downstream model expects.
    """
    if not response:
        return 1

    blocks_right_click = re.findall(r"event.button ?== ?2", response.text)
    return 0 if blocks_right_click else 1


### Web Forwards
The `Web_Forwards()` function checks whether a URL goes through multiple redirects, which can be a characteristic of phishing or malicious websites. It requests the URL, follows redirects, and returns 1 when the request was redirected more than twice and 0 otherwise.

In [25]:
def Web_Forwards(url, timeout=10):
    """Return 1 if fetching the URL involves more than two redirects, else 0.

    Parameters
    ----------
    url : str
        Absolute ``http://`` or ``https://`` URL to request.
    timeout : float or tuple, optional
        Timeout in seconds passed to ``requests.get``. Without one, a
        non-responding server would block the call indefinitely.

    Returns
    -------
    int
        1 when the response history holds more than two redirects, 0 when it
        holds two or fewer. 0 is also returned (after printing a message) when
        the URL is empty, lacks an http/https scheme, or the request fails.
    """
    if not url or not url.startswith(('http://', 'https://')):
        print(f"Invalid URL: {url}")
        return 0  # Default value for input that cannot be requested

    # Send a GET request to the URL and follow redirects
    try:
        response = requests.get(url, allow_redirects=True, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return 0  # Default value if the request fails

    # response.history holds one entry per redirect that was followed.
    return 0 if len(response.history) <= 2 else 1


# Input Format

In [45]:
import pickle
import numpy as np
import pandas as pd
import warnings
import re
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup

# Suppress warnings
warnings.filterwarnings("ignore")

def validate_url(url):
    """Clean raw user input and make sure it carries a scheme.

    Steps:
    1. Strip surrounding whitespace.
    2. If the prompt text ("Enter the url") was pasted along with the URL,
       keep only what follows the last ":-" separator.
    3. Prepend "https://" unless the text already starts with an explicit
       ``scheme://`` prefix.

    The scheme test looks for ``scheme://`` instead of relying on
    ``urlparse(url).scheme``, because ``urlparse`` reports ``example.com`` as
    the scheme of ``example.com:8080/path`` and such input would otherwise be
    returned without a usable scheme.

    Parameters
    ----------
    url : str
        Raw text entered by the user.

    Returns
    -------
    str
        The cleaned URL, always starting with a scheme.
    """
    # Remove any leading/trailing whitespace and 'Enter the url...' text
    url = url.strip()
    if "Enter the url" in url:
        url = url.split(":-")[-1].strip()

    # Scheme-relative input ("//host/path") only needs the scheme itself.
    if url.startswith("//"):
        return "https:" + url

    if not re.match(r'[A-Za-z][A-Za-z0-9+.\-]*://', url):
        url = "https://" + url  # Use HTTPS as default scheme
    return url

def havingIP(url):
    """Return 1 if the URL contains a dotted-quad IPv4 address, else 0.

    The whole URL is searched (host, path and query). Lookaround guards make
    sure the match is a complete address and not a slice of a longer numeric
    string: without them "999.1.1.1" matches as "99.1.1.1" and
    "1234.5.6.7890" matches as "234.5.6.78".
    """
    octet = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'
    ip_pattern = (
        r'(?<![\d.])'                      # not preceded by a digit or dot
        + r'(?:' + octet + r'\.){3}' + octet
        + r'(?!\.?\d)'                     # not followed by more digits / octets
    )
    return 1 if re.search(ip_pattern, url) else 0

def haveAt(url):
    """Flag URLs that contain an "@" character (1 if present, 0 if not)."""
    if '@' in url:
        return 1
    return 0

def url_length(url):
    """Flag URLs longer than 75 characters (1 if longer, 0 otherwise)."""
    return int(len(url) > 75)

def url_depth(url):
    """Flag deeply nested URLs.

    Counts the "/" characters in the path component and returns 1 when there
    are more than four of them, otherwise 0.
    """
    slash_count = urlparse(url).path.count('/')
    if slash_count > 4:
        return 1
    return 0

def Redirection(url):
    """Flag a "//" that appears after the first eight characters of the URL.

    Skipping the first eight characters steps over the "//" of "http://" or
    "https://"; any "//" found in the remainder yields 1, otherwise 0.
    """
    remainder = url[8:]
    return int(remainder.find('//') != -1)

def https_domain(url):
    """Flag an "https" token inside the netloc (1 if present, 0 if not).

    Only the domain part is examined; the URL's scheme is not considered.
    """
    return int('https' in urlparse(url).netloc)

def TinyUrl(url):
    """Return 1 if the URL's host is a known URL-shortening service, else 0.

    The comparison is made against the host name only, by domain suffix: the
    host must equal a listed service or be a subdomain of it. Searching the
    whole URL for the service name as a substring would flag unrelated URLs,
    e.g. "microsoft.com" contains "t.co", and a shortener name may appear
    inside a query string.

    Parameters
    ----------
    url : str
        URL to inspect; the scheme is optional ("bit.ly/abc" is accepted).

    Returns
    -------
    int
        1 when the host is a shortening service, 0 otherwise (including
        input whose host cannot be parsed).
    """
    shortening_services = ('bit.ly', 'tinyurl.com', 'goo.gl', 't.co')

    candidate = url.strip().lower()
    # urlparse only fills in the host when the netloc is introduced by "//".
    if not re.match(r'[a-z][a-z0-9+.\-]*://', candidate) and not candidate.startswith('//'):
        candidate = '//' + candidate
    try:
        host = urlparse(candidate).hostname or ''
    except ValueError:
        # Malformed netloc (e.g. unbalanced IPv6 brackets).
        return 0
    host = host.rstrip('.')

    return 1 if any(
        host == service or host.endswith('.' + service)
        for service in shortening_services
    ) else 0

def PSFix(url):
    """Flag a netloc that begins or ends with "-", "." or "_".

    Returns 1 when the first or last character of the netloc is one of those
    characters, otherwise 0 (including when the netloc is empty).
    """
    edge_chars = ('-', '.', '_')
    netloc = urlparse(url).netloc
    return int(netloc.startswith(edge_chars) or netloc.endswith(edge_chars))

def iFrame(response):
    """Return 1 if the fetched page contains an ``<iframe>`` element, else 0.

    Parameters
    ----------
    response : requests.models.Response or any
        Response of the page fetch. Anything that is not a ``Response``
        yields 0.

    Returns
    -------
    int
        1 when at least one iframe element is found, 0 otherwise. Parsing
        problems are treated as "not found" (best effort).
    """
    try:
        if isinstance(response, requests.models.Response):
            soup = BeautifulSoup(response.text, 'html.parser')
            return 1 if soup.find('iframe') is not None else 0
    except Exception:
        # Best effort: an unparsable page counts as "no iframe". Unlike a bare
        # except, this lets KeyboardInterrupt / SystemExit propagate.
        return 0
    return 0

def Mouse_Over(response):
    """Return 1 if an ``onmouseover`` handler touches ``window.status``.

    Parameters
    ----------
    response : requests.models.Response or any
        Response of the page fetch. Anything that is not a ``Response``
        yields 0.

    Returns
    -------
    int
        1 when any element carrying an ``onmouseover`` attribute mentions
        ``window.status`` in its markup, 0 otherwise. Parsing problems are
        treated as "not found" (best effort).
    """
    try:
        if isinstance(response, requests.models.Response):
            soup = BeautifulSoup(response.text, 'html.parser')
            onmouseover_events = soup.find_all(attrs={"onmouseover": True})
            return 1 if any('window.status' in str(event) for event in onmouseover_events) else 0
    except Exception:
        # Best effort: an unparsable page counts as "no handler". Unlike a bare
        # except, this lets KeyboardInterrupt / SystemExit propagate.
        return 0
    return 0

def Right_Click(response):
    """Return 1 if a page script looks like it suppresses default events.

    Parameters
    ----------
    response : requests.models.Response or any
        Response of the page fetch. Anything that is not a ``Response``
        yields 0.

    Returns
    -------
    int
        1 when any ``<script>`` element contains ``preventDefault()`` or
        ``return false``, 0 otherwise. Parsing problems are treated as
        "not found" (best effort).

    NOTE(review): both markers are common in ordinary scripts, so this can
    flag pages that do not disable right-click - confirm the heuristic is
    what the model was trained with.
    """
    try:
        if isinstance(response, requests.models.Response):
            soup = BeautifulSoup(response.text, 'html.parser')
            scripts = soup.find_all('script')
            return 1 if any('preventDefault()' in str(script) or 'return false' in str(script) for script in scripts) else 0
    except Exception:
        # Best effort: an unparsable page counts as "not disabled". Unlike a
        # bare except, this lets KeyboardInterrupt / SystemExit propagate.
        return 0
    return 0

def Web_Forwards(url, timeout=5):
    """Return 1 if fetching the URL follows more than one redirect, else 0.

    Parameters
    ----------
    url : str
        URL to request.
    timeout : float or tuple, optional
        Timeout in seconds passed to ``requests.get``. Without one, a
        non-responding server would block the call indefinitely.

    Returns
    -------
    int
        1 when the response history holds more than one redirect, 0 otherwise.
        Any failure to complete the request also yields 0 (best effort).
    """
    try:
        response = requests.get(url, allow_redirects=True, timeout=timeout)
    except Exception:
        # Best effort: unreachable or invalid URLs count as "no forwarding".
        # Unlike a bare except, KeyboardInterrupt / SystemExit propagate.
        return 0
    return 1 if len(response.history) > 1 else 0

def evaluate_url(url, model_path='random_forest_model.pkl'):
    """Extract the 12 features for a URL and classify it with the saved model.

    Parameters
    ----------
    url : str
        Raw URL text; it is cleaned with ``validate_url`` first.
    model_path : str, optional
        Path of the pickled classifier. Defaults to the file the notebook
        ships with.

    Returns
    -------
    tuple
        ``(prediction, probability, feature)`` where ``prediction`` is the
        first element of ``rf_model.predict``, ``probability`` the first row
        of ``rf_model.predict_proba`` and ``feature`` the list of the 12
        extracted feature values in the order printed below.
    """
    # Clean and validate URL
    url = validate_url(url)
    print(f"Analyzing URL: {url}")

    # URL-based features (order must match feature_names below)
    feature = [
        havingIP(url),
        haveAt(url),
        url_length(url),
        url_depth(url),
        Redirection(url),
        https_domain(url),
        TinyUrl(url),
        PSFix(url),
    ]

    # Content-based features. They are collected in a separate list and added
    # in one step, so a failure part-way through can never leave the vector
    # with the wrong number of entries.
    try:
        response = requests.get(url, timeout=5, verify=True)
        content_features = [
            iFrame(response),
            Mouse_Over(response),
            Right_Click(response),
            Web_Forwards(url),  # issues its own request to count redirects
        ]
    except Exception as e:
        print(f"Warning: Couldn't fetch URL content ({str(e)})")
        content_features = [0, 0, 0, 0]  # Default values for content-based features
    feature.extend(content_features)

    # Print extracted features with labels
    feature_names = ['IP Address', 'At Symbol', 'URL Length', 'URL Depth',
                    'Redirection', 'HTTPS in Domain', 'TinyURL', 'Suspicious Fix',
                    'iFrame', 'Mouse Over', 'Right Click Disabled', 'Web Forwards']

    print("\nExtracted Features:")
    for name, value in zip(feature_names, feature):
        print(f"{name}: {value}")

    # Make prediction
    feature_array = np.array(feature).reshape(1, -1)
    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only load model files that come from a trusted source.
    with open(model_path, 'rb') as file:
        rf_model = pickle.load(file)

    prediction = rf_model.predict(feature_array)[0]
    probability = rf_model.predict_proba(feature_array)[0]

    return prediction, probability, feature

# Example usage
if __name__ == "__main__":
    # Ask for the URL interactively and run the full feature/prediction pipeline.
    url = input('Enter the URL that needs to be evaluated: ')
    prediction, probability, features = evaluate_url(url)

    # The reported confidence is the probability entry of the predicted class:
    # index 1 for the phishing verdict, index 0 for the legitimate verdict.
    if prediction != 1:
        print(f"\nResult: ✅ Legitimate ({probability[0]:.2%} confidence)")
    else:
        print(f"\nResult: ⚠️ Potential Phishing ({probability[1]:.2%} confidence)")
        

Enter the URL that needs to be evaluated:  https://mail.google.com/mail/u/0/?tab=rm&ogbl#inbox/FMfcgzGsnBmKbNRFtQfMbdBqFhQFhgSF


Analyzing URL: https://mail.google.com/mail/u/0/?tab=rm&ogbl#inbox/FMfcgzGsnBmKbNRFtQfMbdBqFhQFhgSF

Extracted Features:
IP Address: 0
At Symbol: 0
URL Length: 1
URL Depth: 0
Redirection: 0
HTTPS in Domain: 0
TinyURL: 0
Suspicious Fix: 0
iFrame: 0
Mouse Over: 0
Right Click Disabled: 1
Web Forwards: 1

Result: ✅ Legitimate (56.98% confidence)
