Feature Extraction

In [1]:
from urllib.parse import urlparse,urlencode
import ipaddress 

3.1 IP Address in the URL
Checks for the presence of IP address in the URL. URLs may have IP address instead of domain name. If an IP address is used as an alternative of the domain name in the URL, we can be sure that someone is trying to steal personal information with this URL.

If the domain part of URL has IP address, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).

re.search() function is used to search for specific patters in a url.

In [2]:
def has_ip_address(url):
    # Regular expression pattern to match IPv4 addresses
    ipv4_pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
    
    # Regular expression pattern to match IPv6 addresses
    ipv6_pattern = r'\b(?:[A-F0-9]{1,4}:){7}[A-F0-9]{1,4}\b'
    
    # Search for IPv4 and IPv6 patterns in the URL
    has_ipv4 = re.search(ipv4_pattern, url) is not None
    has_ipv6 = re.search(ipv6_pattern, url) is not None
    
    # Return True if either IPv4 or IPv6 address is found in the URL
    if (has_ipv4 | has_ipv6):
        ip = 1 #has ip address
    else:
        ip = 0 #doesn't have ip address
    
    return ip

3.2 Checking the length of the URL

if length of URL >= 54 then assigning it a value of 1 (phishing) else 0 (benign)

In [3]:
import re

def length(url):
    length = len(url)
    if length < 54:
        return 0
    else:
        return 1

3.3 If there is a @ in a url, everything after the @ is ignored by the browser
We assign at a value of 1 (phishing) if the url has @ else 0 (benign)

In [4]:
def checkAtSign(url):
    if "@" in url:
        at = 1
    else:
        at = 0
    return at

3.4 Domain Age: Phishing websites often have recently registered domains, while legitimate websites tend to have older domains. You can determine the age of the domain using WHOIS data and include it as a feature.

In [5]:
!pip install python-whois
from datetime import datetime 
import whois




In [6]:
def calc_domain_age(url, threshold=30):
    try:
        domain_information = whois.whois(domain)
        creation_date = domain_information.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
            
        if creation_date:
            current_date = datetime.now()
            domain_age = (current_date - creation_date).days
        else:
            domain_age = None

        if domain_age and domain_age <= threshold:
            recent_domain = 1
        else:
            recent_domain = 0
        return recent_domain
        
    except:
        return 0

3.5 URL shortening
URL shortening is a method on the “World Wide Web” in which a URL may be made considerably smaller in length and still lead to the required webpage. 

If the URL is using Shortening Services, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).

In [7]:
#listing shortening services
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

In [8]:
def check_shortening(url):
    match = re.search(shortening_services, url)
    if match:
        return 1
    else:
        return 0

In [9]:
!pip install python-whois



In [10]:
import re
from bs4 import BeautifulSoup
import whois
import urllib
import urllib.request
from datetime import datetime

3.6 Web traffic
Phishing websites usually live for a short period of time; they may not be recognised by the Alexa database. 

For our model, if the domain has no traffic or is not recognized by the Alexa database, it is classified as “Phishing”.


In [11]:
def web_traffic(url):
  try:
    #Filling the whitespaces in the URL if any
    url = urllib.parse.quote(url)
    rank = BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url).read(), "xml").find(
        "REACH")['RANK']
    rank = int(rank)
  except TypeError:
        return 1
  else:
      if rank <100000:
        return 1
      else:
        return 0

3.6 HTML and JS based features

In [12]:
import requests #this liab is used to make HTTP requests to URLS
from bs4 import BeautifulSoup #to parse the HTML docs


Iframe is an HTML tag used to display an additional webpage into one that is currently shown. Phishers can make use of the “iframe” tag and make it invisible i.e. without frame borders. In this regard, phishers make use of the “frameBorder” attribute which causes the browser to render a visual delineation. 

If the iframe is empty or repsonse is not found then, the value assigned to this feature is 1 (phishing) or else 0 (legitimat."""

In [13]:
def check_for_iframe(response):
    if response == "":
        return 1
    try:
        #response = requests.get(url) #sending an HTTP get request
        if response.status_code == 200:
            html_content = response.text
            soup = BeautifulSoup(html_content, 'html.parser')
            iframes = soup.find_all('iframe')
            if iframes:
                return 1 #phishing
            else:
                return 0 #benign 
        else:
            return 0
    except Exception as e:
        return 0
        print("An check_for_iframe error occurred:", str(e))

Phishers may use JS to show a fake URL in the status bar to users. To extract this feature, we try to find out the webpage source code, particularly the "onMouseOver" event and check if it makes any changes on the status bar.

In [14]:
def check_for_bar_manipulation(response):
    if response == "":
        return 1
    try:
        #response = requests.get(url) #sending an HTTP get request
        if response.status_code == 200:
            html_content = response.text
            js_code = re.findall(r'<script.*?onmouseover.*?>.*?</script>', html_content, re.DOTALL)
            if js_code:
                for code in js_code:
                    if "window.status" in code or "status=" in code: #these expressions are used to manipulate status bar
                        return 1 #phishing
                    else: 
                        return 0
        else:
            return 0
    except Exception as e:
        print("A check_for_bar_manipulation error occurred:", str(e))
        return 0

Phishers attempt to disable the right_click option to prevent users from saving and viewing the webpage source code. For this feature, we search for the event where event.button==2 in the web source code and check if the right click is disabled. 

In [15]:
# 15. IFrame Redirection (iFrame)
def iframe(response):
  if response == "":
      return 1
  else:
      if re.findall(r"[|]", response.text):
          return 0
      else:
          return 1

In [16]:
# 18.Checks the number of forwardings (Web_Forwards)    
def forwarding(response):
  if response == "":
    return 1
  else:
    if len(response.history) <= 2:
      return 0
    else:
      return 1

In [17]:
# 17.Checks the status of the right click attribute (Right_Click)
def rightClick(response):
  if response == "":
    return 1
  else:
    if re.findall(r"event.button ?== ?2", response.text):
      return 0
    else:
      return 1

In [18]:
# 16.Checks the effect of mouse over on status bar (Mouse_Over)
def mouseOver(response): 
  if response == "" :
    return 1
  else:
    if re.findall("", response.text):
      return 1
    else:
      return 0

In [19]:
def check_for_right_click_disabled(response):
    if response == "":
        return 1
    try:
        #response = requests.get(url) #sending an HTTP get request
        if response.status_code == 200:
            html_content = response.text
            js_code = re.findall(r'<script.*?>.*?</script>', html_content, re.DOTALL)
            if js_code:
                for code in js_code:
                    if "event.button==2" in code or "status=" in code:
                        return 1 #phishing
                    else: 
                        return 0
        else:
            return 0
    except Exception as e:
        print("A check_for_right_click_disabled error occurred:", str(e))
        return 0

The number of times a website has been redirected also differentiates a phishing website from a legitimate website. From the dataset, one observation was legitimate urls only redirected one time max whereas phishing websites have be redirected four or more times.

<span style="font-size: 20px;">Computing URL Features</span>

Create a list and a function that calls the other functions and stores all the features of the URL in the list. We will extract the features of each URL and append to this list.

In [20]:
def featureExtraction(url,target_label):
    features = []
    features.append(has_ip_address(url))
    features.append(length(url))
    features.append(checkAtSign(url))
    features.append(calc_domain_age(url, 30))
    features.append(check_shortening(url))

    try:
        response = requests.get(url)
    except:
        response = ""
        
    features.append(check_for_iframe(response))
    features.append(check_for_bar_manipulation(response))
    features.append(check_for_right_click_disabled(response))
    features.append(forwarding(response))
    features.append(target_label)
    
    return features

In [21]:
import pandas as pd

phishing = pd.read_csv("phishing_urls.csv")
legit = pd.read_csv("benign_urls.csv")
sample_size = 5000

In [22]:
# phish_features = []
# target_label = 1
# feature_names = ['has_ip_address', 'length', 'checkAtSign', 'calc_domain_age', 'check_shortening','check_for_iframe', 'check_for_bar_manipulation','check_for_right_click_disabled', 'forwarding', 'label']

# for i in range(0, 2000):
#     url = phishing["url"][i]
#     phish_features.append(featureExtraction(url,1))
#     phish_final = pd.DataFrame(phish_features, columns= feature_names)

In [23]:
# legit_features = []
# feature_names = ['has_ip_address', 'length', 'checkAtSign', 'calc_domain_age', 'check_shortening','check_for_iframe', 'check_for_bar_manipulation','check_for_right_click_disabled', 'forwarding', 'label']
# for i in range(710, 2000):
#     url = legit["url"][i]
#     legit_features.append(featureExtraction(url,0))
#     print(i)
#     legit_final = pd.DataFrame(legit_features, columns= feature_names)

In [24]:
import csv
def concatenate_csv_files(input_files, output_file):
    with open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile)

        # Iterate over each input file
        for input_file in input_files:
            with open(input_file, 'r', newline='') as infile:
                reader = csv.reader(infile)
                # Write each row from the input file to the output file
                for row in reader:
                    writer.writerow(row)

In [25]:
input_files = ['legit_features_file.csv', 'phish_features_file.csv']
output_file = 'urldata.csv'
concatenate_csv_files(input_files, output_file)

In [28]:
urldata = pd.read_csv("urldata.csv")

In [29]:
urldata.head()

Unnamed: 0,has_ip_address,length,checkAtSign,calc_domain_age,check_shortening,check_for_iframe,check_for_bar_manipulation,check_for_right_click_disabled,forwarding,label
0,0,0,0,0,0,1,1.0,1,1,0
1,0,0,0,0,0,1,1.0,1,1,0
2,0,1,0,0,0,0,0.0,0,0,0
3,0,0,0,0,0,1,1.0,1,1,0
4,0,0,0,0,1,1,1.0,1,1,0


In [30]:
urldata.tail()

Unnamed: 0,has_ip_address,length,checkAtSign,calc_domain_age,check_shortening,check_for_iframe,check_for_bar_manipulation,check_for_right_click_disabled,forwarding,label
3997,0,1,0,0,0,0,0.0,0.0,0,1
3998,0,1,0,0,1,1,1.0,1.0,1,1
3999,0,1,0,0,1,0,0.0,0.0,0,1
4000,0,1,0,0,1,1,1.0,1.0,1,1
4001,0,1,0,0,1,1,1.0,1.0,1,1
