# Importing Libraries

In [None]:
pip install python-whois

Collecting python-whois
  Downloading python_whois-0.9.4-py3-none-any.whl (103 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/103.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m102.4/103.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-whois
Successfully installed python-whois-0.9.4


In [None]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from urllib.parse import urlparse
import requests
import whois
from datetime import date, datetime
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor
import ssl
import socket

# First Dataset

Source: https://www.kaggle.com/datasets/sid321axn/malicious-urls-dataset

In [None]:
Kaggle_Dataset = pd.read_csv('/content/drive/MyDrive/Phishing Detection/Dataset/Unextracted Dataset/Kaggle Dataset 1.csv')

In [None]:
Kaggle_Dataset.shape

(651191, 2)

In [None]:
Kaggle_Dataset.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


This dataset contains 4 types of URL:
*   Benign - a safe URL
*   Defacement - the URL of a website whose content has been altered by an attacker
*   Malware - a URL that is meant to attack the computer's security
*   Phishing - a URL that is meant to steal personal information



In [None]:
Kaggle_Dataset['type'].value_counts()

type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64

# Second Dataset

Source: https://dataforseo.com/free-seo-stats/top-1000-websites

In [None]:
Ranked_Domain = pd.read_csv('/content/drive/MyDrive/Phishing Detection/Dataset/Unextracted Dataset/ranked_domains.csv')

In [None]:
Ranked_Domain.shape

(1000, 4)

This dataset contains the 1000 most-visited websites. Every URL in this dataset is benign.

In [None]:
Ranked_Domain.head()

Unnamed: 0,Rank,Domain,Keywords in SERPs,Estimated organic traffic
0,1,youtube.com,408855671,22665060000.0
1,2,facebook.com,282448378,11045170000.0
2,3,wikipedia.org,254297930,38834910000.0
3,4,reddit.com,189223735,3402545000.0
4,5,instagram.com,172748509,9263599000.0


Renaming and dropping unnecessary columns

In [None]:
# Keep only the domain column and rename it to `url`, matching the Kaggle frame.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run after the columns have already been dropped and renamed.
Ranked_Domain = (
    Ranked_Domain
    .drop(columns=['Keywords in SERPs', 'Estimated organic traffic', 'Rank'], errors='ignore')
    .rename(columns={'Domain': 'url'})
)

Adding type to the dataset

In [None]:
Ranked_Domain['type'] = 'benign'

In [None]:
Ranked_Domain.head()

Unnamed: 0,url,type
0,youtube.com,benign
1,facebook.com,benign
2,wikipedia.org,benign
3,reddit.com,benign
4,instagram.com,benign


Reformatting the URLs so they are consistent across the entire dataset, since benign URLs (especially the most-visited ones) have 'https://www.' at the beginning.

In [None]:
def reformatURL(url):
  """Return `url` prefixed with 'https://www.' so a bare domain looks like a full URL.

  A value that already starts with 'http://' or 'https://' is returned unchanged,
  so applying this function twice (for example by re-running the cell) does not
  stack a second prefix onto the URL.
  """
  if url.startswith(('http://', 'https://')):
    return url
  return 'https://www.' + url

Ranked_Domain['url'] = Ranked_Domain['url'].apply(lambda i : reformatURL(i))

In [None]:
Ranked_Domain.head()

Unnamed: 0,url,type
0,https://www.youtube.com,benign
1,https://www.facebook.com,benign
2,https://www.wikipedia.org,benign
3,https://www.reddit.com,benign
4,https://www.instagram.com,benign


# Feature Extraction

Availability of IP address in url

In [None]:
# Raw strings keep the regex escapes (\d, \.) away from Python's own string-escape handling.
# One IPv4 octet: 250-255, 200-249, or one to three digits with an optional leading 0/1,
# i.e. every value from 0 to 255, zero-padded or not.
_ipv4_octet = r"(25[0-5]|2[0-4]\d|[01]?\d{1,2})"
ipv4_regex = r"\.".join([_ipv4_octet] * 4)
# Eight colon-separated groups of 1-4 hex digits (full, uncompressed IPv6 notation only).
ipv6_regex = r"(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}"
# Dotted quad whose octets are written in hexadecimal, e.g. 0xC0.0xA8.0x01.0x01.
ipv4_in_hexadecimal_regex = r"(0x[a-fA-F0-9]{1,2})\.(0x[a-fA-F0-9]{1,2})\.(0x[a-fA-F0-9]{1,2})\.(0x[a-fA-F0-9]{1,2})"

# Compiled once: the alternation is the same for every URL.
_ip_address_pattern = re.compile(
    ipv4_regex + "|" + ipv6_regex + "|" + ipv4_in_hexadecimal_regex)

def have_ip_address(url):
  """Return 1 if `url` contains an IPv4, full-form IPv6 or hex-octet IPv4 address, else 0.

  The search is not anchored, so an address-shaped run of characters anywhere
  in the string (host, path or query) counts as a match.
  """
  return 1 if _ip_address_pattern.search(url) else 0

Kaggle_Dataset['have_ip_address'] = Kaggle_Dataset['url'].apply(lambda i : have_ip_address(i))
Ranked_Domain['have_ip_address'] = Ranked_Domain['url'].apply(lambda i : have_ip_address(i))

Url length

In [None]:
def url_length(url):
  """Return the number of characters in `url` after coercing it to `str`."""
  as_text = str(url)
  return len(as_text)

Kaggle_Dataset['url_length'] = Kaggle_Dataset['url'].apply(lambda i: url_length(i))
Ranked_Domain['url_length'] = Ranked_Domain['url'].apply(lambda i: url_length(i))

Shortening Services

The list of shortening services is taken from github

Source: https://github.com/Spamfighter666/Short-URL-Providers-List/blob/master/Short-URL-Providers-Annotated.csv


In [None]:
shortening_dataset = pd.read_csv("/content/drive/MyDrive/Phishing Detection/Dataset/Shortening Service Dataset.csv", encoding='latin1')

Reformatting the dataset into a regex pattern

In [None]:
# Build one alternation regex (`shortening_url`) from the provider list in the 'FQDNS' column.
shortening_url = shortening_dataset['FQDNS'].to_list()
# Cut the first 7 characters and the last character off every entry.
# NOTE(review): presumably strips a fixed-width prefix and a trailing delimiter; confirm against the CSV.
# NOTE(review): assumes every entry is a string; a missing value would raise here.
shortening_url = [url[7:-1] for url in shortening_url]
# Entries that do not start with '/' are kept unchanged ...
# NOTE(review): url[0] raises IndexError if an entry is empty after the slicing above.
first_part_url = [url for url in shortening_url if url[0]!='/']
# ... entries that start with '/' have every '/' removed.
second_part_url = [url.replace('/','') for url in shortening_url if url[0] == '/']
shortening_url = first_part_url + second_part_url
# Join all providers into a single "a|b|c" alternation string.
shortening_url = "|".join(url for url in shortening_url)
# Remove three very short providers from the alternation.
# NOTE(review): presumably because an unanchored "t.co" would also match inside ordinary ".com" hosts; confirm intent.
# NOTE(review): these are substring replacements on the joined string, so any longer entry that
# ends in the same text is cut as well and fused with its neighbour.
shortening_url = shortening_url.replace("t.co|","")
shortening_url = shortening_url.replace("a.co|","")
shortening_url = shortening_url.replace("x.co|","")
# NOTE(review): the replacement text has no trailing "|", so "apple.com" is joined directly to the next entry; verify this is intended.
shortening_url = shortening_url.replace("apple.co|","apple.com")
# Remove these two literal fragments wherever they occur in the joined string.
shortening_url = shortening_url.replace("//?gtnjs=","")
shortening_url = shortening_url.replace("/cgi-sys/suspendedpage.cg","")
# Escape dots so they match literally; other regex metacharacters are left unescaped.
shortening_url = shortening_url.replace(".","\.")

In [None]:
def shortening_service(url):
  """Return 1 if the `shortening_url` pattern is found anywhere in `url`, else 0."""
  found = re.search(shortening_url, url)
  return int(found is not None)
Kaggle_Dataset['shortening_service'] = Kaggle_Dataset['url'].apply(lambda i: shortening_service(i))
Ranked_Domain['shortening_service'] = Ranked_Domain['url'].apply(lambda i: shortening_service(i))

Count @ symbol

In [None]:
def countattratesymbol_url(url):
  """Return how many '@' characters `url` contains."""
  # Splitting on a single character yields one more piece than there are separators.
  pieces = url.split('@')
  return len(pieces) - 1

Kaggle_Dataset['count@'] = Kaggle_Dataset['url'].apply(lambda i : countattratesymbol_url(i))
Ranked_Domain['count@'] = Ranked_Domain['url'].apply(lambda i : countattratesymbol_url(i))

Finding http

In [None]:
def count_http_url(url):
  """Return 1 when the parsed scheme of `url` is exactly 'http', otherwise 0."""
  scheme = urlparse(url).scheme
  return 1 if scheme == 'http' else 0
Kaggle_Dataset['count_http'] = Kaggle_Dataset['url'].apply(lambda i : count_http_url(i))
Ranked_Domain['count_http'] = Ranked_Domain['url'].apply(lambda i : count_http_url(i))


Finding https

In [None]:
def count_https_url(url):
  """Return 1 when the parsed scheme of `url` is exactly 'https', otherwise 0."""
  scheme = urlparse(url).scheme
  return 1 if scheme == 'https' else 0
Kaggle_Dataset['count_https'] = Kaggle_Dataset['url'].apply(lambda i : count_https_url(i))
Ranked_Domain['count_https'] = Ranked_Domain['url'].apply(lambda i : count_https_url(i))

Counting dot

In [None]:
def count_dot_url(url):
  """Return how many '.' characters `url` contains."""
  return sum(1 for ch in url if ch == '.')

Kaggle_Dataset['countdot'] = Kaggle_Dataset['url'].apply(lambda i : count_dot_url(i))
Ranked_Domain['countdot'] = Ranked_Domain['url'].apply(lambda i : count_dot_url(i))

Counting hyphen

In [None]:
def count_hyphen_url(url):
  """Return how many '-' characters `url` contains."""
  hyphen = '-'
  occurrences = url.count(hyphen)
  return occurrences

Kaggle_Dataset['count-'] = Kaggle_Dataset['url'].apply(lambda i : count_hyphen_url(i))
Ranked_Domain['count-'] = Ranked_Domain['url'].apply(lambda i : count_hyphen_url(i))

Counting underline

In [None]:
def count_underline_url(url):
  """Return how many '_' characters `url` contains."""
  return sum(1 for ch in url if ch == '_')

Kaggle_Dataset['count_'] = Kaggle_Dataset['url'].apply(lambda i : count_underline_url(i))
Ranked_Domain['count_'] = Ranked_Domain['url'].apply(lambda i : count_underline_url(i))

Counting questionmark

In [None]:
def count_question_url(url):
  """Return how many '?' characters `url` contains."""
  question_mark = '?'
  occurrences = url.count(question_mark)
  return occurrences

Kaggle_Dataset['count?'] = Kaggle_Dataset['url'].apply(lambda i : count_question_url(i))
Ranked_Domain['count?'] = Ranked_Domain['url'].apply(lambda i : count_question_url(i))

Counting path

In [None]:
def count_slash_url(url):
  """Return the number of '/' characters in the path component of `url`.

  For a URL without a scheme, urlparse places the host in the path too,
  so the slashes that follow the host are the ones counted.
  """
  parsed = urlparse(url)
  path_text = str(parsed.path)
  return path_text.count('/')

Kaggle_Dataset['count_path'] = Kaggle_Dataset['url'].apply(lambda i : count_slash_url(i))
Ranked_Domain['count_path'] = Ranked_Domain['url'].apply(lambda i : count_slash_url(i))

Counting equal symbol

In [None]:
def count_equal_url(url):
  """Return how many '=' characters `url` contains."""
  return sum(1 for ch in url if ch == '=')

Kaggle_Dataset['count='] = Kaggle_Dataset['url'].apply(lambda i: count_equal_url(i))
Ranked_Domain['count='] = Ranked_Domain['url'].apply(lambda i: count_equal_url(i))

Counting & symbol

In [None]:
def count_amp_url(url):
  """Return how many '&' characters `url` contains."""
  ampersand = '&'
  occurrences = url.count(ampersand)
  return occurrences

Kaggle_Dataset['count&'] = Kaggle_Dataset['url'].apply(lambda i: count_amp_url(i))
Ranked_Domain['count&'] = Ranked_Domain['url'].apply(lambda i: count_amp_url(i))

Counting exclamation symbol

In [None]:
def count_exclam_url(url):
  """Return how many '!' characters `url` contains."""
  return sum(1 for ch in url if ch == '!')

Kaggle_Dataset['count!'] = Kaggle_Dataset['url'].apply(lambda i: count_exclam_url(i))
Ranked_Domain['count!'] = Ranked_Domain['url'].apply(lambda i: count_exclam_url(i))

Counting percent symbol

In [None]:
def count_percent_symbol(url):
  """Return how many '%' characters `url` contains."""
  percent = '%'
  occurrences = url.count(percent)
  return occurrences

Kaggle_Dataset['count%'] = Kaggle_Dataset['url'].apply(lambda i: count_percent_symbol(i))
Ranked_Domain['count%'] = Ranked_Domain['url'].apply(lambda i: count_percent_symbol(i))

Count whitespace

In [None]:
def count_space(url):
  """Return how many space characters (' ') `url` contains."""
  return sum(1 for ch in url if ch == " ")

Kaggle_Dataset['countspace'] = Kaggle_Dataset['url'].apply(lambda i: count_space(i))
Ranked_Domain['countspace'] = Ranked_Domain['url'].apply(lambda i: count_space(i))

Count comma

In [None]:
def count_comma(url):
  """Return how many ',' characters `url` contains."""
  comma = ","
  occurrences = url.count(comma)
  return occurrences

Kaggle_Dataset['countcomma'] = Kaggle_Dataset['url'].apply(lambda i: count_comma(i))
Ranked_Domain['countcomma'] = Ranked_Domain['url'].apply(lambda i: count_comma(i))

Count tilde

In [None]:
def count_tilde(url):
  """Return how many '~' characters `url` contains."""
  return sum(1 for ch in url if ch == "~")

Kaggle_Dataset['counttilde'] = Kaggle_Dataset['url'].apply(lambda i: count_tilde(i))
Ranked_Domain['counttilde'] = Ranked_Domain['url'].apply(lambda i: count_tilde(i))

Count plus

In [None]:
def count_plus(url):
  """Return how many '+' characters `url` contains."""
  plus_sign = "+"
  occurrences = url.count(plus_sign)
  return occurrences

Kaggle_Dataset['countplus'] = Kaggle_Dataset['url'].apply(lambda i: count_plus(i))
Ranked_Domain['countplus'] = Ranked_Domain['url'].apply(lambda i: count_plus(i))

Count asterisk

In [None]:
def count_asterisk(url):
  """Return how many '*' characters `url` contains."""
  return sum(1 for ch in url if ch == "*")

Kaggle_Dataset['countasterisk'] = Kaggle_Dataset['url'].apply(lambda i: count_asterisk(i))
Ranked_Domain['countasterisk'] = Ranked_Domain['url'].apply(lambda i: count_asterisk(i))

Count hashtag

In [None]:
def count_hashtag(url):
  """Return how many '#' characters `url` contains."""
  hash_sign = "#"
  occurrences = url.count(hash_sign)
  return occurrences

Kaggle_Dataset['counthashtag'] = Kaggle_Dataset['url'].apply(lambda i: count_hashtag(i))
Ranked_Domain['counthashtag'] = Ranked_Domain['url'].apply(lambda i: count_hashtag(i))

Count dollar

In [None]:
def count_dollar(url):
  """Return how many '$' characters `url` contains."""
  return sum(1 for ch in url if ch == "$")

Kaggle_Dataset['countdollar'] = Kaggle_Dataset['url'].apply(lambda i: count_dollar(i))
Ranked_Domain['countdollar'] = Ranked_Domain['url'].apply(lambda i: count_dollar(i))

Find email

In [None]:
def find_email(url):
  """Return 1 if `url` contains e-mail-shaped text with a '.com' part, else 0.

  The pattern is: non-space characters, '@', non-space characters, then the
  literal '.com'. The search is not anchored, and only '.com' is recognised.
  """
  # Raw string: \S and \. are regex escapes, not Python string escapes, so the
  # literal no longer triggers an invalid-escape-sequence warning.
  if re.search(r'\S+@\S+\.com', url):
    return 1
  return 0

Kaggle_Dataset['emailexist'] = Kaggle_Dataset['url'].apply(lambda i : find_email(i))
Ranked_Domain['emailexist'] = Ranked_Domain['url'].apply(lambda i : find_email(i))

Current columns

In [None]:
Kaggle_Dataset.head()

Unnamed: 0,url,type,have_ip_address,url_length,shortening_service,count@,count_http,count_https,countdot,count-,...,count!,count%,countspace,countcomma,counttilde,countplus,countasterisk,counthashtag,countdollar,emailexist
0,br-icloud.com.br,phishing,0,16,0,0,0,0,2,1,...,0,0,0,0,0,0,0,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,0,88,0,0,1,0,3,1,...,0,0,0,0,0,0,0,0,0,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,0,235,0,0,1,0,2,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
Ranked_Domain.head()

Unnamed: 0,url,type,have_ip_address,url_length,shortening_service,count@,count_http,count_https,countdot,count-,...,count!,count%,countspace,countcomma,counttilde,countplus,countasterisk,counthashtag,countdollar,emailexist
0,https://www.youtube.com,benign,0,23,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
1,https://www.facebook.com,benign,0,24,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
2,https://www.wikipedia.org,benign,0,25,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
3,https://www.reddit.com,benign,0,22,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
4,https://www.instagram.com,benign,0,25,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0


Extracting age of domain and registration length of domain

In [None]:
@lru_cache(maxsize=None)
def get_domain_info(url):
    """Return the WHOIS record for `url`, memoised per distinct `url`.

    The cache is unbounded (maxsize=None), so age_of_domain and
    registration_length can share a single lookup for the same URL.
    lru_cache does not store a call that raises, so a failed lookup is
    attempted again the next time the same URL is requested.
    """
    return whois.whois(url)

def _first_naive_date(value):
    """Return the first entry when `value` is a list, with any tzinfo removed."""
    picked = value[0] if isinstance(value, list) else value
    # A naive and a timezone-aware datetime cannot be subtracted from each other;
    # dropping tzinfo keeps the arithmetic in the callers from raising TypeError.
    if getattr(picked, 'tzinfo', None) is not None:
        picked = picked.replace(tzinfo=None)
    return picked

def age_of_domain(url):
    """Return the domain's age in 30-day months, or 0 when it cannot be determined.

    The age is measured from the WHOIS creation date (the first one when several
    are listed) to midnight today, divided by 30 and truncated to an int.
    Any failure (lookup error, missing or unusable date) yields 0.
    """
    try:
        res = get_domain_info(url)
        current_date = datetime.combine(date.today(), datetime.min.time())
        creation_date = _first_naive_date(res.creation_date)
        domain_age = (current_date - creation_date).days/30
        return int(domain_age)
    except Exception:
        # Best effort: one bad record must not stop a batch, but Ctrl-C still can.
        return 0

def registration_length(url):
    """Return the span from creation to expiration in 30-day months, or 0 on failure.

    Uses the first creation and expiration dates of the WHOIS record and floor
    division by 30. Any failure (lookup error, missing or unusable date) yields 0.
    """
    try:
        res = get_domain_info(url)
        creation_date = _first_naive_date(res.creation_date)
        expiration_date = _first_naive_date(res.expiration_date)
        registration_length = (expiration_date - creation_date).days//30
        return int(registration_length)
    except Exception:
        # Best effort: one bad record must not stop a batch, but Ctrl-C still can.
        return 0
def batch_age_of_domain(urls):
    """Apply age_of_domain to every item of `urls` on a thread pool.

    Returns the results as a list in the same order as `urls`.
    """
    with ThreadPoolExecutor() as pool:
        return list(pool.map(age_of_domain, urls))
def batch_registration_length(urls):
    """Apply registration_length to every item of `urls` on a thread pool.

    Returns the results as a list in the same order as `urls`.
    """
    with ThreadPoolExecutor() as pool:
        return list(pool.map(registration_length, urls))

Extracting Domain Age and Registration Length

In [None]:
# Add the WHOIS features to the Kaggle frame in chunks of 1000 rows, appending each
# finished chunk to the CSV so completed work is on disk before the next chunk starts.
# NOTE(review): mode='a' with header=False means re-running this cell appends duplicate
# rows, and the file has no header row (the index is written as the first column).
# NOTE(review): `batch` stays defined at module level after the loop ends.
for i in range(0,len(Kaggle_Dataset),1000):
    batch = Kaggle_Dataset.iloc[i:i+1000].copy()
    batch['domain_age']= batch_age_of_domain(batch['url'])
    batch['regis_length'] = batch_registration_length(batch['url'])
    batch.to_csv("Kaggle Dataset 1 (Feature Extracted).csv", mode='a', header=False)

In [None]:
# Add the WHOIS features to the ranked-domain frame, 1000 URLs at a time.
# Each chunk is sliced from Ranked_Domain itself rather than taken from a `batch`
# variable left over from another cell, and the results are stored on Ranked_Domain
# so the following cells see the new columns.
domain_ages = []
regis_lengths = []
for i in range(0, len(Ranked_Domain), 1000):
    batch_urls = Ranked_Domain['url'].iloc[i:i+1000]
    domain_ages.extend(batch_age_of_domain(batch_urls))
    regis_lengths.extend(batch_registration_length(batch_urls))
Ranked_Domain['domain_age'] = domain_ages
Ranked_Domain['regis_length'] = regis_lengths
# Written once after the loop so the file always holds every row, not only the last chunk.
Ranked_Domain.to_csv("Ranked_Domain (Feature Extracted).csv", index=False)

In [None]:
# Load the WHOIS-enriched Kaggle data from Drive.
# NOTE(review): the extraction loop writes its CSV to the working directory without a
# header, while this file is read with a header row; confirm how this file was produced.
Extracted_Kaggle = pd.read_csv('/content/drive/MyDrive/Phishing Detection/Dataset/Extracted Dataset/Kaggle Dataset 1 (Feature Extracted).csv')
# Relabel the columns: take the current names, remove the 'Unnamed: 0' entry and put
# 'Index' at the front of the list.
# NOTE(review): this is a plain rename only if 'Unnamed: 0' is already the first column;
# otherwise every name in front of it moves one column to the right. Confirm the file layout.
columns = list(Extracted_Kaggle.columns)
columns.remove('Unnamed: 0')
columns.insert(0,'Index')
Extracted_Kaggle.columns = columns
# Restore the plain 'type' / 'url' names, then discard the 'Index' column.
Extracted_Kaggle.rename(columns={'type_Data':'type','url_Data':'url'},inplace=True)
Extracted_Kaggle.drop(columns=['Index'],inplace=True)

Some URLs have been removed from this dataset because they were duplicates.

In [None]:
Extracted_Kaggle.shape

(621037, 27)

Current columns

In [None]:
Extracted_Kaggle.head()

Unnamed: 0,url,type,have_ip_address,url_length,shortening_service,count@,count_http,count_https,countdot,count-,...,countspace,countcomma,counttilde,countplus,countasterisk,counthashtag,countdollar,emailexist,domain_age,regis_length
0,br-icloud.com.br,phishing,0,16,0,0,0,0,2,1,...,0,0,0,0,0,0,0,0,2.0,12.0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,293.0,316.0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,294.0,304.0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,0,88,0,0,1,0,3,1,...,0,0,0,0,0,0,0,0,0.0,0.0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,0,235,0,0,1,0,2,1,...,0,0,0,0,0,0,0,0,0.0,0.0


In [None]:
Ranked_Domain.head()

Unnamed: 0,url,type,have_ip_address,url_length,shortening_service,count@,count_http,count_https,countdot,count-,...,countspace,countcomma,counttilde,countplus,countasterisk,counthashtag,countdollar,emailexist,domain_age,regis_length
0,https://www.youtube.com,benign,0,23,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,234,243
1,https://www.facebook.com,benign,0,24,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,330,438
2,https://www.wikipedia.org,benign,0,25,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,284,292
3,https://www.reddit.com,benign,0,22,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,231,255
4,https://www.instagram.com,benign,0,25,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,242,340


Before extracting the SSL certificate, some adjustments are made.

In [None]:
Extracted_Kaggle['type'].value_counts()

type
benign        428103
defacement     96457
phishing       63957
malware        32520
Name: count, dtype: int64

Dropping defacement and malware category since the primary objective is to detect phishing url

In [None]:
Extracted_Kaggle = Extracted_Kaggle[Extracted_Kaggle['type'].isin(['benign','phishing'])]

In [None]:
Extracted_Kaggle['type'].value_counts()

type
benign      428103
phishing     63957
Name: count, dtype: int64

Dropping every URL that has a domain age of 0 or a registration length of 0, since this basically means the URL is no longer accessible or there is a typo in the dataset.

In [None]:
Extracted_Kaggle = Extracted_Kaggle[Extracted_Kaggle['domain_age']>0]
Extracted_Kaggle = Extracted_Kaggle[Extracted_Kaggle['regis_length']>0]

In [None]:
Extracted_Kaggle['type'].value_counts()

type
benign      360189
phishing     35174
Name: count, dtype: int64

Since the dataset is imbalanced, the undersampling method is used.

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop rows of the larger class so that both classes end up the same size;
# the fixed random_state makes the selection repeatable.
RUS = RandomUnderSampler(random_state=24)
# Features are every column except the label, so the raw 'url' string stays in x.
x = Extracted_Kaggle.drop(columns=['type'])
y = Extracted_Kaggle['type']

x_resampled,y_resampled = RUS.fit_resample(x,y)

# Put features and label back together; 'type' becomes the last column.
Extracted_Kaggle = pd.concat([x_resampled, y_resampled], axis=1, join='inner')

In [None]:
Extracted_Kaggle['type'].value_counts()

type
benign      35174
phishing    35174
Name: count, dtype: int64

In [None]:
def verify_ssl_certificate(url, timeout=5):
    """Return 1 if the URL's host completes a verified TLS handshake on port 443, else 0.

    The host is extracted whether or not `url` carries a scheme, and any userinfo
    or port in the authority part is ignored. Verification uses the default SSL
    context, so both the certificate chain and the host name are checked.
    Any failure (unparsable URL, DNS error, timeout, refused connection,
    certificate error) yields 0.

    Args:
        url: Full URL or bare "host/path" string.
        timeout: Socket timeout in seconds for connecting and for the handshake.
    """
    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # Without "//" urlparse treats the host as part of the path;
            # re-parse so scheme-less dataset entries still yield a host.
            parsed = urlparse('//' + url)
        hostname = parsed.hostname
        if not hostname:
            return 0

        context = ssl.create_default_context()
        # create_connection resolves the name itself and tries each returned
        # address (IPv4 or IPv6) until one connects.
        with socket.create_connection((hostname, 443), timeout=timeout) as sock:
            # wrap_socket performs the handshake and raises if verification fails.
            with context.wrap_socket(sock, server_hostname=hostname):
                return 1
    except Exception:
        return 0

In [None]:
# Check certificates 100 URLs at a time and append each finished chunk to the CSV
# (no header row, no index column).
for i in range(0, len(Extracted_Kaggle), 100):
    batch = Extracted_Kaggle.iloc[i:i+100].copy()
    batch['SSL_certificate'] = batch['url'].map(verify_ssl_certificate)
    batch.to_csv('SSL_Kaggle.csv', mode='a', header=False, index=False)

In [None]:
# Check certificates 100 URLs at a time and append each finished chunk to the CSV
# (no header row, no index column).
for i in range(0, len(Ranked_Domain), 100):
    batch = Ranked_Domain.iloc[i:i+100].copy()
    batch['SSL_certificate'] = batch['url'].map(verify_ssl_certificate)
    batch.to_csv('SSL_topweb.csv', mode='a', header=False, index=False)

In [None]:
# Column names for the header-less SSL CSV: the current frame's columns plus the new flag.
col = list(Extracted_Kaggle.columns)
col.append('SSL_certificate')
# NOTE(review): this reads from the Drive folder, whereas the loop above writes
# 'SSL_Kaggle.csv' to the working directory; confirm the file was copied there.
Extracted_Kaggle = pd.read_csv('/content/drive/MyDrive/Phishing Detection/Dataset/Extracted Dataset/SSL_Kaggle.csv',header=None)
Extracted_Kaggle.columns = col

Current columns

In [None]:
Extracted_Kaggle.head()

Unnamed: 0,url,have_ip_address,url_length,shortening_service,count@,count_http,count_https,countdot,count-,count_,...,counttilde,countplus,countasterisk,counthashtag,countdollar,emailexist,domain_age,regis_length,type,SSL_certificate
0,lima.info/history.htm,0,21,0,0,0,0,2,0,0,...,0,0,0,0,0,0,248.0,255.0,benign,0
1,music.yahoo.com/roy-c-hammond/,0,30,0,0,0,0,2,2,0,...,0,0,0,0,0,0,355.0,365.0,benign,1
2,islanders.nhl.com/club/player.htm?id=8474690,0,44,0,0,0,0,3,0,0,...,0,0,0,0,0,0,358.0,365.0,benign,1
3,http://jalopnik.com/the-hermit-kingdom-an-insi...,0,85,0,0,1,0,1,10,0,...,0,0,0,0,0,0,238.0,243.0,benign,1
4,http://mic.com/articles/115266/it-s-official-h...,0,85,0,0,1,0,1,8,0,...,0,0,0,0,0,0,371.0,450.0,benign,1


In [None]:
# NOTE(review): this path points at 'NLP Project/Dataset' and 'SSL_Topweb.csv', while the other
# reads use 'Phishing Detection/Dataset' and the writing cell names the file 'SSL_topweb.csv'.
# Confirm the location and the capitalisation of the file name.
Ranked_Domain = pd.read_csv('/content/drive/MyDrive/NLP Project/Dataset/SSL_Topweb.csv',header=None)
# Reuses `col`, i.e. the Kaggle column order with 'type' second to last.
# NOTE(review): assumes the columns in this CSV are stored in that same order; confirm.
Ranked_Domain.columns = col

In [None]:
Ranked_Domain.head()

Unnamed: 0,url,have_ip_address,url_length,shortening_service,count@,count_http,count_https,countdot,count-,count_,...,counttilde,countplus,countasterisk,counthashtag,countdollar,emailexist,domain_age,regis_length,type,SSL_certificate
0,https://www.youtube.com,0,23,0,0,0,1,2,0,0,...,0,0,0,0,0,0,234,243,benign,1
1,https://www.facebook.com,0,24,0,0,0,1,2,0,0,...,0,0,0,0,0,0,330,438,benign,1
2,https://www.wikipedia.org,0,25,0,0,0,1,2,0,0,...,0,0,0,0,0,0,284,292,benign,1
3,https://www.reddit.com,0,22,0,0,0,1,3,0,0,...,0,0,0,0,0,0,231,255,benign,1
4,https://www.instagram.com,0,25,0,0,0,1,2,0,0,...,0,0,0,0,0,0,242,340,benign,1


In [None]:
data = pd.concat([Extracted_Kaggle, Ranked_Domain], ignore_index=True)
data['type'].value_counts()

type
benign      36174
phishing    35174
Name: count, dtype: int64

I am dropping every benign url that has no SSL certificate. Ideally, every benign url SHOULD have a valid SSL certificate. Hence, it is necessary to drop said benign url that has no SSL certificate to further improve the accuracy of the model.

In [None]:
# Split by label, keep only the benign rows whose certificate check succeeded,
# then recombine with every phishing row.
phishing = data.loc[data['type'] == 'phishing'].copy()
benign = data.loc[(data['type'] == 'benign') & (data['SSL_certificate'] == 1)].copy()

data = pd.concat([benign, phishing], ignore_index=True)
data['type'].value_counts()

type
phishing    35174
benign      27865
Name: count, dtype: int64

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop rows of the larger class so that both classes end up the same size;
# the fixed random_state makes the selection repeatable.
RUS = RandomUnderSampler(random_state=24)
# Features are every column except the label, so the raw 'url' string stays in x.
x = data.drop(columns=['type'])
y = data['type']

x_resampled,y_resampled = RUS.fit_resample(x,y)

# Put features and label back together; 'type' becomes the last column.
data = pd.concat([x_resampled, y_resampled], axis=1, join='inner')

In [None]:
data['type'].value_counts()

type
benign      27865
phishing    27865
Name: count, dtype: int64

In [None]:
# NOTE(review): this reloads the same file that the export cell at the end of the notebook
# overwrites, replacing the `data` frame built in the cells above. After one full run the
# file already holds day values, so running the notebook again multiplies them by 30 a
# second time. Confirm which file should be the input here.
data = pd.read_csv('/content/drive/MyDrive/Phishing Detection/Dataset/Extracted Dataset/Preprocessed Dataset.csv')

In [None]:
data.head()

Unnamed: 0,url,have_ip_address,url_length,shortening_service,count@,count_http,count_https,countdot,count-,count_,...,counttilde,countplus,countasterisk,counthashtag,countdollar,emailexist,domain_age,regis_length,SSL_certificate,type
0,music.yahoo.com/roy-c-hammond/,0,30,0,0,0,0,2,2,0,...,0,0,0,0,0,0,355.0,365.0,1,benign
1,islanders.nhl.com/club/player.htm?id=8474690,0,44,0,0,0,0,3,0,0,...,0,0,0,0,0,0,358.0,365.0,1,benign
2,http://jalopnik.com/the-hermit-kingdom-an-insi...,0,85,0,0,1,0,1,10,0,...,0,0,0,0,0,0,238.0,243.0,1,benign
3,http://mic.com/articles/115266/it-s-official-h...,0,85,0,0,1,0,1,8,0,...,0,0,0,0,0,0,371.0,450.0,1,benign
4,music.yahoo.com/ajda-pekkan/albums/cool-kadin-...,0,59,0,0,0,0,2,5,0,...,0,0,0,0,0,0,355.0,365.0,1,benign


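Converting the month values back to days, since days are a more natural unit for these features. Note that the months were truncated to whole 30-day blocks during extraction, so the result is an approximation (always a multiple of 30).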

In [None]:
def actualValue(month):
  """Convert a month count to days using a fixed 30-day month."""
  days_per_month = 30
  return month * days_per_month

data['domain_age'] = data['domain_age'].apply(lambda i: actualValue(i))
data['regis_length'] = data['regis_length'].apply(lambda i: actualValue(i))

Exporting Dataset

In [None]:
data.to_csv('/content/drive/MyDrive/Phishing Detection/Dataset/Extracted Dataset/Preprocessed Dataset.csv',index=False)