In [1]:
#Phishing Domain Detection



In [2]:
# 1) Problem Statement
# The goal of this project is to build and put into practice a machine learning
# model that accurately distinguishes legitimate domain URLs from phishing ones.
# Given a dataset of domain URLs, the model needs to learn the traits and
# patterns that differentiate authentic domains from phishing domains.

In [3]:
# 2) Data Collection
#  Dataset Source : https://data.mendeley.com/public-files/datasets/72ptz43s9v/files/26197eb8-15bc-4e06-a269-aa10ddc286f0/file_downloaded

In [4]:
# 2.1 Importing Data And Required Packages

In [5]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
warnings.filterwarnings('ignore')

In [22]:
# Import the CSV data as a pandas DataFrame.
# The file location is configurable so the notebook is not tied to one machine:
# set the PHISHING_DATASET_PATH environment variable to point at
# dataset_full.csv. When it is unset, the original local download path is used,
# so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "PHISHING_DATASET_PATH",
    "c:\\Users\\neo\\Downloads\\dataset_full.csv",
)
df = pd.read_csv(DATA_PATH)


In [23]:
# Preview the first five rows to sanity-check the load
df.head(n=5)


Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,...,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
0,3,0,0,1,0,0,0,0,0,0,...,1,2,0,892,0,0,0,0,0,1
1,5,0,1,3,0,3,0,2,0,0,...,1,2,1,9540,1,0,0,0,0,1
2,2,0,0,1,0,0,0,0,0,0,...,1,2,3,589,1,0,0,0,0,0
3,4,0,2,5,0,0,0,0,0,0,...,1,2,0,292,1,0,0,0,0,1
4,2,0,0,0,0,0,0,0,0,0,...,1,2,1,3597,0,1,0,0,0,0


In [25]:
# Shape of the dataset as a (rows, columns) tuple
df.shape

(88647, 112)

In [38]:
# 3) Data Checks to Perform 

#Check Missing values
#Check Duplicates
#Check data type
#Check the number of unique values of each column
#Check statistics of data set

In [27]:
# 3.1) Checking for missing values
# Counts NaN/None entries per column.
# NOTE(review): df.describe() shows a minimum of -1 for many columns; if -1 is
# used as a "not available" marker, this count will not reveal it — confirm
# against the dataset documentation.

df.isna().sum()

qty_dot_url             0
qty_hyphen_url          0
qty_underline_url       0
qty_slash_url           0
qty_questionmark_url    0
                       ..
qty_redirects           0
url_google_index        0
domain_google_index     0
url_shortened           0
phishing                0
Length: 112, dtype: int64

In [32]:
# 3.2) Check Duplicates
# Number of rows that are exact repeats of an earlier row (all columns equal).

df.duplicated().sum()

1438

In [34]:
# Duplicate rows were found above, so discard every repeat and keep only the
# first occurrence of each row. The original index labels are preserved.

df = df.drop_duplicates(keep="first", ignore_index=False)


In [35]:
# Re-count duplicated rows to confirm the frame is now duplicate-free
df.duplicated().sum()

0

In [36]:
# 3.3) Check data types
# info() prints the index range, the number of columns, a dtype summary and the
# memory footprint of the frame.

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 87209 entries, 0 to 88646
Columns: 112 entries, qty_dot_url to phishing
dtypes: float64(1), int64(111)
memory usage: 75.2 MB


In [37]:
# 3.4) Checking the number of unique values of each column
# A count of 1 means the column is constant across all rows.

df.nunique()

qty_dot_url             23
qty_hyphen_url          30
qty_underline_url       22
qty_slash_url           25
qty_questionmark_url     6
                        ..
qty_redirects           15
url_google_index         3
domain_google_index      3
url_shortened            2
phishing                 2
Length: 112, dtype: int64

In [39]:
# Walk over every column and print its distinct values, one column per
# paragraph, so that constant or odd-looking columns can be spotted by eye.
for column, series in df.items():
    print(f"Column: {column}", series.unique(), sep="\n", end="\n\n")


Column: qty_dot_url
[ 3  5  2  4  1  6  9  7  8 10 12 15 11 14 18 16 20 13 17 23 24 19 22]

Column: qty_hyphen_url
[ 0  1  2  4  3  6  5  8  7 12 10  9 19 11 14 17 15 18 13 20 26 25 35 23
 34 24 21 31 16 27]

Column: qty_underline_url
[ 0  1  2  3  5  4 10  7 21  8 16  6  9 17 12 14 19 11 13 15 18 20]

Column: qty_slash_url
[ 1  3  5  0  2  4  6 10  7 12  9 11  8 14 19 16 18 13 17 44 21 22 15 29
 20]

Column: qty_questionmark_url
[0 1 2 3 9 7]

Column: qty_equal_url
[ 0  3  1  4  2  7 12  6  5  8 10 11 17  9 14 13 16 15 23 20]

Column: qty_at_url
[ 0  1  3  4 10  2 43  5 14 12 11 15  7  6  8]

Column: qty_and_url
[ 0  2  3  1  6 11  5  4  7  8  9 15 10 12 13 14 19 17 26 16 22]

Column: qty_exclamation_url
[ 0  1  8  2  5  3  4  9  6 10]

Column: qty_space_url
[0 1 4 9 3 2 7 6]

Column: qty_tilde_url
[0 1 5 3 4 2]

Column: qty_comma_url
[ 0  1  4  3  2  5 11  7]

Column: qty_plus_url
[ 0  2  1  5  4  3  6  9 19  7]

Column: qty_asterisk_url
[ 0  2 14  1 22  8 12 20 10 21  3 19  4  5 60 

In [40]:
# Remove columns which have only one unique value (i.e. are constant); per the
# unique-value inspection above they carry no information for a model.

columns_to_remove = [
    'qty_slash_domain',
    'qty_questionmark_domain',
    'qty_equal_domain',
    'qty_and_domain',
    'qty_exclamation_domain',
    'qty_space_domain',
    'qty_tilde_domain',
    'qty_comma_domain',
    'qty_plus_domain',
    'qty_asterisk_domain',
    'qty_hashtag_domain',
    'qty_dollar_domain',
    'qty_percent_domain',
]

# `df` is reassigned by this cell, so executing it a second time would ask
# drop() for labels that are already gone and raise KeyError. Restricting the
# drop to the labels still present makes the cell safe to re-run.
columns_present = [col for col in columns_to_remove if col in df.columns]
df = df.drop(columns=columns_present)


In [42]:
# 3.5) Check statistics of the data set
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
# NOTE(review): many columns report min = -1 and even a median of -1, which
# looks like a sentinel for "not applicable" rather than a real count. If so,
# the mean/std of those columns are distorted by it — confirm before
# interpreting them.

df.describe()

Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,qty_tilde_url,qty_comma_url,qty_plus_url,qty_asterisk_url,qty_hashtag_url,qty_dollar_url,qty_percent_url,qty_tld_url,length_url,qty_dot_domain,qty_hyphen_domain,qty_underline_domain,qty_at_domain,qty_vowels_domain,domain_length,domain_in_ip,server_client_domain,qty_dot_directory,qty_hyphen_directory,qty_underline_directory,qty_slash_directory,qty_questionmark_directory,qty_equal_directory,qty_at_directory,qty_and_directory,qty_exclamation_directory,qty_space_directory,qty_tilde_directory,qty_comma_directory,qty_plus_directory,qty_asterisk_directory,qty_hashtag_directory,qty_dollar_directory,qty_percent_directory,directory_length,qty_dot_file,qty_hyphen_file,qty_underline_file,qty_slash_file,qty_questionmark_file,qty_equal_file,qty_at_file,qty_and_file,qty_exclamation_file,qty_space_file,qty_tilde_file,qty_comma_file,qty_plus_file,qty_asterisk_file,qty_hashtag_file,qty_dollar_file,qty_percent_file,file_length,qty_dot_params,qty_hyphen_params,qty_underline_params,qty_slash_params,qty_questionmark_params,qty_equal_params,qty_at_params,qty_and_params,qty_exclamation_params,qty_space_params,qty_tilde_params,qty_comma_params,qty_plus_params,qty_asterisk_params,qty_hashtag_params,qty_dollar_params,qty_percent_params,params_length,tld_present_params,qty_params,email_in_url,time_response,domain_spf,asn_ip,time_domain_activation,time_domain_expiration,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
count,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0,87209.0
mean,2.194189,0.331514,0.115252,1.292688,0.009082,0.206768,0.022383,0.141912,0.002993,0.001032,0.003279,0.001582,0.002798,0.00461,0.000516,0.001926,0.108441,1.048172,36.468897,1.870621,0.11524,0.000768,1.1e-05,5.449438,18.581832,0.002225,0.004564,-0.316917,-0.354172,-0.472463,0.72926,-0.531172,-0.524006,-0.527732,-0.526586,-0.529521,-0.53037,-0.52795,-0.530587,-0.530083,-0.527411,-0.531172,-0.529842,-0.477829,10.946955,-0.361912,-0.472394,-0.505716,-0.531172,-0.531172,-0.529498,-0.530874,-0.529727,-0.530117,-0.530737,-0.530874,-0.530737,-0.530393,-0.52998,-0.531172,-0.531172,-0.493367,2.789792,-0.809584,-0.877765,-0.860978,-0.887397,-0.907383,-0.725372,-0.897362,-0.787316,-0.915066,-0.91549,-0.915513,-0.914573,-0.913942,-0.915479,-0.915571,-0.915181,-0.861345,5.286301,-0.89078,-0.757961,0.018519,0.823458,-0.018221,31631.639533,3433.823355,356.827128,1.169742,2.764153,1.767696,6260.446273,0.51474,0.36604,0.00164,0.002259,0.005573,0.3497
std,1.239118,1.126436,0.661569,1.896041,0.111737,0.958605,0.281739,0.929997,0.088057,0.073249,0.078767,0.069545,0.111661,0.304127,0.062162,0.100549,1.736294,0.25612,46.249869,0.701647,0.423404,0.032647,0.003386,2.539517,6.595036,0.047113,0.067402,0.901277,1.10661,0.68448,2.217893,0.49903,0.517446,0.552878,0.539137,0.504432,0.504359,0.508529,0.501016,0.508136,0.580039,0.49903,0.508895,1.541626,24.427717,0.768477,0.804379,0.606203,0.49903,0.49903,0.503341,0.499669,0.502119,0.501364,0.501807,0.49999,0.500663,0.506309,0.546396,0.49903,0.49903,1.449809,13.669287,0.984864,0.636808,0.659537,0.548599,0.321302,1.115645,0.361115,1.020438,0.283678,0.278688,0.278324,0.287406,0.290572,0.278746,0.278032,0.281236,1.057316,35.022139,0.383243,0.944891,0.134818,1.462144,0.563934,45460.319969,3040.271865,601.471653,0.861314,1.320729,1.706354,11532.219887,0.499786,0.77095,0.056131,0.061005,0.074443,0.476878
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,17.0,1.0,0.0,0.0,0.0,4.0,14.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.250804,0.0,13335.0,-1.0,-1.0,1.0,2.0,1.0,293.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,22.0,2.0,0.0,0.0,0.0,5.0,18.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.475466,0.0,20454.0,3144.0,173.0,1.0,2.0,1.0,2537.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,39.0,2.0,0.0,0.0,0.0,7.0,22.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.886569,0.0,35916.0,6455.0,356.0,1.0,4.0,2.0,10987.0,1.0,1.0,0.0,0.0,0.0,1.0
max,24.0,35.0,21.0,44.0,9.0,23.0,43.0,26.0,10.0,9.0,5.0,11.0,19.0,60.0,13.0,10.0,174.0,12.0,4165.0,21.0,11.0,3.0,1.0,61.0,231.0,1.0,1.0,19.0,23.0,17.0,22.0,0.0,5.0,43.0,26.0,9.0,9.0,5.0,5.0,19.0,60.0,0.0,10.0,174.0,1286.0,12.0,21.0,17.0,0.0,0.0,3.0,2.0,3.0,4.0,9.0,4.0,5.0,19.0,60.0,0.0,0.0,174.0,1232.0,23.0,35.0,21.0,43.0,9.0,23.0,10.0,22.0,10.0,4.0,1.0,11.0,6.0,4.0,0.0,4.0,65.0,4094.0,1.0,23.0,1.0,38.402411,1.0,395754.0,17775.0,22574.0,24.0,20.0,20.0,604800.0,1.0,17.0,1.0,1.0,1.0,1.0


In [43]:
# Shape after removing duplicate rows and the constant columns
df.shape

(87209, 99)