In [None]:
'''
Key Features of the Dataset

url_length: The length of the URL.

n_slash: The count of ‘/’ characters in the URL.

n_questionmark: The count of ‘?’ characters in the URL.

n_equal: The count of ‘=’ characters in the URL.

n_at: The count of ‘@’ characters in the URL.

n_and: The count of ‘&’ characters in the URL.

n_exclamation: The count of ‘!’ characters in the URL.

n_asterisk: The count of ‘*’ characters in the URL.

n_hastag: The count of ‘#’ characters in the URL.

n_percent: The count of ‘%’ characters in the URL.

dots_per_length: The count of '.' characters divided by (url_length + 1).

hyphens_per_length: The count of '-' characters divided by (url_length + 1).

is_long_url: 1 if the URL is abnormally long (url_length > 75), otherwise 0.

has_many_dots: 1 if the URL has an abnormal number of '.' characters (n_dots > 4), otherwise 0.

has_ssl: Does it have an SSL certificate.

is_cloudflare_protected: Is the URL Cloudflare protected.

special_char_density: Ratio of special characters (*&@#) within URL.

suspicious_tld_risk: Risk of URL containing suspicious extensions, domains, and patterns.

has_redirects: Does URL have redirects.

risk_score: Ultimate risk score of URL based on characteristics.

url_complexity: Ultimate URL complexity based on characteristics.

phishing: The Labels of the URL. 1 is phishing and 0 is legitimate.
'''

In [None]:
# necessary imports
import pandas as pd
import numpy as np

In [8]:
# Load the phishing dataset from disk and report its dimensions

data_df = pd.read_csv('web-page-phishing.csv')

# shape is (rows, columns); unpack once instead of indexing twice
row_count, column_count = data_df.shape
print("Number of rows within data: ", row_count)
print("Number of columns within data: ", column_count)

# Preview the first few records
display(data_df.head())



Number of rows within data:  100077
Number of columns within data:  20


Unnamed: 0,url_length,n_dots,n_hypens,n_underline,n_slash,n_questionmark,n_equal,n_at,n_and,n_exclamation,n_space,n_tilde,n_comma,n_plus,n_asterisk,n_hastag,n_dollar,n_percent,n_redirection,phishing
0,37,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,77,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,126,4,1,2,0,1,3,0,2,0,0,0,0,0,0,0,0,0,1,1
3,18,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,55,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [None]:
# Data Cleaning and Preprocessing

In [None]:
# Create a shuffled copy of the data, leaving data_df untouched.
# DataFrame.sample returns a NEW frame rather than shuffling in place, so the
# result has to be assigned; previously it was discarded and data_copy stayed
# in its original order.
# random_state pins the row order so a fresh "Restart & Run All" is reproducible.
data_copy = data_df.sample(frac=1, random_state=42).reset_index(drop=True)
data_copy.head()

In [None]:
# Derive length-normalised ratios and binary indicator flags from the raw counts.
# Every derived value is stored in a new column of data_copy.

# Shared denominator: URL length padded by one
url_length_padded = data_copy['url_length'] + 1

# Character counts expressed relative to the (padded) URL length
data_copy['dots_per_length'] = data_copy['n_dots'].div(url_length_padded)
data_copy['hyphens_per_length'] = data_copy['n_hypens'].div(url_length_padded)

# Threshold-based 0/1 flags
data_copy['is_long_url'] = data_copy['url_length'].gt(75).astype(int)
data_copy['has_many_dots'] = data_copy['n_dots'].gt(4).astype(int)
data_copy['has_redirects'] = data_copy['n_redirection'].gt(0).astype(int)


In [None]:
# The data does not indicate whether a url has ssl certificates or cloudflare protection
# Having any one of these gives urls credibility
# For further/future training, these values are synthesised per the conditions:
#   has_ssl is 1 for 70% of non-phishing rows and 30% of phishing rows
#   is_cloudflare_protected is 1 for 40% of non-phishing rows and 5% of phishing rows
#
# NOTE(review): has_ssl was previously hard-coded to 1 for every legitimate row,
# which contradicted the 70% condition above and made has_ssl == 0 a perfect
# phishing indicator. It is now sampled at the stated rate.

# Dedicated, seeded generator so the synthetic columns are identical on every run
rng = np.random.default_rng(42)

# Row selectors for each class and how many rows each one covers
phishing_mask = data_copy['phishing'] == 1
legitimate_mask = data_copy['phishing'] == 0
n_phishing = int(phishing_mask.sum())
n_legitimate = int(legitimate_mask.sum())

# Add has_ssl column (70% - non phishing data; 30% - phishing data)
# p is ordered like the choices [0, 1], i.e. [P(flag=0), P(flag=1)]
data_copy['has_ssl'] = 0
data_copy.loc[legitimate_mask, 'has_ssl'] = rng.choice(
    [0, 1],
    size=n_legitimate,
    p=[0.3, 0.7]
)
data_copy.loc[phishing_mask, 'has_ssl'] = rng.choice(
    [0, 1],
    size=n_phishing,
    p=[0.7, 0.3]
)

# Add cloudflare protection (40% - non phishing data; 5% - phishing data)
data_copy['is_cloudflare_protected'] = 0
data_copy.loc[legitimate_mask, 'is_cloudflare_protected'] = rng.choice(
    [0, 1],
    size=n_legitimate,
    p=[0.6, 0.4]
)
# Probabilities must sum to 1: the former [0.40, 0.05] made the sampler raise
# ValueError. A 5% positive rate means a 95% negative rate.
data_copy.loc[phishing_mask, 'is_cloudflare_protected'] = rng.choice(
    [0, 1],
    size=n_phishing,
    p=[0.95, 0.05]
)