# Normalization
In this file we normalize the data from the previous step (clean_data.csv) and write it to a new csv (norm_data.csv)

## Imports

In [1]:
from IPython import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd

## Reading the CSV
We use pandas to read the CSV, taking the column names from the first row (`header=0`) and using `.` as the decimal separator.

In [2]:
# Load the cleaned dataset produced by the previous step: the first row holds
# the column names and '.' is the decimal separator.
df = pd.read_csv(
    '../data/clean_data.csv',
    decimal='.',
    header=0,
)

# Preview the first rows to confirm the file was parsed as expected.
df.head()

Unnamed: 0,url,label,url_length,starts_with_ip,url_entropy,has_punycode,digit_letter_ratio,dot_count,at_count,dash_count,tld_count,domain_has_digits,subdomain_count,nan_char_entropy,has_internal_links,domain_age_days
0,https://keepo.io/upate___,phishing,25,False,3.612879,False,0.0,1,0,0,0,False,0,1.105643,False,1374.0
1,rr1---sn-g5cp3c5ob0o-jb3e.googlevideo.com,legitimate,41,False,4.104282,False,0.214286,2,0,5,0,False,1,0.212564,False,7777.0
2,http://webapps.accountverificationvbv.servenab...,phishing,61,False,4.334701,False,0.038462,3,0,0,0,True,2,0.524678,False,4281.0
3,http://secre.chasebnak.craicean.in,phishing,34,False,3.74894,False,0.0,3,0,0,0,False,2,0.699114,False,869.0
4,https://b8roau.webwave.dev,phishing,26,False,3.979098,False,0.05,2,0,0,0,False,1,0.750085,False,2018.0


## Copy the df to a work dataframe
We'll use `X` for the feature columns of `df` and `y` for the `url` and `label` columns, which don't need to be normalised.

In [3]:
# Work on a copy so the frame read from disk (df) stays untouched.
X = df.copy()

# Pull the two non-feature columns out of X; pop() removes them from X.
url, label = X.pop('url'), X.pop('label')

# y keeps the url and its label, on the same row index as X.
y = pd.DataFrame({'url': url, 'label': label})

In [4]:
# Both bare expressions are rendered because ast_node_interactivity is "all".
X
y

Unnamed: 0,url_length,starts_with_ip,url_entropy,has_punycode,digit_letter_ratio,dot_count,at_count,dash_count,tld_count,domain_has_digits,subdomain_count,nan_char_entropy,has_internal_links,domain_age_days
0,25,False,3.612879,False,0.000000,1,0,0,0,False,0,1.105643,False,1374.0
1,41,False,4.104282,False,0.214286,2,0,5,0,False,1,0.212564,False,7777.0
2,61,False,4.334701,False,0.038462,3,0,0,0,True,2,0.524678,False,4281.0
3,34,False,3.748940,False,0.000000,3,0,0,0,False,2,0.699114,False,869.0
4,26,False,3.979098,False,0.050000,2,0,0,0,False,1,0.750085,False,2018.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2499987,57,False,4.529720,False,0.045455,2,0,1,0,True,0,0.643463,False,4281.0
2499988,26,False,4.132944,False,0.176471,2,0,1,0,False,1,0.750085,False,2070.0
2499989,29,False,4.116265,False,0.000000,1,0,0,0,False,0,0.673621,False,1696.0
2499990,13,False,3.180833,False,0.000000,1,0,0,0,False,0,0.284649,False,353.0


Unnamed: 0,url,label
0,https://keepo.io/upate___,phishing
1,rr1---sn-g5cp3c5ob0o-jb3e.googlevideo.com,legitimate
2,http://webapps.accountverificationvbv.servenab...,phishing
3,http://secre.chasebnak.craicean.in,phishing
4,https://b8roau.webwave.dev,phishing
...,...,...
2499987,http://fakesemo20.com/GOUV/rbc/cgi-bin/rbacces...,phishing
2499988,http://cksjw-a529d.web.app,phishing
2499989,https://rehabmedia.org/Linked,phishing
2499990,lianghecun.cn,legitimate


## Normalising the data
To train the model it is better if most of the values are integers.
Below we check which columns are already of type int.

In [5]:
# Flag the columns that hold integers. is_integer_dtype is used instead of
# comparing each dtype with the builtin int, because that comparison resolves
# int to the platform's default integer width and so may fail to match the
# int64 columns produced by read_csv on some platforms.
discrete_features = X.dtypes.map(pd.api.types.is_integer_dtype)

discrete_features

url_length             True
starts_with_ip        False
url_entropy           False
has_punycode          False
digit_letter_ratio    False
dot_count              True
at_count               True
dash_count             True
tld_count              True
domain_has_digits     False
subdomain_count        True
nan_char_entropy      False
has_internal_links    False
domain_age_days       False
dtype: bool

We can see that most of the columns are not of type int yet. Below we will convert these

### Convert the boolean types to an int

In [6]:
# Collect the names of all boolean-typed columns in X.
bools = X.select_dtypes('bool').columns.tolist()

bools

['starts_with_ip', 'has_punycode', 'domain_has_digits', 'has_internal_links']

The columns listed above hold boolean values. It is better to convert these to ints (0, 1). The dataframe below shows that all the values are currently False or True.

In [7]:
# Show the boolean columns before conversion.
X.loc[:, ['starts_with_ip', 'has_punycode', 'domain_has_digits', 'has_internal_links']]

Unnamed: 0,starts_with_ip,has_punycode,domain_has_digits,has_internal_links
0,False,False,False,False
1,False,False,False,False
2,False,False,True,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
2499987,False,False,True,False
2499988,False,False,False,False
2499989,False,False,False,False
2499990,False,False,False,False


In [8]:
# Cast every boolean column to int in a single call, so False/True become 0/1.
X = X.astype({col: int for col in X.select_dtypes("bool")})

In [9]:
# Re-collect the boolean-typed columns; an empty list means none are left.
bools = list(X.select_dtypes('bool').columns)

bools

[]

The list is now empty, so the transformation worked. When we look at the values in X for the columns from the first list, we now see 0s and 1s.

In [10]:
# Show the same four columns again, now after the bool-to-int cast.
X.loc[
    :,
    ['starts_with_ip', 'has_punycode', 'domain_has_digits', 'has_internal_links'],
]

Unnamed: 0,starts_with_ip,has_punycode,domain_has_digits,has_internal_links
0,0,0,0,0
1,0,0,0,0
2,0,0,1,0
3,0,0,0,0
4,0,0,0,0
...,...,...,...,...
2499987,0,0,1,0
2499988,0,0,0,0
2499989,0,0,0,0
2499990,0,0,0,0


In [11]:
# Re-check which columns hold integers. is_integer_dtype is used instead of
# comparing each dtype with the builtin int, because that comparison resolves
# int to the platform's default integer width and so may fail to match
# integer columns stored with a different width.
discrete_features = X.dtypes.map(pd.api.types.is_integer_dtype)

discrete_features

url_length             True
starts_with_ip         True
url_entropy           False
has_punycode           True
digit_letter_ratio    False
dot_count              True
at_count               True
dash_count             True
tld_count              True
domain_has_digits      True
subdomain_count        True
nan_char_entropy      False
has_internal_links     True
domain_age_days       False
dtype: bool

We can see above that most of the columns are now of type int. Next we try to convert the string columns to ints.

### Convert strings to int
Converting strings to ints can be done by taking all the unique string values in a column and assigning each one an int value. Below we perform this action.

In [12]:
# Collect the names of all object-typed columns in X.
objects = list(X.select_dtypes("object").columns)

objects

[]

We can see that the dataset has no object types to be converted, but we will perform the action to be sure. 

In [13]:
# Replace each object column with integer codes, one code per distinct value.
# When X has no object columns the loop body never runs.
for col in X.select_dtypes("object").columns:
    X[col], _ = pd.factorize(X[col])

### Convert big numbers to a normalised standard
We will rescale the domain_age_days column to the range 0 to 1 using min-max scaling: $x' = \frac{x - \min(x)}{\max(x) - \min(x)}$.

In [14]:
# Min-max scale domain_age_days: (value - min) / (max - min).
age_days = X['domain_age_days']
min_age = age_days.min()
age_range = age_days.max() - min_age

X['domain_age_days'] = age_days.sub(min_age).div(age_range)

X

Unnamed: 0,url_length,starts_with_ip,url_entropy,has_punycode,digit_letter_ratio,dot_count,at_count,dash_count,tld_count,domain_has_digits,subdomain_count,nan_char_entropy,has_internal_links,domain_age_days
0,25,0,3.612879,0,0.000000,1,0,0,0,0,0,1.105643,0,0.030171
1,41,0,4.104282,0,0.214286,2,0,5,0,0,1,0.212564,0,0.170769
2,61,0,4.334701,0,0.038462,3,0,0,0,1,2,0.524678,0,0.094003
3,34,0,3.748940,0,0.000000,3,0,0,0,0,2,0.699114,0,0.019082
4,26,0,3.979098,0,0.050000,2,0,0,0,0,1,0.750085,0,0.044312
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2499987,57,0,4.529720,0,0.045455,2,0,1,0,1,0,0.643463,0,0.094003
2499988,26,0,4.132944,0,0.176471,2,0,1,0,0,1,0.750085,0,0.045454
2499989,29,0,4.116265,0,0.000000,1,0,0,0,0,0,0.673621,0,0.037241
2499990,13,0,3.180833,0,0.000000,1,0,0,0,0,0,0.284649,0,0.007751


## Join the normalised data and the url, label dataframe
After the normalisation we join the two dataframes together again so we can save them to a new CSV.

In [15]:
# Put url and label first, followed by the normalised feature columns.
norm_data = pd.concat((y, X), axis='columns')

norm_data.head()

Unnamed: 0,url,label,url_length,starts_with_ip,url_entropy,has_punycode,digit_letter_ratio,dot_count,at_count,dash_count,tld_count,domain_has_digits,subdomain_count,nan_char_entropy,has_internal_links,domain_age_days
0,https://keepo.io/upate___,phishing,25,0,3.612879,0,0.0,1,0,0,0,0,0,1.105643,0,0.030171
1,rr1---sn-g5cp3c5ob0o-jb3e.googlevideo.com,legitimate,41,0,4.104282,0,0.214286,2,0,5,0,0,1,0.212564,0,0.170769
2,http://webapps.accountverificationvbv.servenab...,phishing,61,0,4.334701,0,0.038462,3,0,0,0,1,2,0.524678,0,0.094003
3,http://secre.chasebnak.craicean.in,phishing,34,0,3.74894,0,0.0,3,0,0,0,0,2,0.699114,0,0.019082
4,https://b8roau.webwave.dev,phishing,26,0,3.979098,0,0.05,2,0,0,0,0,1,0.750085,0,0.044312


## Save the new normalised data to a CSV
We use the option 'index=False' so the index column of the dataset isn't saved to the CSV

In [17]:
# Write the result to disk; index=False leaves the row index out of the file.
norm_data.to_csv(
    path_or_buf='../data/norm_data.csv',
    index=False,
)