# Normalization
In this notebook we normalize the data from the previous step (clean_data.csv) and write the result to a new CSV (norm_data.csv).

## Imports

In [177]:
from IPython import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd

## Reading the CSV
We use pandas to read the csv with the correct options.

In [178]:
# Load the cleaned dataset produced by the previous step.
df = pd.read_csv(
    '../data/clean_data.csv',
    header=0,     # the first line of the file holds the column names
    decimal='.',  # floats are written with a dot as decimal separator
)

df.head(n=5)

Unnamed: 0,url,label,url_length,starts_with_ip,url_entropy,has_punycode,digit_letter_ratio,dot_count,at_count,dash_count,tld_count,domain_has_digits,subdomain_count,nan_char_entropy,has_internal_links,domain_age_days
0,kitchenlaughter.com,legitimate,19,False,3.826875,False,0.0,1,0,0,0,False,0,0.223575,False,2465.0
1,foxberry.store,legitimate,14,False,3.182006,False,0.0,1,0,0,0,False,0,0.271954,False,389.0
2,www347.americanexpress.com,legitimate,26,False,3.873141,False,0.142857,2,0,0,0,False,1,0.284649,False,10689.0
3,cdn82020935.ahacdn.me,legitimate,21,False,3.725651,False,0.727273,2,0,0,0,False,1,0.323078,False,2818.0
4,http://bb445983.com,phishing,19,False,3.826875,False,0.666667,1,0,0,0,True,0,0.789037,False,4281.0


## Copy the df to a work dataframe
We'll be using X for the feature data from the df, and y for the url and label columns, which don't need to be normalised.

In [179]:
# Work on a copy so the loaded frame `df` stays untouched.
X = df.copy()

# Move the two non-feature columns out of X (pop removes them in place) ...
url, label = X.pop('url'), X.pop('label')

# ... and keep them together in their own frame, sharing X's index.
y = pd.DataFrame({'url': url, 'label': label})

In [180]:
# Show the feature frame and the url/label frame; both are rendered because
# the notebook is configured to display every top-level expression.
X
y

Unnamed: 0,url_length,starts_with_ip,url_entropy,has_punycode,digit_letter_ratio,dot_count,at_count,dash_count,tld_count,domain_has_digits,subdomain_count,nan_char_entropy,has_internal_links,domain_age_days
0,19,False,3.826875,False,0.000000,1,0,0,0,False,0,0.223575,False,2465.0
1,14,False,3.182006,False,0.000000,1,0,0,0,False,0,0.271954,False,389.0
2,26,False,3.873141,False,0.142857,2,0,0,0,False,1,0.284649,False,10689.0
3,21,False,3.725651,False,0.727273,2,0,0,0,False,1,0.323078,False,2818.0
4,19,False,3.826875,False,0.666667,1,0,0,0,True,0,0.789037,False,4281.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2499987,17,False,3.852169,False,0.076923,2,0,1,0,False,1,0.363231,False,2011.0
2499988,61,False,4.469122,False,0.181818,2,0,3,0,False,1,0.472615,False,4281.0
2499989,503,False,4.479480,False,1.334951,3,0,6,0,True,1,0.200362,False,3042.0
2499990,25,False,3.673270,False,0.000000,3,0,0,0,False,2,0.367067,False,10792.0


Unnamed: 0,url,label
0,kitchenlaughter.com,legitimate
1,foxberry.store,legitimate
2,www347.americanexpress.com,legitimate
3,cdn82020935.ahacdn.me,legitimate
4,http://bb445983.com,phishing
...,...,...
2499987,euc1-turn.fpjs.io,legitimate
2499988,https://ebay-kleinanzeigen.check-pays-online.o...,phishing
2499989,http://hydra-headed-wear.000webhostapp.com/53R...,phishing
2499990,ucm.workforce.equifax.com,legitimate


## Normalising the data
To train the model it is better if most of the values are integers.
Below we check which columns are already of type int.

In [181]:
# Flag the columns that already hold integers.
# `is_integer_dtype` accepts every integer width, whereas comparing against the
# builtin `int` only matches the platform's default integer type and can
# report False for columns that are in fact integers.
discrete_features = X.dtypes.map(pd.api.types.is_integer_dtype)

discrete_features

url_length             True
starts_with_ip        False
url_entropy           False
has_punycode          False
digit_letter_ratio    False
dot_count              True
at_count               True
dash_count             True
tld_count              True
domain_has_digits     False
subdomain_count        True
nan_char_entropy      False
has_internal_links    False
domain_age_days       False
dtype: bool

We can see that most of the columns are not of type int yet. Below we will convert them.

### Convert the boolean types to an int

Most models require a numerical input as they cannot directly handle boolean values. Not all models require this input to be numerical but we will do it to prevent future conflicts.

In [182]:
# Names of all boolean-typed feature columns, in frame order.
bools = X.select_dtypes('bool').columns.tolist()

bools

['starts_with_ip', 'has_punycode', 'domain_has_digits', 'has_internal_links']

The column names listed above have a boolean value. It is better to have these converted to an int (0, 1). We can see in the dataframe below that all the values show False and True

In [183]:
# Preview the boolean columns before they are converted.
X.loc[:, bools]

Unnamed: 0,starts_with_ip,has_punycode,domain_has_digits,has_internal_links
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,True,False
...,...,...,...,...
2499987,False,False,False,False
2499988,False,False,False,False
2499989,False,False,True,False
2499990,False,False,False,False


In [184]:
# Cast every boolean column to int (False -> 0, True -> 1) in a single call;
# columns of any other dtype are left as they are.
X = X.astype({name: int for name in X.select_dtypes("bool").columns})

In [185]:
# Re-collect the boolean columns after the conversion. The result must be
# stored in `bools_new` (not appended to `bools`), otherwise this check would
# show an empty list no matter whether the conversion worked.
bools_new = X.select_dtypes('bool').columns.tolist()

bools_new

[]

The list now turns up empty so the transformation worked. When we look at the values in X for the columns given in the first list we will now see 0's and 1's

In [186]:
# Preview the formerly boolean columns after the conversion.
X.loc[:, bools]

Unnamed: 0,starts_with_ip,has_punycode,domain_has_digits,has_internal_links
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,1,0
...,...,...,...,...
2499987,0,0,0,0
2499988,0,0,0,0
2499989,0,0,1,0
2499990,0,0,0,0


In [187]:
# Re-check which columns hold integers now that the booleans are converted.
# `is_integer_dtype` accepts every integer width, whereas comparing against the
# builtin `int` only matches the platform's default integer type and can
# report False for columns that are in fact integers.
discrete_features = X.dtypes.map(pd.api.types.is_integer_dtype)

discrete_features

url_length             True
starts_with_ip         True
url_entropy           False
has_punycode           True
digit_letter_ratio    False
dot_count              True
at_count               True
dash_count             True
tld_count              True
domain_has_digits      True
subdomain_count        True
nan_char_entropy      False
has_internal_links     True
domain_age_days       False
dtype: bool

We can see above that most of the values are now of type int. We can try to convert the string types to an int now

### Convert strings to int
Converting strings to ints can be done by taking all the unique string values and assigning each of them an int value. This needs to be done because a model cannot understand string values and will try to convert the values to int values itself. It is better to do this ourselves for any possible strings. Below we will perform this action.

In [188]:
# Names of all object-typed (string) feature columns, in frame order.
objects = X.select_dtypes("object").columns.tolist()

objects

[]

We can see that the dataset has no object types to be converted, but we will perform the action to be sure. 

In [189]:
# Replace each object-typed column by integer codes, one code per distinct value.
for name in X.select_dtypes("object").columns:
    codes, _uniques = pd.factorize(X[name])
    X[name] = codes

### Convert big numbers to a normalised standard
We will convert numerical values that have a high value to a range between 0 and 1.

In [190]:
# Per-column minimum and maximum, each computed once and reused for the range.
col_min, col_max = X.min(), X.max()

# One row per column, widest range first.
range_df = pd.DataFrame(
    {"Min": col_min, "Max": col_max, "Range": col_max - col_min}
).sort_values("Range", ascending=False)

range_df

Unnamed: 0,Min,Max,Range
domain_age_days,0.0,45541.0,45541.0
url_length,4.0,25523.0,25519.0
dash_count,0.0,322.0,322.0
dot_count,0.0,211.0,211.0
tld_count,0.0,65.0,65.0
subdomain_count,0.0,43.0,43.0
at_count,0.0,32.0,32.0
digit_letter_ratio,0.0,20.84,20.84
url_entropy,0.100836,6.048781,5.947945
nan_char_entropy,0.016863,1.901504,1.884641


Above we can see the min, max and range of all the numerical columns.
Below we will rescale every column with a range higher than 1 to a range between 0 and 1.

In [191]:
# Min-max scale every column whose range exceeds 1: (value - min) / range.
# The statistics come from `range_df`, which was computed before scaling, so
# re-running this cell without recomputing `range_df` would rescale values
# that are already scaled.
wide_columns = range_df.index[range_df['Range'] > 1]
for column in wide_columns:
    low = range_df.loc[column, 'Min']
    span = range_df.loc[column, 'Range']
    X[column] = (X[column] - low) / span

X

Unnamed: 0,url_length,starts_with_ip,url_entropy,has_punycode,digit_letter_ratio,dot_count,at_count,dash_count,tld_count,domain_has_digits,subdomain_count,nan_char_entropy,has_internal_links,domain_age_days
0,0.000588,0,0.626441,0,0.000000,0.004739,0.0,0.000000,0.0,0,0.000000,0.109682,0,0.054127
1,0.000392,0,0.518023,0,0.000000,0.004739,0.0,0.000000,0.0,0,0.000000,0.135353,0,0.008542
2,0.000862,0,0.634220,0,0.006855,0.009479,0.0,0.000000,0.0,0,0.023256,0.142089,0,0.234712
3,0.000666,0,0.609423,0,0.034898,0.009479,0.0,0.000000,0.0,0,0.023256,0.162479,0,0.061878
4,0.000588,0,0.626441,0,0.031990,0.004739,0.0,0.000000,0.0,1,0.000000,0.409720,0,0.094003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2499987,0.000509,0,0.630694,0,0.003691,0.009479,0.0,0.003106,0.0,0,0.023256,0.183785,0,0.044158
2499988,0.002234,0,0.734419,0,0.008724,0.009479,0.0,0.009317,0.0,0,0.023256,0.241824,0,0.094003
2499989,0.019554,0,0.736161,0,0.064057,0.014218,0.0,0.018634,0.0,1,0.023256,0.097365,0,0.066797
2499990,0.000823,0,0.600616,0,0.000000,0.014218,0.0,0.000000,0.0,0,0.046512,0.185820,0,0.236973


## Remove the URL from the y dataset
Because the URL doesn't have any significance for the models we will remove it in the normalized dataset. 

In [192]:
# `pop` removes the url column from y in place and returns it, which is why
# the removed Series is what gets displayed as this cell's output.
y.pop("url")

0                                        kitchenlaughter.com
1                                             foxberry.store
2                                 www347.americanexpress.com
3                                      cdn82020935.ahacdn.me
4                                        http://bb445983.com
                                 ...                        
2499987                                    euc1-turn.fpjs.io
2499988    https://ebay-kleinanzeigen.check-pays-online.o...
2499989    http://hydra-headed-wear.000webhostapp.com/53R...
2499990                            ucm.workforce.equifax.com
2499991             http://pokkerredemcode.blogspot.com/?m=1
Name: url, Length: 2499992, dtype: object

## Join the normalised data and the label dataframe
After the normalisation we will join the 2 dataframes together again to save it to a new csv

In [193]:
# Place the label column first, followed by the normalised features;
# the two frames are matched on their row index.
norm_data = pd.concat(objs=[y, X], axis="columns")

norm_data.head(n=5)

Unnamed: 0,label,url_length,starts_with_ip,url_entropy,has_punycode,digit_letter_ratio,dot_count,at_count,dash_count,tld_count,domain_has_digits,subdomain_count,nan_char_entropy,has_internal_links,domain_age_days
0,legitimate,0.000588,0,0.626441,0,0.0,0.004739,0.0,0.0,0.0,0,0.0,0.109682,0,0.054127
1,legitimate,0.000392,0,0.518023,0,0.0,0.004739,0.0,0.0,0.0,0,0.0,0.135353,0,0.008542
2,legitimate,0.000862,0,0.63422,0,0.006855,0.009479,0.0,0.0,0.0,0,0.023256,0.142089,0,0.234712
3,legitimate,0.000666,0,0.609423,0,0.034898,0.009479,0.0,0.0,0.0,0,0.023256,0.162479,0,0.061878
4,phishing,0.000588,0,0.626441,0,0.03199,0.004739,0.0,0.0,0.0,1,0.0,0.40972,0,0.094003


## Save the new normalised data to a CSV
We use the option 'index=False' so the index column of the dataset isn't saved to the CSV

In [194]:
# Write the result for the next step; index=False keeps the row index out of the file.
norm_data.to_csv(path_or_buf='../data/norm_data.csv', index=False)