In [32]:
# standard libraries
from urllib.parse import urlparse
import ipaddress
import re

# third-party libraries
import pandas as pd
import tldextract

In [33]:
# load training data
# The path is relative, so it is resolved against the kernel's working directory.
data = pd.read_csv(filepath_or_buffer='data/train.csv')

In [42]:
# checking data
# Per-column completeness report: share of non-null cells, their absolute
# count, and the number of distinct values, most complete columns first.
display(
    pd.concat(
        [data.notnull().sum() / len(data), data.notnull().sum(), data.nunique()],
        axis=1,
        keys=['Not Null %', 'Not Null', 'Unique Count'],
    ).sort_values(by='Not Null %', ascending=False)
)

Unnamed: 0,Not Null %,Not Null,Unique Count
id,1.0,140401,140401
label,1.0,140401,2
IsDomainIP,0.953562,133881,2
NoOfSubDomain,0.951403,133578,8
DomainLength,0.948554,133178,83
IsHTTPS,0.891176,125122,2
URLLength,0.866205,121616,249
Domain,0.845015,118641,117908
IsResponsive,0.697011,97861,2
NoOfEmptyRef,0.695971,97715,255


In [35]:
# remove duplicate URL
# Rows that share a URL are merged: within each URL group every column takes
# the first non-null value found in that group (columns that are entirely
# null in the group stay null). The rows of a group then become identical and
# drop_duplicates keeps only the first of them.
#
# A single groupby replaces the previous per-URL / per-column loop, which
# re-scanned the whole frame for every (URL, column) pair.
isDuplicateURL = data['URL'].notna() & data['URL'].duplicated(keep=False)
if isDuplicateURL.any():
    # transform('first') broadcasts each group's first non-null value back to
    # every row of the group; the grouping key 'URL' itself is not returned.
    firstValues = data.loc[isDuplicateURL].groupby('URL', sort=False).transform('first')
    for column in firstValues.columns:
        # positional assignment: transform keeps the row order of its input
        data.loc[isDuplicateURL, column] = firstValues[column].to_numpy()
data = data.drop_duplicates()

In [36]:
# [URL] -> [Domain]
URL_Domain = data[(data['URL'].notna()) & (data['Domain'].isna())]['URL']

# script
def extractDomain(URL):
    """Return the network-location part of `URL`, or None when there is none.

    None is returned (so the Domain stays missing) in two cases:
    * `urlparse` rejects the URL with ValueError, e.g. unmatched square
      brackets in the host part;
    * the parsed netloc is empty, which is what `urlparse` yields for a URL
      written without a ``scheme://`` prefix. Storing '' would make the row
      look filled and feed an empty string to the Domain-based features.
    """
    try:
        netloc = urlparse(URL).netloc
    except ValueError:
        return None
    return netloc or None

data.loc[(data['URL'].notna()) & (data['Domain'].isna()), 'Domain'] = [
    extractDomain(URL) for URL in URL_Domain
]

# preview
data[data['URL'].isin(URL_Domain)][['URL', 'Domain']]

Unnamed: 0,URL,Domain
1,http://uqr.to/1il1z,uqr.to
5,https://www.free-marine.com,www.free-marine.com
7,https://www.epner.com,www.epner.com
10,https://www.hellscent.com,www.hellscent.com
13,https://www.uncrazed.com,www.uncrazed.com
...,...,...
140387,https://www.mdx.edu.mt,www.mdx.edu.mt
140389,https://www.navigazionelaghi.it,www.navigazionelaghi.it
140395,https://www.inspiredherway.com,www.inspiredherway.com
140397,https://www.nnry.com,www.nnry.com


In [37]:
# [URL] -> [IsHTTPS]
# URLs of the rows that have a URL but no IsHTTPS value yet
URL_IsHTTPS = data.loc[data['URL'].notna() & data['IsHTTPS'].isna(), 'URL']

# script
def IsHTTPS(URL):
    """Return 1 if the scheme of `URL` is https (case-insensitive), else 0.

    Any exception raised while parsing or inspecting the URL is swallowed
    and reported as 0.
    """
    try:
        scheme = urlparse(URL).scheme
        return 1 if scheme.lower() == 'https' else 0
    except Exception:
        return 0
# write one computed flag into each row that has a URL but no IsHTTPS value
data.loc[data['URL'].notna() & data['IsHTTPS'].isna(), 'IsHTTPS'] = list(map(IsHTTPS, URL_IsHTTPS))

# preview
data.loc[data['URL'].isin(URL_IsHTTPS), ['URL', 'IsHTTPS']]

Unnamed: 0,URL,IsHTTPS
0,https://www.northcm.ac.th,1.0
1,http://uqr.to/1il1z,0.0
2,https://www.woolworthsrewards.com.au,1.0
5,https://www.free-marine.com,1.0
8,https://www.az511.com,1.0
...,...,...
140391,https://www.sligoheritage.com,1.0
140400,https://www.greenmountainenergy.com,1.0
140401,https://www.leadcastingcall.com,1.0
140402,https://www.fedarb.com,1.0


In [38]:
# [URL] -> [URLLength]
# URLs of the rows that have a URL but no URLLength value yet
URL_URLLength = data.loc[data['URL'].notna() & data['URLLength'].isna(), 'URL']

# script
# URLLength is the character count of the URL string
data.loc[data['URL'].notna() & data['URLLength'].isna(), 'URLLength'] = list(map(len, URL_URLLength))

# preview
data.loc[data['URL'].isin(URL_URLLength), ['URL', 'URLLength']]

Unnamed: 0,URL,URLLength
1,http://uqr.to/1il1z,19.0
7,https://www.epner.com,21.0
8,https://www.az511.com,21.0
10,https://www.hellscent.com,25.0
13,https://www.uncrazed.com,24.0
...,...,...
140381,https://kg56rf-bur5g7.firebaseapp.com/,38.0
140383,https://www.pelagia.org,23.0
140384,https://www.ilostmydog.com,26.0
140385,https://www.expostandzone.com,29.0


In [39]:
# [Domain] -> [IsDomainIP]
# Domains of the rows that have a Domain but no IsDomainIP value yet
Domain_IsDomainIP = data.loc[data['Domain'].notna() & data['IsDomainIP'].isna(), 'Domain']

# script
def IsIP(domain):
    """Return 1 when `domain` is an IPv4 or IPv6 address literal, else 0.

    `domain` may be a bare host or a URL netloc, so an optional
    ``userinfo@`` prefix, a ``:port`` suffix and the square brackets around
    an IPv6 literal are stripped before the check.

    Compared with the former regular expressions this
    * no longer reports hex-only host names (e.g. 'deadbeef') as IPv6,
    * rejects dotted quads with an octet above 255,
    * recognises '[2001:db8::1]', '[::1]:8080' and '10.0.0.1:8080'.
    Leading zeros in IPv4 octets are still accepted, as before.
    """
    host = domain.strip().rpartition('@')[2]
    if host.startswith('['):
        # bracketed IPv6 literal, optionally followed by ':port'
        host = host[1:].partition(']')[0]
    elif host.count(':') == 1:
        # exactly one colon can only be 'host:port'; IPv6 has at least two
        host = host.partition(':')[0]

    # IPv4: four decimal octets, each 1-3 ASCII digits and at most 255
    octets = host.split('.')
    if len(octets) == 4 and all(
        octet.isascii() and octet.isdigit() and len(octet) <= 3 and int(octet) <= 255
        for octet in octets
    ):
        return 1

    # IPv6: delegate validation to the standard library
    try:
        ipaddress.IPv6Address(host)
    except ValueError:
        return 0
    return 1
# write one computed flag into each row that has a Domain but no IsDomainIP value
data.loc[data['Domain'].notna() & data['IsDomainIP'].isna(), 'IsDomainIP'] = list(map(IsIP, Domain_IsDomainIP))

# preview
data.loc[data['Domain'].isin(Domain_IsDomainIP), ['Domain', 'IsDomainIP']]

Unnamed: 0,Domain,IsDomainIP
1,uqr.to,0.0
5,www.free-marine.com,0.0
9,www.screenbeam.com,0.0
10,www.hellscent.com,0.0
14,www.scientistsforeu.uk,0.0
...,...,...
140389,www.navigazionelaghi.it,0.0
140395,www.inspiredherway.com,0.0
140401,www.leadcastingcall.com,0.0
140402,www.fedarb.com,0.0


In [40]:
# [Domain] -> [NoOfSubDomain]
# Domains of the rows that have a Domain but no NoOfSubDomain value yet
Domain_NoOfSubDomain = data.loc[data['Domain'].notna() & data['NoOfSubDomain'].isna(), 'Domain']

# script
def countSubDomain(Domain):
    """Return how many dot-separated labels tldextract reports as subdomain.

    The count is 0 when the extracted subdomain part is empty.
    """
    subdomain = tldextract.extract(Domain).subdomain
    if not subdomain:
        return 0
    # n labels are separated by n - 1 dots
    return subdomain.count('.') + 1
# write one computed count into each row that has a Domain but no NoOfSubDomain value
data.loc[data['Domain'].notna() & data['NoOfSubDomain'].isna(), 'NoOfSubDomain'] = list(
    map(countSubDomain, Domain_NoOfSubDomain)
)

# preview
data.loc[data['Domain'].isin(Domain_NoOfSubDomain), ['Domain', 'NoOfSubDomain']]

Unnamed: 0,Domain,NoOfSubDomain
7,www.epner.com,1.0
8,www.az511.com,1.0
9,www.screenbeam.com,1.0
10,www.hellscent.com,1.0
11,www.marcosimoncellifondazione.it,1.0
...,...,...
140393,www.fites.net,1.0
140396,www.genevalakemuseum.org,1.0
140397,www.nnry.com,1.0
140400,www.greenmountainenergy.com,1.0


In [41]:
# [Domain] -> [DomainLength]
# Domains of the rows that have a Domain but no DomainLength value yet
Domain_DomainLength = data.loc[data['Domain'].notna() & data['DomainLength'].isna(), 'Domain']

# script
# DomainLength is the character count of the Domain string
data.loc[data['Domain'].notna() & data['DomainLength'].isna(), 'DomainLength'] = list(
    map(len, Domain_DomainLength)
)

# preview
data.loc[data['Domain'].isin(Domain_DomainLength), ['Domain', 'DomainLength']]

Unnamed: 0,Domain,DomainLength
1,uqr.to,6.0
7,www.epner.com,13.0
15,www.goldreserveinc.com,22.0
17,www.topografix.com,18.0
19,www.saveware.nl,15.0
...,...,...
140387,www.mdx.edu.mt,14.0
140394,www.bmvc2020-conference.com,27.0
140397,www.nnry.com,12.0
140399,www.slavevoyages.org,20.0
