In [150]:
# standard libraries packages
import ipaddress
from urllib.parse import urlparse

# third party packages
import pandas as pd
import tldextract

In [151]:
# read the training set from disk into a DataFrame
TRAIN_CSV_PATH = "data/train.csv"
data = pd.read_csv(TRAIN_CSV_PATH)

In [152]:
# per-column overview: dtype, share and count of non-null values, distinct values,
# ordered from the emptiest column to the fullest
non_null_counts = data.notnull().sum()
column_overview = pd.DataFrame({
        'Type' : data.dtypes,
        'Not Null %' : non_null_counts/len(data)*100,
        'Not Null' : non_null_counts,
        'Unique' : data.nunique(),
    })
display(column_overview.sort_values(by=['Not Null %']))

Unnamed: 0,Type,Not Null %,Not Null,Unique
Domain,object,50.003561,70207,69832
NoOfExternalRef,float64,50.586166,71025,951
LineOfCode,float64,50.74713,71251,8311
HasSocialNet,float64,51.569044,72405,2
LargestLineLength,float64,51.619612,72476,17076
NoOfURLRedirect,float64,52.007065,73020,2
HasCopyrightInfo,float64,52.034842,73059,2
NoOfCSS,float64,52.185123,73270,178
NoOfObfuscatedChar,float64,52.424432,73606,8
NoOfSelfRedirect,float64,52.483547,73689,2


In [153]:
# rows whose NoOfOtherSpecialCharsInURL is below 2, shown next to their Domain
data.loc[data['NoOfOtherSpecialCharsInURL'] < 2, ['Domain', 'NoOfOtherSpecialCharsInURL']]

Unnamed: 0,Domain,NoOfOtherSpecialCharsInURL
4,www.nyprowrestling.com,1.0
6,www.ridemcts.com,1.0
8,www.az511.com,1.0
9,www.screenbeam.com,1.0
10,,1.0
...,...,...
140393,www.fites.net,1.0
140396,www.genevalakemuseum.org,1.0
140397,,1.0
140401,www.leadcastingcall.com,1.0


In [154]:
# Collapse rows that share a URL: every column of such a group is overwritten with
# the group's first non-null value, after which the rows are identical and
# drop_duplicates() removes the extras.
url_counts = data['URL'].value_counts()
dupeURL = url_counts[url_counts > 1].index.to_list()

for url in dupeURL:
    same_url = data['URL'] == url
    # snapshot of the group, so the values read below are not affected by the writes
    group = data[same_url].copy()
    for col in group.columns:
        known = group[col].dropna().unique()
        if len(known) == 0:
            # no row of the group has this value; leave the column missing
            continue
        data.loc[same_url, col] = known[0]
data.drop_duplicates(inplace=True)

In [155]:
# URLLength
# rows that have a URL but no URLLength yet; their URLs are remembered for the preview
missing_url_length = data['URL'].notna() & data['URLLength'].isna()
URL_URLLength = data.loc[missing_url_length, 'URL'].to_list()

# script to fill: the length is the character count of the URL string
data.loc[missing_url_length, 'URLLength'] = data['URL'].str.len()

# preview
data.loc[data['URL'].isin(URL_URLLength), ['URL', 'URLLength']]

Unnamed: 0,URL,URLLength
1,http://uqr.to/1il1z,19.0
7,https://www.epner.com,21.0
8,https://www.az511.com,21.0
10,https://www.hellscent.com,25.0
13,https://www.uncrazed.com,24.0
...,...,...
140381,https://kg56rf-bur5g7.firebaseapp.com/,38.0
140383,https://www.pelagia.org,23.0
140384,https://www.ilostmydog.com,26.0
140385,https://www.expostandzone.com,29.0


In [156]:
# Domain
# URLs of the rows whose Domain is still missing (used by the preview at the end of the cell)
URL_Domain = data.loc[data['URL'].notna() & data['Domain'].isna(), 'URL'].to_list()

# script
def get_domain_name(url):
    """Return the host part of ``url`` as reported by ``urlparse``.

    ``url`` is converted with ``str`` first. The result is ``None`` when
    ``urlparse`` finds no network location (e.g. the text has no scheme).
    """
    parsed = urlparse(str(url))
    return parsed.hostname
# fill only the rows that have a URL but no Domain; the assignment aligns on the index
missing_domain = data['URL'].notna() & data['Domain'].isna()
data.loc[missing_domain, 'Domain'] = data['URL'].apply(get_domain_name)

# preview
data.loc[data['URL'].isin(URL_Domain), ['URL', 'Domain']]

Unnamed: 0,URL,Domain
1,http://uqr.to/1il1z,uqr.to
5,https://www.free-marine.com,www.free-marine.com
7,https://www.epner.com,www.epner.com
10,https://www.hellscent.com,www.hellscent.com
13,https://www.uncrazed.com,www.uncrazed.com
...,...,...
140387,https://www.mdx.edu.mt,www.mdx.edu.mt
140389,https://www.navigazionelaghi.it,www.navigazionelaghi.it
140395,https://www.inspiredherway.com,www.inspiredherway.com
140397,https://www.nnry.com,www.nnry.com


In [157]:
# DomainLength
# rows that have a Domain but no DomainLength yet; their domains are kept for the preview
missing_domain_length = data['Domain'].notna() & data['DomainLength'].isna()
Domain_DomainLength = data.loc[missing_domain_length, 'Domain'].to_list()

# script: the length is the character count of the Domain string
data.loc[missing_domain_length, 'DomainLength'] = data['Domain'].str.len()

# preview
data.loc[data['Domain'].isin(Domain_DomainLength), ['Domain', 'DomainLength']]

Unnamed: 0,Domain,DomainLength
1,uqr.to,6.0
7,www.epner.com,13.0
15,www.goldreserveinc.com,22.0
17,www.topografix.com,18.0
19,www.saveware.nl,15.0
...,...,...
140387,www.mdx.edu.mt,14.0
140394,www.bmvc2020-conference.com,27.0
140397,www.nnry.com,12.0
140399,www.slavevoyages.org,20.0


In [158]:
# IsDomainIP
# domains of the rows whose IsDomainIP is still missing (used by the preview at the end of the cell)
Domain_IsDomainIP = data.loc[data['Domain'].notna() & data['IsDomainIP'].isna(), 'Domain'].to_list()

# script
def IsIPCheck(obj):
    """Return 1 if ``str(obj)`` is accepted by ``ipaddress.ip_address``, otherwise 0."""
    try:
        ipaddress.ip_address(str(obj))
    except ValueError:
        # ip_address rejects anything that is not a valid IPv4/IPv6 literal
        return 0
    else:
        return 1
# fill only the rows that have a Domain but no IsDomainIP; the assignment aligns on the index
missing_is_ip = data['Domain'].notna() & data['IsDomainIP'].isna()
data.loc[missing_is_ip, 'IsDomainIP'] = data['Domain'].apply(IsIPCheck)

# preview
data.loc[data['Domain'].isin(Domain_IsDomainIP), ['Domain', 'IsDomainIP']]

Unnamed: 0,Domain,IsDomainIP
1,uqr.to,0.0
5,www.free-marine.com,0.0
9,www.screenbeam.com,0.0
10,www.hellscent.com,0.0
14,www.scientistsforeu.uk,0.0
...,...,...
140389,www.navigazionelaghi.it,0.0
140395,www.inspiredherway.com,0.0
140401,www.leadcastingcall.com,0.0
140402,www.fedarb.com,0.0


In [159]:
# NoOfSubDomain
# domains of the rows whose NoOfSubDomain is still missing (used by the preview at the end of the cell)
Domain_NoOfSubDomain = data.loc[data['Domain'].notna() & data['NoOfSubDomain'].isna(), 'Domain'].to_list()

# script
def countSubDomain(obj):
    """Count the dot-separated labels in the subdomain part of ``str(obj)``.

    The subdomain is whatever ``tldextract.extract`` reports in ``.subdomain``;
    an empty subdomain counts as 0.
    """
    subdomain = tldextract.extract(str(obj)).subdomain
    if not subdomain:
        return 0
    # n dots separate n + 1 labels
    return subdomain.count('.') + 1
# fill only the rows that have a Domain but no NoOfSubDomain; the assignment aligns on the index
missing_subdomain_count = data['Domain'].notna() & data['NoOfSubDomain'].isna()
data.loc[missing_subdomain_count, 'NoOfSubDomain'] = data['Domain'].apply(countSubDomain)

# preview
# NOTE(review): the `& data['NoOfSubDomain']` term appears to limit the preview to rows
# with a non-zero count (the numeric column is used as a boolean) - confirm this is intended
preview_rows = data['Domain'].isin(Domain_NoOfSubDomain) & data['NoOfSubDomain']
data.loc[preview_rows, ['Domain', 'NoOfSubDomain']]

Unnamed: 0,Domain,NoOfSubDomain
7,www.epner.com,1.0
8,www.az511.com,1.0
9,www.screenbeam.com,1.0
10,www.hellscent.com,1.0
11,www.marcosimoncellifondazione.it,1.0
...,...,...
140393,www.fites.net,1.0
140396,www.genevalakemuseum.org,1.0
140397,www.nnry.com,1.0
140400,www.greenmountainenergy.com,1.0


In [160]:
# NoOfQMarkInURL
# URLs (kept as a Series) of the rows whose NoOfQMarkInURL is still missing
URL_NoOfQMarkInURL = data.loc[data['URL'].notna() & data['NoOfQMarkInURL'].isna(), 'URL']

# script
def countQMark(obj):
    """Return how many '?' characters occur in ``str(obj)``."""
    return str(obj).count('?')
# fill only the rows that have a URL but no NoOfQMarkInURL; the assignment aligns on the index
missing_qmark_count = data['URL'].notna() & data['NoOfQMarkInURL'].isna()
data.loc[missing_qmark_count, 'NoOfQMarkInURL'] = data['URL'].apply(countQMark)

# preview
data.loc[data['URL'].isin(URL_NoOfQMarkInURL), ['URL', 'NoOfQMarkInURL']]

Unnamed: 0,URL,NoOfQMarkInURL
1,http://uqr.to/1il1z,0.0
7,https://www.epner.com,0.0
8,https://www.az511.com,0.0
9,https://www.screenbeam.com,0.0
13,https://www.uncrazed.com,0.0
...,...,...
140362,https://www.michel-desfayes.org,0.0
140374,https://www.restaumatic.com,0.0
140395,https://www.inspiredherway.com,0.0
140401,https://www.leadcastingcall.com,0.0


In [161]:
# NoOfAmpersandInURL
# URLs of the rows whose NoOfAmpersandInURL is still missing (used by the preview at the end of the cell)
URL_NoOfAmpersandInURL = data.loc[data['URL'].notna() & data['NoOfAmpersandInURL'].isna(), 'URL'].to_list()

# script
def countAmp(obj):
    """Return how many ampersand ('&') characters occur in ``str(obj)``.

    The previous implementation compared each character with '%', so it
    counted percent signs rather than ampersands.
    """
    return str(obj).count('&')
# fill only the rows that have a URL but no NoOfAmpersandInURL; the assignment aligns on the index
missing_amp_count = data['URL'].notna() & data['NoOfAmpersandInURL'].isna()
data.loc[missing_amp_count, 'NoOfAmpersandInURL'] = data['URL'].apply(countAmp)

# preview
data.loc[data['URL'].isin(URL_NoOfAmpersandInURL), ['URL', 'NoOfAmpersandInURL']]

Unnamed: 0,URL,NoOfAmpersandInURL
6,https://www.ridemcts.com,0.0
11,https://www.marcosimoncellifondazione.it,0.0
13,https://www.uncrazed.com,0.0
16,https://www.clare.fm,0.0
20,https://www.cse.uconn.edu,0.0
...,...,...
140379,https://www.123telugu.com,0.0
140385,https://www.expostandzone.com,0.0
140390,https://www.hicbc.com,0.0
140395,https://www.inspiredherway.com,0.0


In [162]:
# IsHTTPS
# URLs of the rows whose IsHTTPS is still missing (used by the preview at the end of the cell)
URL_IsHTTPS = data.loc[data['URL'].notna() & data['IsHTTPS'].isna(), 'URL'].to_list()

# script
def IsHTTPSCheck(obj):
    """Return 1 if ``urlparse`` reports the scheme of ``str(obj)`` as "https", else 0.

    Any exception raised while parsing is treated as "not HTTPS" and yields 0.
    """
    try:
        scheme = urlparse(str(obj)).scheme
    except Exception:
        return 0
    return 1 if scheme == "https" else 0
# fill only the rows that have a URL but no IsHTTPS; the assignment aligns on the index
missing_is_https = data['URL'].notna() & data['IsHTTPS'].isna()
data.loc[missing_is_https, 'IsHTTPS'] = data['URL'].apply(IsHTTPSCheck)

# preview
data.loc[data['URL'].isin(URL_IsHTTPS), ['URL', 'IsHTTPS']]

Unnamed: 0,URL,IsHTTPS
0,https://www.northcm.ac.th,1.0
1,http://uqr.to/1il1z,0.0
2,https://www.woolworthsrewards.com.au,1.0
5,https://www.free-marine.com,1.0
8,https://www.az511.com,1.0
...,...,...
140391,https://www.sligoheritage.com,1.0
140400,https://www.greenmountainenergy.com,1.0
140401,https://www.leadcastingcall.com,1.0
140402,https://www.fedarb.com,1.0


In [163]:
# TLD
# domains of the rows whose TLD is still missing (used by the preview at the end of the cell)
Domain_TLD = data.loc[data['Domain'].notna() & data['TLD'].isna(), 'Domain'].to_list()

# script
def get_TLD(obj):
    """Return the last dot-separated label of the suffix ``tldextract`` finds in ``str(obj)``.

    A multi-label suffix is reduced to its final label; an empty suffix
    gives an empty string.
    """
    suffix = tldextract.extract(str(obj)).suffix
    # rpartition leaves the whole text in the last slot when there is no dot
    return suffix.rpartition('.')[2]
# fill only the rows that have a Domain but no TLD; the assignment aligns on the index
missing_tld = data['Domain'].notna() & data['TLD'].isna()
data.loc[missing_tld, 'TLD'] = data['Domain'].apply(get_TLD)

# preview
data.loc[data['Domain'].isin(Domain_TLD), ['Domain', 'TLD']]

Unnamed: 0,Domain,TLD
0,www.northcm.ac.th,th
4,www.nyprowrestling.com,com
9,www.screenbeam.com,com
11,www.marcosimoncellifondazione.it,it
12,www.marxist.se,se
...,...,...
140397,www.nnry.com,com
140399,www.slavevoyages.org,org
140401,www.leadcastingcall.com,com
140402,www.fedarb.com,com


In [None]:
# TLDLength
# rows that have a TLD but no TLDLength yet; their TLDs are kept for the preview
missing_tld_length = data['TLD'].notna() & data['TLDLength'].isna()
TLD_TLDLength = data.loc[missing_tld_length, 'TLD'].to_list()

# script: the length is the character count of the TLD string
data.loc[missing_tld_length, 'TLDLength'] = data['TLD'].str.len()

# preview
data.loc[data['TLD'].isin(TLD_TLDLength), ['TLD', 'TLDLength']]

In [164]:


# Fill TLDLength: rows with a TLD but no TLDLength get the character count of the TLD
tld_length_missing = data['TLD'].notna() & data['TLDLength'].isna()
data.loc[tld_length_missing, 'TLDLength'] = data['TLD'].str.len()

# Fill CharContinuationRate
def countCCR(obj1,obj2):
    """Character continuation rate of the text before the first '.' in ``str(obj1)``.

    The analysed segment is ``str(obj1).split('.')[0]``. Its characters are
    classified as alphabetic (``str.isalpha``), numeric (``str.isnumeric``) or
    other; the result is the sum of the longest run of each class divided by
    the segment length.

    Parameters
    ----------
    obj1 : the URL; converted with ``str``.
    obj2 : the TLD. Accepted for interface compatibility but it has no effect:
        the segment never contains a '.', so stripping ``'.' + tld`` from it
        (as the earlier code attempted) could never match.
        NOTE(review): confirm whether the whole URL/domain was meant to be analysed.

    Returns
    -------
    float in [0, 1]; 0.0 when the segment is empty (previously an IndexError).

    Compared with the earlier version, the run that starts at the first
    character is now included in the maxima, so e.g. "a" gives 1.0 instead of
    0.0 and "1abc" gives 1.0 instead of 0.75.
    """
    segment = str(obj1).split('.')[0]
    if not segment:
        # nothing to measure; also avoids dividing by zero
        return 0.0

    def char_class(ch):
        # same precedence as before: alphabetic, then numeric, else "special"
        if ch.isalpha():
            return 'a'
        if ch.isnumeric():
            return 'n'
        return 's'

    longest = {'a': 0, 'n': 0, 's': 0}
    run_class = None
    run_length = 0
    for ch in segment:
        cls = char_class(ch)
        # extend the current run or start a new one when the class changes
        run_length = run_length + 1 if cls == run_class else 1
        run_class = cls
        if run_length > longest[cls]:
            longest[cls] = run_length
    return sum(longest.values()) / len(segment)
# countCCR works on a single URL/TLD pair (it calls str() on its arguments), so it has
# to be evaluated row by row. Passing the whole columns would stringify the Series
# itself and write one meaningless value into every missing row.
ccr_missing = data['URL'].notna() & data['CharContinuationRate'].isna()
ccr_inputs = data.loc[ccr_missing, ['URL', 'TLD']]
# build an index-labelled Series so the assignment aligns row by row (and is a no-op
# when nothing is missing)
data.loc[ccr_missing, 'CharContinuationRate'] = pd.Series(
    [countCCR(url, tld) for url, tld in zip(ccr_inputs['URL'], ccr_inputs['TLD'])],
    index=ccr_inputs.index,
    dtype=float,
)
