In [2]:
# Inspecting a CSV file using pandas
import pandas as pd

# Read the raw URL dataset from disk and preview its first five rows.
urldata = pd.read_csv(filepath_or_buffer='../data/raw/urldata.csv')
urldata.head(n=5)

Unnamed: 0.1,Unnamed: 0,url,label,result
0,0,https://www.google.com,benign,0
1,1,https://www.youtube.com,benign,0
2,2,https://www.facebook.com,benign,0
3,3,https://www.baidu.com,benign,0
4,4,https://www.wikipedia.org,benign,0


In [3]:
# Removing the stray "Unnamed: 0" column (presumably an index that was saved
# into the CSV -- confirm against how the file was written).
# errors="ignore" makes this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
urldata = urldata.drop(columns="Unnamed: 0", errors="ignore")
urldata.head()

Unnamed: 0,url,label,result
0,https://www.google.com,benign,0
1,https://www.youtube.com,benign,0
2,https://www.facebook.com,benign,0
3,https://www.baidu.com,benign,0
4,https://www.wikipedia.org,benign,0


In [4]:
# Count missing values per column
urldata.isna().sum()

url       0
label     0
result    0
dtype: int64

In [5]:
# Print column names, non-null counts, dtypes and memory usage of the frame.
urldata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450176 entries, 0 to 450175
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     450176 non-null  object
 1   label   450176 non-null  object
 2   result  450176 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 10.3+ MB


In [6]:
# Normalizing URLs before parsing
from urllib.parse import urlparse
def normalize_url(url):
    """Return a cleaned-up URL string that ``urlparse`` can split reliably.

    Steps, in order:
      1. Non-string input yields the empty string.
      2. Surrounding whitespace is stripped.
      3. The de-fanged dot ``[.]`` is replaced with a plain ``.``.
      4. ``http://`` is prepended when the value does not already start with
         an http/https scheme.

    Parameters
    ----------
    url : Any
        Raw value taken from the ``url`` column.

    Returns
    -------
    str
        The normalized URL, or ``''`` when ``url`` is not a string.
    """
    if not isinstance(url, str):
        return ''
    url = url.strip()
    url = url.replace("[.]", ".")  # Replace obfuscated dots
    # URI schemes are case-insensitive, so compare against a lower-cased copy.
    # A case-sensitive check would turn "HTTPS://host" into
    # "http://HTTPS://host", which then parses with hostname "https".
    if not url.lower().startswith(('http://', 'https://')):  # Add scheme if missing
        url = 'http://' + url
    return url

# Data Preprocessing
#### 1. Lengths
- URL Length
- Hostname Length
- Path Length
- First Directory Length
- Top Level Domain Length
- Query Length

In [7]:
from urllib.parse import urlparse
from tld import get_tld
import os.path

# URL Length: number of characters in the raw value after str() conversion
# (computed on the un-normalized URL, unlike the parsed-component lengths below).
urldata["url_length"] = urldata["url"].apply(lambda x: len(str(x)))

# Hostname Length
def hostname_length(url):
    """Return the character count of the URL's hostname, or 0 if it has none.

    The value is normalized with ``normalize_url`` before being parsed.
    """
    host = urlparse(normalize_url(url)).hostname
    if not host:
        # urlparse reports a missing host as None; treat it like an empty one.
        return 0
    return len(host)
urldata["hostname_length"] = urldata["url"].apply(hostname_length)

# Path Length
def path_length(url):
    """Return the character count of the URL's path component.

    The value is normalized with ``normalize_url`` before being parsed.
    """
    return len(urlparse(normalize_url(url)).path)
urldata["path_length"] = urldata["url"].apply(path_length)

# First Directory Length
def first_directory_length(url):
    """Return the length of the first path segment after the leading slash.

    The path of the normalized URL is split on '/'; the element at index 1 is
    the first directory. When the path has no '/' at all the result is 0.
    """
    segments = urlparse(normalize_url(url)).path.split('/')
    if len(segments) <= 1:
        return 0
    return len(segments[1])
urldata["first_directory_length"] = urldata["url"].apply(first_directory_length)

# Top Level Domain Length
def tld_length(url):
    """Return the length of the URL's top-level domain, or 0 if unavailable.

    The value is normalized with ``normalize_url`` and handed to ``get_tld``
    with ``fail_silently=True``. A falsy result, or any ordinary exception
    raised during the lookup, yields 0.

    Parameters
    ----------
    url : Any
        Raw value taken from the ``url`` column.

    Returns
    -------
    int
        Number of characters in the extracted TLD, or 0.
    """
    url = normalize_url(url)
    try:
        tld = get_tld(url, fail_silently=True)
        return len(tld) if tld else 0
    except Exception:
        # Deliberately best-effort, but catch Exception rather than using a
        # bare `except:` so KeyboardInterrupt / SystemExit still propagate and
        # a kernel interrupt can stop the long-running apply().
        return 0
urldata["tld_length"] = urldata["url"].apply(tld_length)

# Query Length
def query_length(url):
    """Return the character count of the URL's query string (the part after '?').

    The value is normalized with ``normalize_url`` before being parsed.
    """
    return len(urlparse(normalize_url(url)).query)
urldata["query_length"] = urldata["url"].apply(query_length)

urldata.head()

Unnamed: 0,url,label,result,url_length,hostname_length,path_length,first_directory_length,tld_length,query_length
0,https://www.google.com,benign,0,22,14,0,0,3,0
1,https://www.youtube.com,benign,0,23,15,0,0,3,0
2,https://www.facebook.com,benign,0,24,16,0,0,3,0
3,https://www.baidu.com,benign,0,21,13,0,0,3,0
4,https://www.wikipedia.org,benign,0,25,17,0,0,3,0


#### 2. Counts
- Special Characters
    - !@#$%^&*()-_=+[]{}|;:'\",.<>?/\\`~
- Count Of 'www'
- Count Of Digits
- Count Of Letters
- Count Of Number Of Directories
- Count Of Number Of Subdomains
- Count Of Query Parameters

In [8]:
import tldextract

def special_char_count(url):
    """Count how many characters of ``str(url)`` are in the special-character set."""
    specials = "!@#$%^&*()-_=+[]{}|;:'\",.<>?/\\`~"
    total = 0
    for ch in str(url):
        if ch in specials:
            total += 1
    return total
# Store the special-character count of every URL as a feature column.
urldata["special_char_count"] = urldata["url"].apply(special_char_count)

# Count of 'www': non-overlapping occurrences of the substring anywhere in the
# stringified URL (not restricted to the hostname).
urldata["count-www"] = urldata["url"].apply(lambda x: str(x).count('www'))

# Digit Count
def digit_count(url):
    """Return how many characters of ``str(url)`` satisfy ``str.isdigit``."""
    return len([ch for ch in str(url) if ch.isdigit()])
# Store the digit count of every URL as a feature column.
urldata["digit_count"] = urldata["url"].apply(digit_count)

# Letter Count
def letter_count(url):
    """Return how many characters of ``str(url)`` satisfy ``str.isalpha``."""
    return len([ch for ch in str(url) if ch.isalpha()])
# Store the letter count of every URL as a feature column.
urldata["letter_count"] = urldata["url"].apply(letter_count)

# Directory Count
def dir_count(url):
    """Return the number of '/' characters in the path of the normalized URL."""
    url_path = urlparse(normalize_url(url)).path
    return url_path.count('/')
urldata["dir_count"] = urldata["url"].apply(dir_count)

# Subdomain Count
def subdomain_count(url):
    """Return the number of dot-separated labels in the hostname's subdomain part.

    The hostname of the normalized URL is split by ``tldextract.extract``; an
    empty subdomain yields 0, otherwise the labels are counted.
    """
    host = urlparse(normalize_url(url)).hostname or ''
    subdomain = tldextract.extract(host).subdomain
    return len(subdomain.split('.')) if subdomain else 0
urldata["subdomain_count"] = urldata["url"].apply(subdomain_count)

# Query Parameter Count
def query_param_count(url):
    """Return the number of '&'-separated pieces in the URL's query string.

    An empty query yields 0; otherwise the result is the number of '&'
    characters plus one.
    """
    query_string = urlparse(normalize_url(url)).query
    if not query_string:
        return 0
    return 1 + query_string.count('&')
urldata["query_param_count"] = urldata["url"].apply(query_param_count)

urldata.head()

Unnamed: 0,url,label,result,url_length,hostname_length,path_length,first_directory_length,tld_length,query_length,special_char_count,count-www,digit_count,letter_count,dir_count,subdomain_count,query_param_count
0,https://www.google.com,benign,0,22,14,0,0,3,0,5,1,0,17,0,1,0
1,https://www.youtube.com,benign,0,23,15,0,0,3,0,5,1,0,18,0,1,0
2,https://www.facebook.com,benign,0,24,16,0,0,3,0,5,1,0,19,0,1,0
3,https://www.baidu.com,benign,0,21,13,0,0,3,0,5,1,0,16,0,1,0
4,https://www.wikipedia.org,benign,0,25,17,0,0,3,0,5,1,0,20,0,1,0


#### 3. Binary Features
- Has IP
- Uses HTTPS
- Has Suspicious Extension
- Uses Shortener

In [None]:
import re
import ipaddress

# Has IP Address
def has_ip_address(url):
    """Return 1 when the hostname of the normalized URL is an IP literal, else 0.

    The hostname is validated with ``ipaddress.ip_address``; a ValueError from
    that call means the host is not an IP address.
    """
    parsed_host = urlparse(normalize_url(url)).hostname
    candidate = parsed_host if parsed_host else ''
    try:
        ipaddress.ip_address(candidate)
    except ValueError:
        return 0
    else:
        return 1
urldata["has_ip_address"] = urldata["url"].apply(has_ip_address)

# Uses HTTPS
def uses_https(url):
    """Return 1 when the normalized URL's scheme is 'https', otherwise 0."""
    scheme = urlparse(normalize_url(url)).scheme
    if scheme == 'https':
        return 1
    return 0
urldata["uses_https"] = urldata["url"].apply(uses_https)

# Has Suspicious Extension
def suspicious_extension(url):
    """Return 1 when the lower-cased URL path ends with a flagged file extension, else 0."""
    flagged = ('.exe', '.zip', '.rar', '.scr', '.pif', '.bat', '.cmd', '.js', '.vbs')
    lowered_path = urlparse(normalize_url(url)).path.lower()
    # str.endswith accepts a tuple and is true if any suffix matches.
    return 1 if lowered_path.endswith(flagged) else 0
urldata["suspicious_extension"] = urldata["url"].apply(suspicious_extension)

# Uses Shortening
# Compiled alternation of known URL-shortener domains.
# NOTE: the alternatives are unanchored, so Pattern.search() also finds them
# inside longer names (e.g. 't.co' occurs in 'microsoft.com').
shortening_services = re.compile( 
    r'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|ow\.ly|t\.co|tinyurl|is\.gd|cli\.gs|' 
    r'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|' 
    r'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|' 
    r'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|lnkd\.in|' 
    r'db\.tt|qr\.ae|adf\.ly|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|ity\.im|' 
    r'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|' 
    r'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|' 
    r'tr\.im|link\.zip\.net' 
    , re.IGNORECASE               # needed because some entries contain capitals (BudURL, Just.as) while uses_shortening lower-cases the hostname
)

def uses_shortening(url):
    """Return 1 when the URL's host is a known shortening service, else 0.

    The hostname of the normalized URL is lower-cased and every dot-delimited
    suffix of it (``a.bit.ly`` -> ``a.bit.ly``, ``bit.ly``, ``ly``) is tested
    against ``shortening_services`` with ``fullmatch``. Matching whole suffixes
    rather than substrings avoids false positives: a plain ``search`` reports
    ``microsoft.com`` (contains ``t.co``) or ``dropbox.com`` (contains
    ``x.co``) as shorteners.

    Parameters
    ----------
    url : Any
        Raw value taken from the ``url`` column.

    Returns
    -------
    int
        1 if the host, or one of its parent domains, is a listed shortener; 0 otherwise.
    """
    hostname = (urlparse(normalize_url(url)).hostname or '').lower()
    labels = hostname.split('.')
    for start in range(len(labels)):
        # Rebuild the suffix that begins at this label and require it to match
        # one listed service in its entirety.
        if shortening_services.fullmatch('.'.join(labels[start:])):
            return 1
    return 0
urldata["uses_shortening"] = urldata["url"].apply(uses_shortening)

urldata.head()

Unnamed: 0,url,label,result,url_length,hostname_length,path_length,first_directory_length,tld_length,query_length,special_char_count,count-www,digit_count,letter_count,dir_count,subdomain_count,query_param_count,has_ip_address,uses_https,suspicious_extension,uses_shortening
0,https://www.google.com,benign,0,22,14,0,0,3,0,5,1,0,17,0,1,0,0,1,0,0
1,https://www.youtube.com,benign,0,23,15,0,0,3,0,5,1,0,18,0,1,0,0,1,0,0
2,https://www.facebook.com,benign,0,24,16,0,0,3,0,5,1,0,19,0,1,0,0,1,0,0
3,https://www.baidu.com,benign,0,21,13,0,0,3,0,5,1,0,16,0,1,0,0,1,0,0
4,https://www.wikipedia.org,benign,0,25,17,0,0,3,0,5,1,0,20,0,1,0,0,1,0,0


#### 4. Entropy
- Shannon Entropy

In [10]:
# Shannon Entropy
import math
def shannon_entropy(url):
    """Return the Shannon entropy (in bits per character) of ``str(url)``.

    Each distinct character contributes ``-p * log2(p)``, where ``p`` is its
    relative frequency in the string. An empty string yields 0.
    """
    text = str(url)
    total = len(text)
    if total == 0:
        return 0
    # Tally occurrences in first-seen order; the accumulation below follows
    # that same order.
    occurrences = {}
    for symbol in text:
        if symbol in occurrences:
            occurrences[symbol] += 1
        else:
            occurrences[symbol] = 1
    result = 0.0
    for tally in occurrences.values():
        share = tally / total
        result -= share * math.log2(share)
    return result
# Store the character-level Shannon entropy of every URL as a feature column.
urldata["shannon_entropy"] = urldata["url"].apply(shannon_entropy)

# Preview the frame with the new column appended.
urldata.head()

Unnamed: 0,url,label,result,url_length,hostname_length,path_length,first_directory_length,tld_length,query_length,special_char_count,...,digit_count,letter_count,dir_count,subdomain_count,query_param_count,has_ip_address,uses_https,suspicious_extension,uses_shortening,shannon_entropy
0,https://www.google.com,benign,0,22,14,0,0,3,0,5,...,0,17,0,1,0,0,1,0,0,3.663533
1,https://www.youtube.com,benign,0,23,15,0,0,3,0,5,...,0,18,0,1,0,0,1,0,0,3.762267
2,https://www.facebook.com,benign,0,24,16,0,0,3,0,5,...,0,19,0,1,0,0,1,0,0,3.855389
3,https://www.baidu.com,benign,0,21,13,0,0,3,0,5,...,0,16,0,1,0,0,1,0,0,3.88018
4,https://www.wikipedia.org,benign,0,25,17,0,0,3,0,5,...,0,20,0,1,0,0,1,0,0,3.813661


#### 5. Ratios
- Digit Ratio
- Letter Ratio
- Special Character Ratio

In [11]:
# Digit Ratio
def digit_ratio(url):
    """Return the fraction of characters in ``str(url)`` that are digits (0 for an empty string)."""
    text = str(url)
    if not text:
        return 0
    digit_total = len([ch for ch in text if ch.isdigit()])
    return digit_total / len(text)
# Store the digit ratio of every URL as a feature column.
urldata["digit_ratio"] = urldata["url"].apply(digit_ratio)

# Letter Ratio
def letter_ratio(url):
    """Return the fraction of characters in ``str(url)`` that are letters (0 for an empty string)."""
    text = str(url)
    if not text:
        return 0
    letter_total = len([ch for ch in text if ch.isalpha()])
    return letter_total / len(text)
# Store the letter ratio of every URL as a feature column.
urldata["letter_ratio"] = urldata["url"].apply(letter_ratio)

# Special Character Ratio
def special_char_ratio(url):
    """Return the fraction of characters in ``str(url)`` that are special characters.

    An empty string yields 0.
    """
    specials = "!@#$%^&*()-_=+[]{}|;:'\",.<>?/\\`~"
    text = str(url)
    if not text:
        return 0
    hits = 0
    for ch in text:
        if ch in specials:
            hits += 1
    return hits / len(text)
# Store the special-character ratio of every URL as a feature column.
urldata["special_char_ratio"] = urldata["url"].apply(special_char_ratio)

# Transposed preview: one row per feature, so the wide frame is readable.
urldata.head().T

Unnamed: 0,0,1,2,3,4
url,https://www.google.com,https://www.youtube.com,https://www.facebook.com,https://www.baidu.com,https://www.wikipedia.org
label,benign,benign,benign,benign,benign
result,0,0,0,0,0
url_length,22,23,24,21,25
hostname_length,14,15,16,13,17
path_length,0,0,0,0,0
first_directory_length,0,0,0,0,0
tld_length,3,3,3,3,3
query_length,0,0,0,0,0
special_char_count,5,5,5,5,5
