# Data Collection

All imports needed for this Python notebook:

In [1]:
# Request URL
import requests
import urllib
# Web Scraping
from bs4 import BeautifulSoup
# Managing Datasets
import pandas as pd

import json
# Managing temporary files
import os

## Whitelist Specific

In [2]:
# Hold whitelist links while retrieving them
whitelist_links = []

In [3]:
# Get first ten pages of Hypestat top sites ranking
# Goes up to 2000
max_page = 10

for i in range(10):
    page = i + 1
    if i == 1:
        page = ''
    result = requests.get(f'https://hypestat.com/top-sites/{page}')
    soup = BeautifulSoup(result.content, "html.parser")
    rows = soup.find_all('dt')
    rows = [r.find_all('a') for r in rows]
    for row in rows:
        link = row[len(row)-1].get('href')
        link = link.split('https://hypestat.com/info/')[1]
        whitelist_links.append(link)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [4]:
# Get top 50 sites from all countries through Alexa ranking
country_codes = []

result = requests.get(f'https://www.alexa.com/topsites/countries')
soup = BeautifulSoup(result.content, "html.parser")
spans = soup.find_all("ul", {"class": "span3"})
lists = [r.find_all('li') for r in spans]
for list_ in lists:
    for listitem in list_:
        code = listitem.find('a').get('href')
        country_codes.append(code)     

for code in country_codes:
    result = requests.get(f'https://www.alexa.com/topsites/{code}')
    soup = BeautifulSoup(result.content, "html.parser")
    divs = soup.find_all("div", {"class": "DescriptionCell"})
    for link_cell in divs:
        link = link_cell.find('a').get('href')
        link = link.split('/siteinfo/')[1]
        whitelist_links.append(link)

In [5]:
# Remove whitespace and duplicates, print length of list
whitelist_links = [link for link in whitelist_links if link]
whitelist_links = list(set(whitelist_links))
len(whitelist_links)

5398

In [6]:
whitelist_df = pd.DataFrame({'url':whitelist_links})

In [7]:
whitelist_df['label']='good'
whitelist_df.head()

Unnamed: 0,url,label
0,finanzen.net,good
1,quizizz.com,good
2,vkmag.com,good
3,ae.com,good
4,viva.co.id,good


## Blacklist Specific

In [8]:
blacklist_urls = []

In [9]:
phishtank_urls = requests.get('http://data.phishtank.com/data/online-valid.json')
phishtank_urls = [u.get('url') for u in phishtank_urls.json()]
blacklist_urls.extend(phishtank_urls)

In [10]:
urlhaus_urls = requests.get('https://urlhaus.abuse.ch/downloads/text/')
urlhaus_urls = urlhaus_urls.text.split('\r\n')
urlhaus_urls = [url for url in urlhaus_urls if not str(url).startswith('#')]
blacklist_urls.extend(urlhaus_urls)

In [11]:
# Remove whitespace and duplicates, print length of list
blacklist_urls = [link for link in blacklist_urls if link]
blacklist_urls = list(set(blacklist_urls))
print(len(blacklist_urls))

1068571


In [12]:
blacklist_df = pd.DataFrame({'url':blacklist_urls})
blacklist_df['label'] = 'bad'
blacklist_df.head()

Unnamed: 0,url,label
0,http://185.244.25.239/OwO/Tsunami.x86,bad
1,http://42.230.88.134:49922/Mozi.a,bad
2,http://165.227.169.191/[x32],bad
3,http://117.242.210.128:46271/Mozi.m,bad
4,http://182.116.65.218:39149/Mozi.m,bad


## External Mixed

https://raw.githubusercontent.com/Hipo/university-domains-list/master/world_universities_and_domains.json

https://raw.githubusercontent.com/faizann24/Using-machine-learning-to-detect-malicious-URLs/master/data/data2.csv

https://raw.githubusercontent.com/faizann24/Using-machine-learning-to-detect-malicious-URLs/master/data/data.csv

In [13]:
uni_urls = 'https://raw.githubusercontent.com/Hipo/university-domains-list/master/world_universities_and_domains.json'

uni_domains = requests.get(uni_urls).content
uni_domains = json.loads(uni_domains)
uni_domains = [u.get('web_pages') for u in uni_domains]
uni_domains = [domain for sublist in uni_domains for domain in sublist]

In [14]:
external_df = pd.DataFrame({'url':uni_domains})
external_df['label'] = 'good'
external_df.head()

Unnamed: 0,url,label
0,http://www.marywood.edu,good
1,https://www.cstj.qc.ca,good
2,https://ccmt.cstj.qc.ca,good
3,https://ccml.cstj.qc.ca,good
4,http://www.lindenwood.edu/,good


In [15]:
url = 'https://raw.githubusercontent.com/faizann24/Using-machine-learning-to-detect-malicious-URLs/master/data/data.csv'
data1 = requests.get(url).text.split('\n')
with open('url_data.csv', 'w', encoding="utf-8") as file:
    for line in data1:
        file.write(line)
        file.write('\n')
        
data1_df = pd.read_csv('url_data.csv', sep=',')
#os.remove('url_data1.csv')
data1_df.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


In [16]:
# All labels in this dataset are bad
url = 'https://raw.githubusercontent.com/faizann24/Using-machine-learning-to-detect-malicious-URLs/master/data/data2.csv'
data2 = requests.get(url).text.split('\n')
with open('url_data2.csv', 'w', encoding="utf-8") as file:
    for line in data2:
        file.write(line)
        file.write('\n')
        
data2_df = pd.read_csv('url_data2.csv', sep=',', header=None, names=['url', 'label'])
#os.remove('url_data2.csv')
data2_df.head()

Unnamed: 0,url,label
0,hottraveljobs.com/forum/docs/info.php,bad
1,news.grouptumbler.com/news/feed.php,bad
2,info.leveldelta.com/php/text.php,bad
3,citroen-club.ch/n.exe,bad
4,zehir4.asp,bad


In [17]:
external_df = pd.concat([external_df, data1_df, data2_df])
external_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 463075 entries, 0 to 32875
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     463075 non-null  object
 1   label   463075 non-null  object
dtypes: object(2)
memory usage: 10.6+ MB


## Data Evaluation

In [18]:
def domain_scheme(row):
    if row['url'].startswith('http'):
        return row['url']
    
    domain = row['url']
    url = f'https://www.{domain}'
    try:
        requests.get(url, timeout = 5)
        return url
    except:
        return f'http://www.{domain}'

In [19]:
url_df = pd.concat([blacklist_df, whitelist_df, external_df])
print(url_df.groupby('label').nunique())
url_df.info()

           url
label         
bad    1135019
good    359886
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1537044 entries, 0 to 32875
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   url     1537044 non-null  object
 1   label   1537044 non-null  object
dtypes: object(2)
memory usage: 35.2+ MB


In [20]:
minority_count = len(url_df.loc[url_df['label'] == 'good'])
minority_count

359954

In [21]:
url_df.duplicated(subset=['url'])
url_df.drop_duplicates(subset=['url'])
url_df.head()

Unnamed: 0,url,label
0,http://185.244.25.239/OwO/Tsunami.x86,bad
1,http://42.230.88.134:49922/Mozi.a,bad
2,http://165.227.169.191/[x32],bad
3,http://117.242.210.128:46271/Mozi.m,bad
4,http://182.116.65.218:39149/Mozi.m,bad


Here we reduce and balance the dataset because of two reasons. Firstly, the dataset is quite imbalanced and secondly there is too much data and processing it takes too long.

In [22]:
good_df = url_df[url_df.label == "good"].sample(minority_count)
bad_df = url_df[url_df.label == "bad"].sample(minority_count)
url_df = pd.concat([good_df, bad_df])
print(url_df.groupby('label').nunique())
url_df.info()

          url
label        
bad    355208
good   359886
<class 'pandas.core.frame.DataFrame'>
Int64Index: 719908 entries, 156712 to 166477
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     719908 non-null  object
 1   label   719908 non-null  object
dtypes: object(2)
memory usage: 16.5+ MB


In [None]:
url_df['url'] = url_df.apply(lambda row: domain_scheme(row), axis=1)
url_df.head()

In [None]:
url_df.to_csv('raw_data/urls.csv', index=False)