In [1]:
import time
import os
import requests
import random
import sqlite3

import tensorflow as tf         # noqa: F401
import tensorflow_text as text  # noqa: F401
import tensorflow_hub as hub    # noqa: F401
import numpy as np              # noqa: F401
from bs4 import BeautifulSoup   # noqa: F401

In [2]:
#helper functions

def load_topics() -> list[str]:
    """Read the site URLs from ``top_1000_websites.csv``.

    The file is read as a header row followed by comma-separated rows whose
    second column is the URL (optionally wrapped in double quotes).

    Returns:
        The URL column of every data row, with surrounding whitespace and
        double quotes removed. Blank lines, rows without a second column and
        rows with an empty URL are skipped instead of raising ``ValueError``.
    """
    urls = []
    with open("top_1000_websites.csv", "r") as f:
        next(f, None)  # skip the header row (also tolerates an empty file)
        for line in f:
            columns = line.split(",")
            if len(columns) < 2:
                continue  # blank or malformed line, e.g. a trailing newline
            url = columns[1].strip().replace('"', "")
            if url:
                urls.append(url)
    return urls

def get_random_header() -> dict[str, str]:
    """Build six browser-like request header sets and return one of them at random.

    All six candidate dicts are built on every call (so every ``random`` draw
    below happens each time, whichever dict ends up being chosen). One dict is
    then picked with ``random.choice`` and a ``Content-Type`` entry is added to
    it before it is returned.

    Returns:
        A dict mapping HTTP header names to string values.
    """
    headers = [
        { #set 1: Chromium/Chrome client hints combined with a Mac-style User-Agent
            "Connection": "keep-alive", #ask the server to keep the connection open
            "Cache-Control": "max-age=0", #stored responses must be revalidated before reuse
            "sec-ch-ua": f'"Chromium";v="{random.randint(90, 95)}", "Google Chrome";v="{random.randint(90, 95)}", ";Not A Brand";v="99"', #client hint: browser brand list; the two versions (90-95) are drawn independently
            "sec-ch-ua-mobile": "?0", #client hint: not a mobile device
            "sec-ch-ua-platform": f'"{random.choice(["macOS", "Windows", "Linux", "iPhone", "iPad", "Android"])}"', #client hint: platform name, picked at random (independently of the User-Agent)
            "Upgrade-Insecure-Requests": "1", #signals that the client prefers an https version of the page
            "User-Agent": f"Mozilla/5.0 ({random.choice(['Macintosh', 'Windows', 'Linux', 'iPhone', 'iPad', 'Android'])}; Intel Mac OS X 10_{random.randint(10, 15)}_{random.randint(1, 9)}) AppleWebKit/{random.randint(500, 599)}.36 (KHTML, like Gecko)", #NOTE(review): the device token is random but "Intel Mac OS X" is always appended, so the string can contradict itself
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8", #preferred response media types, with a catch-all at lower priority
            "Sec-Fetch-Site": "none", #fetch metadata: request was not initiated by another site (e.g. typed URL)
            "Sec-Fetch-Mode": "navigate", #fetch metadata: this is a navigation request
            "Sec-Fetch-User": "?1", #fetch metadata: navigation was triggered by a user action
            "Sec-Fetch-Dest": "document", #fetch metadata: the response is destined to be a document
            "Accept-Encoding": "gzip, deflate, br", #compressions the client claims to support; NOTE(review): "br" is only decodable by requests when a brotli package is installed - confirm
            "Accept-Language": "en-US, ar, zh-CN, fr, de, it, ja, ko, ru, es", #languages the client claims to accept
        },
        {#set 2: Firefox on Windows
            "Connection": "keep-alive", #ask the server to keep the connection open
            "Cache-Control": "max-age=0", #stored responses must be revalidated before reuse
            "sec-ch-ua": f'"Mozilla";v="{random.randint(80, 89)}.0", "Firefox";v="{random.randint(80, 89)}.0"', #client hint: browser brand list with independently drawn versions (80-89)
            "sec-ch-ua-mobile": "?0", #client hint: not a mobile device
            "sec-ch-ua-platform": f'"{random.choice(["Windows", "Linux", "Macintosh"])}"', #client hint: platform name, picked at random (independently of the User-Agent)
            "Upgrade-Insecure-Requests": "1", #signals that the client prefers an https version of the page
            "User-Agent": f"Mozilla/5.0 ({random.choice(['Windows NT 10.0; Win64; x64', 'Windows NT 10.0; Win64; x64; rv:89.0', 'Windows NT 6.1; Win64; x64', 'Windows NT 6.1; Win64; x64; rv:89.0', 'Windows NT 6.3; Win64; x64', 'Windows NT 6.3; Win64; x64; rv:89.0'])}) Gecko/20100101 Firefox/{random.randint(80, 89)}.0", #Windows platform token picked at random, Firefox version 80-89
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", #preferred response media types, with a catch-all at lower priority
            "Sec-Fetch-Site": "none", #fetch metadata: request was not initiated by another site (e.g. typed URL)
            "Sec-Fetch-Mode": "navigate", #fetch metadata: this is a navigation request
            "Sec-Fetch-User": "?1", #fetch metadata: navigation was triggered by a user action
            "Sec-Fetch-Dest": "document", #fetch metadata: the response is destined to be a document
            "Accept-Encoding": "gzip, deflate, br", #compressions the client claims to support
            "Accept-Language": "en-US, ar, zh-CN, fr, de, it, ja, ko, ru, es", #languages the client claims to accept
        },
        {#set 3: Edge on Windows
            "Connection": "keep-alive", #ask the server to keep the connection open
            "Cache-Control": "max-age=0", #stored responses must be revalidated before reuse
            "sec-ch-ua": f'"Microsoft Edge";v="{random.randint(80, 89)}", "Edg";v="{random.randint(80, 89)}"', #client hint: browser brand list with independently drawn versions (80-89)
            "sec-ch-ua-mobile": "?0", #client hint: not a mobile device
            "sec-ch-ua-platform": f'"{random.choice(["Windows", "Linux", "Macintosh"])}"', #client hint: platform name, picked at random (independently of the User-Agent)
            "Upgrade-Insecure-Requests": "1", #signals that the client prefers an https version of the page
            "User-Agent": f"Mozilla/5.0 ({random.choice(['Windows NT 10.0; Win64; x64', 'Windows NT 10.0; Win64; x64; rv:89.0', 'Windows NT 6.1; Win64; x64', 'Windows NT 6.1; Win64; x64; rv:89.0', 'Windows NT 6.3; Win64; x64', 'Windows NT 6.3; Win64; x64; rv:89.0'])}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(90, 95)}.0.0.0 Safari/537.36 Edg/{random.randint(80, 89)}.0.{random.randint(1000, 9999)}.0", #Windows platform token picked at random, Chrome 90-95 with an Edg/80-89 suffix
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", #preferred response media types, with a catch-all at lower priority
            "Sec-Fetch-Site": "none", #fetch metadata: request was not initiated by another site (e.g. typed URL)
            "Sec-Fetch-Mode": "navigate", #fetch metadata: this is a navigation request
            "Sec-Fetch-User": "?1", #fetch metadata: navigation was triggered by a user action
            "Sec-Fetch-Dest": "document", #fetch metadata: the response is destined to be a document
            "Accept-Encoding": "gzip, deflate, br", #compressions the client claims to support
            "Accept-Language": "en-US, ar, zh-CN, fr, de, it, ja, ko, ru, es", #languages the client claims to accept
        },
        {#set 4: Chromium on Ubuntu/Linux (or macOS)
            "Connection": "keep-alive", #ask the server to keep the connection open
            "Cache-Control": "max-age=0", #stored responses must be revalidated before reuse
            "sec-ch-ua": f'"Chromium";v="{random.randint(90, 95)}", "Google Chrome";v="{random.randint(90, 95)}", ";Not A Brand";v="99"', #client hint: browser brand list; the two versions (90-95) are drawn independently
            "sec-ch-ua-mobile": "?0", #client hint: not a mobile device
            "sec-ch-ua-platform": f'"{random.choice(["Ubuntu", "Linux", "Macintosh"])}"', #client hint: platform name, picked at random (independently of the User-Agent)
            "Upgrade-Insecure-Requests": "1", #signals that the client prefers an https version of the page
            "User-Agent": f"Mozilla/5.0 ({random.choice(['X11; Ubuntu Linux x86_64', 'X11; Linux x86_64', 'Macintosh; Intel Mac OS X 10_15_7'])}) AppleWebKit/{random.randint(500, 599)}.36 (KHTML, like Gecko) Chrome/{random.randint(90, 95)}.0.0.0 Safari/{random.randint(500, 599)}.36", #platform token picked at random; the AppleWebKit and Safari build numbers are drawn independently
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", #preferred response media types, with a catch-all at lower priority
            "Sec-Fetch-Site": "none", #fetch metadata: request was not initiated by another site (e.g. typed URL)
            "Sec-Fetch-Mode": "navigate", #fetch metadata: this is a navigation request
            "Sec-Fetch-User": "?1", #fetch metadata: navigation was triggered by a user action
            "Sec-Fetch-Dest": "document", #fetch metadata: the response is destined to be a document
            "Accept-Encoding": "gzip, deflate, br", #compressions the client claims to support
            "Accept-Language": "en-US, ar, zh-CN, fr, de, it, ja, ko, ru, es", #languages the client claims to accept
        },
        {#set 5: Chrome on a Chromebook (ChromeOS)
            "Connection": "keep-alive", #ask the server to keep the connection open
            "Cache-Control": "max-age=0", #stored responses must be revalidated before reuse
            "sec-ch-ua": f'"Chromium";v="{random.randint(90, 95)}", "Google Chrome";v="{random.randint(90, 95)}", ";Not A Brand";v="99"', #client hint: browser brand list; the two versions (90-95) are drawn independently
            "sec-ch-ua-mobile": "?1", #client hint: claims to be a mobile device (the only set that does)
            "sec-ch-ua-platform": f'"CrOS";v="{random.randint(10000, 99999)}.{random.randint(0, 9)}.{random.randint(0, 9999)}.{random.randint(0, 999)}"', #client hint: ChromeOS with a random four-part version
            "Upgrade-Insecure-Requests": "1", #signals that the client prefers an https version of the page
            "User-Agent": f"Mozilla/5.0 ({random.choice(['X11; CrOS x86_64', 'X11; CrOS armv7l', 'X11; CrOS aarch64'])}) AppleWebKit/{random.randint(500, 599)}.36 (KHTML, like Gecko) Chrome/{random.randint(90, 95)}.0.0.0 Safari/{random.randint(500, 599)}.36", #ChromeOS token with a random CPU architecture, Chrome 90-95
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", #preferred response media types, with a catch-all at lower priority
            "Sec-Fetch-Site": "none", #fetch metadata: request was not initiated by another site (e.g. typed URL)
            "Sec-Fetch-Mode": "navigate", #fetch metadata: this is a navigation request
            "Sec-Fetch-User": "?1", #fetch metadata: navigation was triggered by a user action
            "Sec-Fetch-Dest": "document", #fetch metadata: the response is destined to be a document
            "Accept-Encoding": "gzip, deflate, br", #compressions the client claims to support
            "Accept-Language": "en-US, ar, zh-CN, fr, de, it, ja, ko, ru, es", #languages the client claims to accept
        },
        {#set 6: Chrome on Windows
            "Connection": "keep-alive", #ask the server to keep the connection open
            "Cache-Control": "max-age=0", #stored responses must be revalidated before reuse
            "sec-ch-ua": f'"Chromium";v="{random.randint(90, 95)}", "Google Chrome";v="{random.randint(90, 95)}", ";Not A Brand";v="99"', #client hint: browser brand list; the two versions (90-95) are drawn independently
            "sec-ch-ua-mobile": "?0", #client hint: not a mobile device
            "sec-ch-ua-platform": f'"Windows";v="{random.choice(["10.0", "8.1", "8", "7", "Vista", "XP"])}"', #client hint: Windows with a random version label (independent of the User-Agent's NT version)
            "Upgrade-Insecure-Requests": "1", #signals that the client prefers an https version of the page
            "User-Agent": f"Mozilla/5.0 (Windows NT {random.choice(['10.0', '6.1', '6.3'])}; Win64; x64) AppleWebKit/{random.randint(500, 599)}.36 (KHTML, like Gecko) Chrome/{random.randint(90, 95)}.0.0.0 Safari/{random.randint(500, 599)}.36", #random Windows NT version, Chrome 90-95
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", #preferred response media types, with a catch-all at lower priority
            "Sec-Fetch-Site": "none", #fetch metadata: request was not initiated by another site (e.g. typed URL)
            "Sec-Fetch-Mode": "navigate", #fetch metadata: this is a navigation request
            "Sec-Fetch-User": "?1", #fetch metadata: navigation was triggered by a user action
            "Sec-Fetch-Dest": "document", #fetch metadata: the response is destined to be a document
            "Accept-Encoding": "gzip, deflate, br", #compressions the client claims to support
            "Accept-Language": "en-US, ar, zh-CN, fr, de, it, ja, ko, ru, es", #languages the client claims to accept
        },
    ]
    h = random.choice(headers)
    
    #add a "Content-Type" entry to the chosen dict
    #NOTE(review): Content-Type describes a request body, but these headers are passed to requests.get() - confirm it is needed
    h["Content-Type"] = "text/html; charset=utf-8"
    return h

In [3]:
def make_request(url: str) -> str:
    """Fetch ``url`` with a randomised browser-like header set and return the body as text.

    The body is decoded as UTF-8 when it is valid UTF-8. Otherwise the charset
    reported by the response (``r.encoding``) is used, and as a last resort
    UTF-8 with undecodable bytes dropped. The previous
    ``r.text.encode(r.encoding).decode("utf-8", "ignore")`` round trip lost
    every non-ASCII character of pages that really were in another charset,
    and failed outright when ``r.text`` contained replacement characters.

    Args:
        url: Absolute URL to request.

    Returns:
        The decoded response body, or ``""`` when the request raised or the
        status code was not 200 (the reason is printed in both cases).
    """
    try:
        r = requests.get(
            url,
            headers = get_random_header(),
            timeout = 10, #seconds; applies to connecting and to each read, not to the whole download
            )
        if r.status_code != 200:
            print(f"Error: {r.status_code} for {url}")
            return ""

        body = r.content
        try:
            return body.decode("utf-8")
        except UnicodeDecodeError:
            pass

        #not valid UTF-8: trust the charset the server declared, if it names a known codec
        try:
            return body.decode(r.encoding or "utf-8", "ignore")
        except LookupError:
            return body.decode("utf-8", "ignore")
    except Exception as e:
        #deliberately broad: one bad site must not stop a long crawl
        print(f"Error: {e} for {url}")
        return ""


In [4]:
def get_sitemap_from_robotstxt(robots_txt_content: str) -> list[str]:
    """Extract the sitemap URLs listed in the text of a robots.txt file.

    A line counts when the part before its first ``:`` is ``sitemap`` in any
    letter case. The URL is the first whitespace-separated token after that
    colon, so ``Sitemap:https://...`` (no space), several spaces, Windows
    line endings and trailing remarks are all handled.

    Args:
        robots_txt_content: Full text of a robots.txt file (may be empty).

    Returns:
        The sitemap URLs in file order; lines without a URL are skipped.
    """
    sitemap_urls = []
    for line in robots_txt_content.splitlines():
        field, separator, value = line.partition(":")
        if not separator or field.strip().lower() != "sitemap":
            continue
        tokens = value.split()
        if tokens:
            sitemap_urls.append(tokens[0])
    return sitemap_urls

In [5]:
def links_from_xml(xml_content: str) -> list[str]:
    """Return the text of every ``<loc>`` element found in ``xml_content``.

    Surrounding whitespace is stripped and empty entries are dropped, so
    pretty-printed sitemaps (``<loc>`` text on its own indented line) do not
    produce URLs padded with newlines or spaces.

    Args:
        xml_content: Sitemap XML as text (may be empty).

    Returns:
        The cleaned ``<loc>`` values in document order.
    """
    soup = BeautifulSoup(xml_content, "lxml")
    links = []
    for loc in soup.find_all("loc"):
        link = loc.text.strip()
        if link:
            links.append(link)
    return links

def sitemaps_from_xml(xml_content: str) -> list[str]:
    """Return the URL of every ``<sitemap>`` entry found in ``xml_content``.

    Only the ``<loc>`` child of each ``<sitemap>`` element is used. Taking the
    text of the whole element would also pick up sibling fields such as
    ``<lastmod>`` and the whitespace between them.

    Args:
        xml_content: Sitemap-index XML as text (may be empty).

    Returns:
        The stripped ``<loc>`` values in document order; ``<sitemap>``
        elements without a non-empty ``<loc>`` are skipped.
    """
    soup = BeautifulSoup(xml_content, "lxml")
    sitemap_urls = []
    for sitemap in soup.find_all("sitemap"):
        loc = sitemap.find("loc")
        if loc is None:
            continue
        sitemap_url = loc.text.strip()
        if sitemap_url:
            sitemap_urls.append(sitemap_url)
    return sitemap_urls

In [6]:
#sqlite setup

#sqlite3.connect() creates the database file but not its parent directory,
#so make sure ./data exists before connecting
os.makedirs("./data", exist_ok=True)
db = sqlite3.connect("./data/links.db")

#setup tables: one row per url, duplicates are rejected by the UNIQUE constraint
cmd = """
CREATE TABLE IF NOT EXISTS links (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    url TEXT NOT NULL UNIQUE
)
"""

curser = db.cursor()
curser.execute(cmd)
db.commit()

In [15]:
#sqlite functions

#look the name up instead of evaluating `db` directly: if the setup cell has
#not been run, `if db:` raises NameError before the hint below can be printed
if globals().get("db") is not None:
    print("Database connection established")
else:
    print("run sqlite setup block first")

def add_url(url: str) -> bool:
    """Add a single url string to the database.

    The url is passed as a bound parameter, so urls containing quotes are
    stored verbatim instead of breaking (or altering) the SQL statement.

    Args:
        url: The url to store.

    Returns:
        True if a new row was inserted, False if the url already exists in
        the database.
    """
    cursor = db.cursor()
    #the UNIQUE constraint on links.url makes OR IGNORE skip existing urls
    cursor.execute("INSERT OR IGNORE INTO links (url) VALUES (?)", (url,))
    db.commit()
    return cursor.rowcount == 1

def batch_add_urls(urls: list[str]) -> tuple[int, float]:
    """Insert many urls in one ``executemany`` call, ignoring ones already stored.

    Args:
        urls: The urls to store.

    Returns:
        ``(len(urls), seconds_taken)``. The count is the number of urls
        submitted, not the number of rows actually inserted.
    """
    started = time.time()

    rows = [(url,) for url in urls]
    cursor = db.cursor()
    cursor.executemany("INSERT OR IGNORE INTO links (url) VALUES (?)", rows)
    db.commit()

    submitted = len(urls)
    elapsed = time.time() - started
    return submitted, elapsed

def delete_url(url: str) -> bool:
    """Delete a specific url from the database.

    The url is passed as a bound parameter, so urls containing quotes are
    matched verbatim instead of breaking (or altering) the SQL statement.

    Args:
        url: The url to remove.

    Returns:
        True if the url was deleted, False if it did not exist in the database.
    """
    cursor = db.cursor()
    cursor.execute("DELETE FROM links WHERE url = ?", (url,))
    db.commit()
    return cursor.rowcount > 0

def get_url(id:int) -> str:
    """Get a url from the database by id.

    Args:
        id: Value of the ``id`` column of the wanted row.

    Returns:
        The stored url, or ``""`` when no row has that id.
    """
    cursor = db.cursor()
    #bound parameter: the id is compared as a value, never spliced into the SQL text
    cursor.execute("SELECT url FROM links WHERE id = ?", (id,))
    row = cursor.fetchone()
    return row[0] if row is not None else ""

def get_length() -> int:
    """Get the approximate number of rows in the database.

    This returns the highest id instead of counting rows, trading accuracy
    for speed: it over-counts whenever ids are missing (e.g. after deletes).

    Returns:
        The largest ``id`` in ``links``, or 0 when the table is empty
        (``MAX(id)`` is NULL in that case).
    """
    cursor = db.cursor()
    #cursor.execute("SELECT COUNT(*) FROM links") #this is slow
    cursor.execute("SELECT MAX(id) FROM links") #this is fast
    highest_id = cursor.fetchone()[0]
    return highest_id if highest_id is not None else 0

def get_random_url(num_results: int = 1) -> "str | list[str]":
    """Get (a) random url(s) from the entire database.

    Instead of ``SELECT ... ORDER BY RANDOM()``, random ids between 1 and
    ``get_length()`` are probed until ``num_results`` distinct urls have been
    found; ids that no longer exist are simply skipped. Probing is bounded.
    If the budget runs out (very sparse ids, or fewer rows than requested),
    the remainder is filled with one ``ORDER BY RANDOM()`` query, which is
    slower but always terminates.

    Args:
        num_results: How many distinct urls to return.

    Returns:
        A single url string when ``num_results`` is 1 (``""`` if the table is
        empty), otherwise a list of distinct urls. The list is shorter than
        ``num_results`` only when the table holds fewer rows than requested.
    """
    num_rows = get_length()
    if not num_rows or num_results < 1:
        return "" if num_results == 1 else []

    cursor = db.cursor()
    found = {} #dict keys keep insertion order and drop duplicates
    probes_left = max(1_000, 100 * num_results)

    while len(found) < num_results and probes_left > 0:
        probes_left -= 1
        row_id = random.randint(1, num_rows)
        cursor.execute("SELECT url FROM links WHERE id = ?", (row_id,))
        row = cursor.fetchone()
        if row is not None:
            found[row[0]] = None

    if len(found) < num_results:
        #probing budget exhausted: fall back to the slow but certain query
        cursor.execute("SELECT url FROM links ORDER BY RANDOM() LIMIT ?", (num_results,))
        for (url,) in cursor.fetchall():
            if len(found) == num_results:
                break
            found.setdefault(url, None)

    urls = list(found)
    if num_results == 1:
        return urls[0] if urls else ""
    return urls

def get_sitemap_ids() -> list[int]:
    """Collect the id of every stored url that ends in ``.xml``."""

    #TODO make this more efficient

    cursor = db.cursor()
    rows = cursor.execute("SELECT id FROM links WHERE url LIKE '%.xml'").fetchall()

    sitemap_ids = []
    for row in rows:
        sitemap_ids.append(row[0])
    return sitemap_ids

def get_random_sitemap(sitemap_ids) -> tuple[int, str]:
    """Pick a random id from ``sitemap_ids`` and look up its url.

    Args:
        sitemap_ids: Non-empty sequence of ``links.id`` values.

    Returns:
        ``(id, url)``. The url is ``""`` when no row has the chosen id any
        more (e.g. the id list was saved earlier and the row was deleted
        since), where previously an ``IndexError`` was raised.

    Raises:
        IndexError: If ``sitemap_ids`` is empty (from ``random.choice``).
    """
    id = random.choice(sitemap_ids)

    cursor = db.cursor()
    cursor.execute("SELECT url FROM links WHERE id = ?", (id,))
    row = cursor.fetchone()

    #return id, url
    return id, (row[0] if row is not None else "")

Database connection established


In [8]:
#call get_sitemap_ids() and write it to a file
try:
    sitemap_ids = sitemap_ids  # noqa
except NameError:
    #only "the name does not exist yet" means the ids still have to be loaded;
    #any other error (including KeyboardInterrupt) must not be swallowed here
    print("sitemap_ids not defined yet")
    sitemap_ids = get_sitemap_ids()

#one id per line, read back by the sitemap crawl cell
with open("sitemap_ids.txt", "w") as f:
    for line in sitemap_ids:
        f.write(f"{line}\n")

sitemap_ids not defined yet


In [9]:
#benchmark database functions
#time.perf_counter() is used for the intervals: unlike time.time() it is
#monotonic and has the highest available resolution
test_url = "https://linushorn.dev"

t1 = time.perf_counter()

add_url(test_url)

print(f"add_url: {time.perf_counter() - t1}")
t1 = time.perf_counter()

delete_url(test_url)

print(f"delete_url: {time.perf_counter() - t1}")
t1 = time.perf_counter()

print(get_length())

print(f"get_length: {time.perf_counter() - t1}")
t1 = time.perf_counter()

print(len(get_random_url(1_000)))

print(f"get_random_url: {time.perf_counter() - t1}")

add_url: 0.01600790023803711
delete_url: 0.010517358779907227
747232651
get_length: 0.0010006427764892578
1000
get_random_url: 0.8016164302825928


In [10]:
#logic starts now

def _robots_txt_url(site: str) -> str:
    """Turn a site entry into the absolute url of its robots.txt.

    One trailing slash is dropped and ``https://`` is prepended unless the
    entry already carries an ``http://`` or ``https://`` scheme (an
    ``http://`` entry used to become ``https://http://...``).
    """
    base = site[:-1] if site.endswith("/") else site
    if not base.startswith(("http://", "https://")):
        base = "https://" + base
    return base + "/robots.txt"

#load the topics
topics = load_topics()
robots_txt_urls = [_robots_txt_url(url) for url in topics]

In [11]:
#scrape the robots.txt files

#for every site: download its robots.txt, extract the sitemap urls it lists
#and store them in the links table
for url in robots_txt_urls:
    
    r = make_request(url) #"" when the request failed (make_request prints the reason)
    links = get_sitemap_from_robotstxt(r)
    
    batch_add_urls(links) #uses INSERT OR IGNORE, so urls that are already stored are skipped
    
    print(f"added {len(links)} links from {url}") #reports the links found, not the rows actually inserted

added 0 links from https://facebook.com/robots.txt
added 1 links from https://twitter.com/robots.txt
added 0 links from https://google.com/robots.txt
added 0 links from https://youtube.com/robots.txt
added 0 links from https://s.w.org/robots.txt
added 0 links from https://instagram.com/robots.txt
Error: 404 for https://googletagmanager.com/robots.txt
added 0 links from https://googletagmanager.com/robots.txt
added 0 links from https://linkedin.com/robots.txt
added 0 links from https://ajax.googleapis.com/robots.txt
added 0 links from https://plus.google.com/robots.txt
added 0 links from https://gmpg.org/robots.txt
added 0 links from https://pinterest.com/robots.txt
Error: 404 for https://fonts.gstatic.com/robots.txt
added 0 links from https://fonts.gstatic.com/robots.txt
added 0 links from https://wordpress.org/robots.txt
added 0 links from https://en.wikipedia.org/robots.txt
added 0 links from https://youtu.be/robots.txt
added 0 links from https://maps.google.com/robots.txt
added 15 l

In [17]:

#the ids are stored one per line; blank lines are skipped so that a stray
#empty line cannot make int() raise
with open("./sitemap_ids.txt", "r") as f:
    sitemap_ids = [int(line) for line in f if line.strip()]

sitemaps_remaining = len(sitemap_ids)

#visit every sitemap exactly once in random order: shuffle once and pop from
#the end, which avoids an O(n) list.remove() per iteration
random.shuffle(sitemap_ids)
while sitemap_ids:
    id = sitemap_ids.pop()
    sitemap_url = get_url(id)
    if not sitemap_url:
        continue #the row was deleted after the id list was written
    r = make_request(sitemap_url)
    links = links_from_xml(r)
    batch_add_urls(links)
    print(f"added {len(links)} links from {sitemap_url} \t {len(sitemap_ids)} sitemaps remaining", end="\r")

Error: 403 for https://www.bloomberg.com/feeds/equality/sitemap_2017_5.xml0_93.xml 	 369959 sitemaps remainings remainingxml 	 369963 sitemaps remainingps remainingration-and-the-c4-pathway/sitemap.xml 	 370014 sitemaps remaining
Error: 404 for https://www.iheart.com/sitemap/2023-10-29T20-01-17-683Z/data/artists/artists-063-000.xmltemaps remaining
Error: 404 for https://www.hp.com/my-en/shop/sitemap-1-1.xmln-c-sitemap.xml 	 369917 sitemaps remainingmaininginingsitemap.xml 	 369924 sitemaps remainingaining
Error: HTTPSConnectionPool(host='www.cbsnews.com', port=443): Read timed out. (read timeout=10) for https://www.cbsnews.com/xml-sitemap-video/tech/2021-03.xmlng
Error: 406 for https://www.cbsnews.com/xml-sitemap/local-news-twin-cities/2023-07.xmlaps remainingmaps remainings remaining/sitemap.xml 	 369902 sitemaps remaining
Error: 406 for https://www.cbsnews.com/xml-sitemap/tech/2004-01.xmlwin-cities/2023-07.xml 	 369895 sitemaps remaining
Error: 406 for https://www.cbsnews.com/xml-sit