# INFO 4271 - Group Project

Issued: June 11, 2024

Due: July 22, 2024

Please submit a link to your code base (ideally with a branch that does not change anymore after the submission deadline) and your 4-page report via email to carsten.eickhoff@uni-tuebingen.de by the due date. One submission per team.

---

# 1. Web Crawling & Indexing
Crawl the web to discover **English content related to Tübingen**. The crawled content should be stored locally. If interrupted, your crawler should be able to re-start and pick up the crawling process at any time.

In [24]:
import sqlite3
import requests
from bs4 import BeautifulSoup
import datetime
from langdetect import detect, LangDetectException
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

DB_NAME = "index.db"
NUM_WORKERS = 10
FILTER_CONTENT = True
TIMEOUT = 15
TUEBINGEN_KEYWORDS = ['tübingen', 'tubingen', 'tuebingen', 'neckar', 'schwaben', 'swabia', 'university', 'uni',
                      't%c3%bcbingen']

def setup_database(db_name=DB_NAME, drop_existing=False):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()

    # option to reset the database
    if drop_existing:
        cursor.execute("DROP TABLE IF EXISTS frontier")
        cursor.execute("DROP TABLE IF EXISTS documents")
        cursor.execute("DROP TABLE IF EXISTS incoming_links")

    cursor.execute('''
    CREATE TABLE IF NOT EXISTS frontier (
        url TEXT PRIMARY KEY,
        crawled INTEGER DEFAULT 0
    )''')
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS documents (
        url TEXT PRIMARY KEY,
        title TEXT,
        content TEXT,
        outgoing_links TEXT,
        timestamp TEXT
    )''')
    conn.commit()
    conn.close()

def index_doc(doc, index_path):
    conn = sqlite3.connect(index_path)
    cursor = conn.cursor()
    cursor.execute('''
    INSERT OR IGNORE INTO documents (url, title, content, outgoing_links, timestamp)
    VALUES (?, ?, ?, ?, ?)
    ''', (doc['url'], doc['title'], doc['content'], ','.join(doc['outgoing_links']), doc['timestamp']))
    conn.commit()

def crawl_page(url):
    try:

        response = requests.get(url, timeout=TIMEOUT)  # Fetch the web page
        if response.status_code != 200:
            print(f"Error while fetching response from {url} (response status {response.status_code})")

            # remove URL from frontier if status code is 404
            if response.status_code == 404:
                return "404"

            return None

        soup = BeautifulSoup(response.text, 'html.parser')  # Parse the HTML content
        title = soup.title.string if soup.title else "N/A"
        content = ' '.join(soup.stripped_strings)  # Using stripped_strings to clean up the text

        # Filter out pages that do not contain "tuebingen" in their content or are not in English
        if FILTER_CONTENT:
            if not any(word in content.lower() for word in ['tübingen', 'tubingen', 'tuebingen']):
                return None

            try:
                if detect(content) != 'en':
                    return None
            except LangDetectException:
                return None

        # get all outgoing links and transform them into absolute URLs
        links = [link.get('href') for link in soup.find_all('a', href=True)]
        absolute_links = []
        for link in links:
            # Create absolute URL and parse it
            abs_url = urljoin(url, link)
            parsed_url = urlparse(abs_url)

            # Check if the URL is valid (i.e., it has a network location part and scheme)
            if parsed_url.scheme and parsed_url.netloc:
                absolute_links.append(abs_url)

        links = list(set(absolute_links))  # Remove duplicates

        doc = {
            'url': url,
            'title': title,
            'content': content,
            'outgoing_links': links,
            'timestamp': datetime.datetime.now().isoformat()
        }

    except requests.RequestException as e:
        print(f"Request exception encountered at {url}: {e}")
        return None
    except Exception as e:
        print(f"Unexpected exception encountered at {url}: {e}")
        return None

    return doc

def crawl(index_path):
    conn = sqlite3.connect(index_path)
    cursor = conn.cursor()

    # Calculate total number of URLs to be crawled
    cursor.execute("SELECT COUNT(*) FROM frontier WHERE crawled = 0")
    total_to_crawl = cursor.fetchone()[0]

    with tqdm(total=total_to_crawl, desc="Crawling Progress", unit="page") as pbar:
        while True:
            cursor.execute("SELECT url FROM frontier WHERE crawled = 0 LIMIT 10")
            rows = cursor.fetchall()
            if not rows:
                break

            urls = [row[0] for row in rows]
            with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
                results = executor.map(lambda url: crawl_page(url), urls)

            for url, doc in zip(urls, results):
                if doc:

                    # Remove URL from frontier if status code is 404
                    if doc == "404":
                        cursor.execute("DELETE FROM frontier WHERE url = ?", (url,))
                        conn.commit()
                        pbar.update(1)
                        continue

                    cursor.execute("SELECT 1 FROM documents WHERE url = ? LIMIT 1", (doc['url'],))
                    if cursor.fetchone() is None:
                        index_doc(doc, index_path)
                        pbar.update(1)

                cursor.execute("UPDATE frontier SET crawled = 1 WHERE url = ?", (url,))
                conn.commit()
                if doc:
                    for link in doc['outgoing_links']:
                        cursor.execute("INSERT OR IGNORE INTO frontier (url) VALUES (?)", (link,))
                        conn.commit()
    
    conn.close()

def initialize_frontier(initial_urls, db_name=DB_NAME):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    for url in initial_urls:
        cursor.execute("INSERT OR IGNORE INTO frontier (url) VALUES (?)", (url,))
    conn.commit()
    conn.close()

    print(f"Inserted {len(initial_urls)} URLs into the frontier table")

    return None

def calculate_incoming_links(db_name=DB_NAME):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()

    # Create a temporary table to store incoming links
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS incoming_links (
        url TEXT PRIMARY KEY,
        incoming_count INTEGER DEFAULT 0
    )''')

    # Clear any existing data
    cursor.execute("DELETE FROM incoming_links")

    # Retrieve all documents and their outgoing links
    cursor.execute("SELECT url, outgoing_links FROM documents")
    rows = cursor.fetchall()

    for row in rows:
        url, outgoing_links = row
        outgoing_links_list = outgoing_links.split(',')

        for link in outgoing_links_list:
            cursor.execute('''
            INSERT INTO incoming_links (url, incoming_count)
            VALUES (?, 1)
            ON CONFLICT(url) DO UPDATE SET incoming_count = incoming_count + 1
            ''', (link,))

    conn.commit()
    conn.close()

# Initialize and run the crawler
initial_urls = [
    "https://www.tuebingen.de/en/",
    "https://en.wikipedia.org/wiki/T%C3%BCbingen",
    "https://www.uni-tuebingen.de/en.html"
]

setup_database()
initialize_frontier(initial_urls)
crawl("index.db")
calculate_incoming_links()

Inserted 3 URLs into the frontier table


Crawling Progress:   0%|          | 0/126497 [00:00<?, ?page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=J_Hermann_Siemer&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Max_Schulze-Vorberg&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Karl_Moersch&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 1/126497 [00:01<48:10:23,  1.37s/page]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Berthold_Martin&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Dionys_Jobst&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 5/126497 [00:02<18:25:22,  1.91page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Katzer&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Franz_Xaver_Unertl&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hermann_Oetting&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Martin_Horstmeier&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 9/126497 [00:04<15:18:03,  2.30page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Thomas_Ruf&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Peter_Petersen_(German_politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Paul_Heinrich_Simon&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=G%C3%BCnther_M%C3%BCller_(politician)&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 15/126497 [00:05<10:58:46,  3.20page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Kurt_Mattick&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 23/126497 [00:07<9:51:05,  3.57page/s] 

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Philip_Rosenthal_(politician)&action=edit&redlink=1 (response status 404)Error while fetching response from https://en.wikipedia.org/w/index.php?title=Karl_Heinz_Gierenstein&action=edit&redlink=1 (response status 404)

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Walter_L%C3%B6hr&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Willi_Peiter&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Gerhard_Reddemann&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 30/126497 [00:08<8:46:30,  4.00page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Alois_Rainer_(politician,_born_1921)&action=edit&redlink=1 (response status 404)Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hermann_Barche&action=edit&redlink=1 (response status 404)

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Leo_Gottesleben&action=edit&redlink=1 (response status 404)Error while fetching response from https://en.wikipedia.org/w/index.php?title=Friedrich_Vogel_(politician)&action=edit&redlink=1 (response status 404)

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Paul_L%C3%B6her&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 34/126497 [00:10<10:05:14,  3.48page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_H%C3%B6rmann&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Dietrich_Rollmann&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Rudolf_Hauck&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 40/126497 [00:16<19:52:12,  1.77page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Otto_Menth&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Heinrich_Schr%C3%B6der_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=G%C3%BCnter_Schlichting-von_R%C3%B6nn&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 45/126497 [00:19<20:28:53,  1.71page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Karl-Heinz_Walkhoff&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Josef_Schmitt_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Karl_Krammig&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Friedrich_Freiwald&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 58/126497 [00:23<14:58:23,  2.35page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Alexander_Menne&action=edit&redlink=1 (response status 404)Error while fetching response from http://www.wzb.eu/bal/spt/default.de.htm (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_B%C3%BCchler&action=edit&redlink=1 (response status 404)

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Herbert_Schneider_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Ferdinand_Erpenbeck&action=edit&redlink=1 (response status 404)
Error while fetching response from https://www.jstor.org/stable/2094817 (response status 420)


Crawling Progress:   0%|          | 67/126497 [00:26<14:55:13,  2.35page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Burkhard_Ritz&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Albrecht_Haas&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Will_Rasner&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 73/126497 [00:27<8:59:02,  3.91page/s] 

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Friedrich_Kempfler&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hildegard_Schimschok&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 75/126497 [00:28<12:10:27,  2.88page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hugo_Collet&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 78/126497 [00:29<13:00:52,  2.70page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans-J%C3%BCrgen_Junghans&action=edit&redlink=1 (response status 404)
Error while fetching response from https://api.semanticscholar.org/CorpusID:145360857 (response status 202)


Crawling Progress:   0%|          | 80/126497 [00:31<14:36:23,  2.40page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Gerhard_Orga%C3%9F&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Horst_Gerlach&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Franz_Sauter&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Annemarie_Griesinger&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Elisabeth_Orth&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=J%C3%BCrgen_Wohlrabe&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Ludwig_FellerMayer&action=edit&redlink=1 (response status 404)
Error while fetching res

Crawling Progress:   0%|          | 84/126497 [00:41<43:27:44,  1.24s/page]

Error while fetching response from https://www.nytimes.com/2009/06/22/world/europe/22dahrendorf.html (response status 403)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hedwig_Meermann&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_August_L%C3%BCcker&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 90/126497 [00:43<28:11:36,  1.25page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Richard_Wurbs&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Leo_Ernesti&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Anton_Ott&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Paul_Neumann_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Adolf_M%C3%BCller-Emmert&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 95/126497 [00:45<22:37:42,  1.55page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Wolfgang_Rubin&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hermann_Josef_Unland&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 104/126497 [00:48<17:49:14,  1.97page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Georg_Kliesing&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Rudolf_M%C3%BCller_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Franz-Lorenz_von_Thadden&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Willy_Bartsch&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Bernhard_Bu%C3%9Fmann&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Heinrich_M%C3%BCller_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Horst_Krockert&action=edit&redlink=1 (response s

Crawling Progress:   0%|          | 110/126497 [00:49<14:08:11,  2.48page/s]

Request exception encountered at http://globetrotter.berkeley.edu/conversations/Elberg/Dahrendorf/dahrendorf0.html: HTTPConnectionPool(host='globetrotter.berkeley.edu', port=80): Max retries exceeded with url: /conversations/Elberg/Dahrendorf/dahrendorf0.html (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x11b6e6a90>: Failed to resolve 'globetrotter.berkeley.edu' ([Errno 8] nodename nor servname provided, or not known)"))


Crawling Progress:   0%|          | 116/126497 [00:50<11:28:42,  3.06page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Heinrich_Welslau&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Werner_Marx_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Max_Seidel&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Lautenschlager&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Egon_H%C3%B6hmann&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Georg_Schachtschabel&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Fritz_Logemann&action=edit&redlink=1 (response status 404)
Error while fe

Crawling Progress:   0%|          | 125/126497 [00:52<8:57:52,  3.92page/s] 

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Linus_Memmel&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans-J%C3%BCrgen_Klinker&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Reinhold_Kreile&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Alo_Hauser&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Erwin_Lange&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Victor_Kirst&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 127/126497 [00:56<17:58:32,  1.95page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Detlef_Kleinert&action=edit&redlink=1 (response status 404)Error while fetching response from https://en.wikipedia.org/w/index.php?title=Christoph_Schiller&action=edit&redlink=1 (response status 404)

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Karl_Haehser&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 133/126497 [00:57<13:55:46,  2.52page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_de_With&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Georg_G%C3%B6lter&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Franz_Weigl&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Wenzel_Bredl&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 140/126497 [01:00<15:11:04,  2.31page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Edeltraud_Kuchtner&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Karl_Heinz_Lemmrich&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 142/126497 [01:02<18:26:50,  1.90page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=G%C3%BCnther_Metzger&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Karl_Hofmann_(German_politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Karl_Herold&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 145/126497 [01:04<18:54:20,  1.86page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Lothar_Wrede&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans-Joachim_Baeuchle&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Philipp_Seibert&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Albrecht_Schlee&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Richard_Kohlberger&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 150/126497 [01:06<15:42:32,  2.23page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Fritz-Joachim_Gn%C3%A4dinger&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Christa_Schroeder_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans-Eberhard_Urbaniak&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Helmut_Esters&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Benno_Erhard&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 157/126497 [01:21<43:22:45,  1.24s/page]

Request exception encountered at https://archive.today/20090801162802/http://www.wzb.eu/bal/spt/default.de.htm: HTTPSConnectionPool(host='archive.today', port=443): Read timed out. (read timeout=15)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Helmut_Prassler&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hermann_Spillecke&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Karl_Liedtke&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 162/126497 [01:23<33:13:11,  1.06page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=G%C3%BCnter_Jaschke&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 168/126497 [01:27<29:40:56,  1.18page/s]

Error while fetching response from http://www.firstmagazine.com/Awards (response status 403)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=21._Jahrhundert&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hermann_Josef_Russe&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 170/126497 [01:28<28:47:35,  1.22page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Bay&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Konrad_Kraske&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Irma_T%C3%BCbler&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Kurt_Ross&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Karl-Heinz_Saxowski&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 174/126497 [01:30<24:44:48,  1.42page/s]

Error while fetching response from https://www.thegazette.co.uk/London/issue/53377/page/12151 (response status 403)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Emil_Solke&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Franz_Xaver_Geisenhofer&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 184/126497 [01:33<18:19:50,  1.91page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Heinrich_Junker&action=edit&redlink=1 (response status 404)Error while fetching response from https://en.wikipedia.org/w/index.php?title=Uwe_Looft&action=edit&redlink=1 (response status 404)

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Max_Vehar&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=G%C3%BCnter_Wichert&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Friedrich_Beermann&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Udo_Giulini&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 190/126497 [01:37<19:42:37,  1.78page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hugo_Brandt&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Wilhelm_Dr%C3%B6scher&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Wilhelm_N%C3%B6lling&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 194/126497 [01:39<18:31:18,  1.89page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Werner_Marquardt&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Hubrig&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Claus_Grobecker&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 197/126497 [01:41<18:45:04,  1.87page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Clemens_Riedel_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Juan_Iglesias_Santos&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Georg_Kahn-Ackermann&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 204/126497 [01:43<14:53:32,  2.36page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Olaf_Baron_von_Wrangel&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Heinz_Hartnack&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Karl-Heinz_Mursch&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 210/126497 [01:46<16:18:32,  2.15page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Liselotte_Pieser&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Walter_Langebeck&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Johann_Peter_Josten&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Gertzen&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Harry_Tallert&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 217/126497 [01:48<12:08:22,  2.89page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hermann_Buschfort&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 221/126497 [01:50<14:42:03,  2.39page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Manfred_Schmidt_(politician)&action=edit&redlink=1 (response status 404)Error while fetching response from https://en.wikipedia.org/w/index.php?title=Karl-Heinz_Hansen&action=edit&redlink=1 (response status 404)

Error while fetching response from https://en.wikipedia.org/w/index.php?title=J%C3%BCrgen_Anbuhl&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Manfred_Ge%C3%9Fner&action=edit&redlink=1 (response status 404)
Error while fetching response from https://doi.org/10.1080%2F13621021003594973 (response status 403)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Wolfgang_Schwabe&action=edit&redlink=1 (response status 404)Error while fetching response from https://en.wikipedia.org/w/index.php?title=Gerhard_Fl%C3%A4mig&action=edit&redlink=1 (response status 404)



Crawling Progress:   0%|          | 228/126497 [01:52<12:07:41,  2.89page/s]

Error while fetching response from https://doi.org/10.2307%2F2094817 (response status 420)
Request exception encountered at http://id.bnportugal.gov.pt/aut/catbnp/36694: HTTPSConnectionPool(host='urn.bn.pt', port=443): Max retries exceeded with url: /nca/unimarc-authorities/html?id=36694 (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'urn.bn.pt'. (_ssl.c:1129)")))
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Gerhard_Jungmann&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 235/126497 [01:56<15:22:42,  2.28page/s]

Error while fetching response from http://www.bath.ac.uk/ceremonies/hongrads/ (response status 404)


Crawling Progress:   0%|          | 236/126497 [01:57<17:56:16,  1.96page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Bals&action=edit&redlink=1 (response status 404)Error while fetching response from https://en.wikipedia.org/w/index.php?title=Isidor_Fr%C3%BCh&action=edit&redlink=1 (response status 404)

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Hermsdorf&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Helmuth_M%C3%B6hring&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Kurt_W%C3%BCster&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Otto_Wulff&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Herbert_Baack&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 241/126497 [02:00<18:14:29,  1.92page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Harry_Liehr&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Herbert_Hermesdorf&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_M%C3%BCthling&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Werner_Buchstaller&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 246/126497 [02:02<16:25:20,  2.14page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=G%C3%BCnter_von_Nordenskj%C3%B6ld&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Friedhelm_Dohmann&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Franz_Josef_Zebisch&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Peter-Michael_Koenig&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 257/126497 [02:05<11:49:05,  2.97page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Heinz_P%C3%B6hler&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Stefan_Seifriz&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Siegfried_Zoglmann&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Werner_Staak&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Richarts&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Klaus-Peter_Schulz&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Dietrich_Sperling&action=edit&redlink=1 (response status 404)
Error while fetching resp

Crawling Progress:   0%|          | 263/126497 [02:06<10:26:58,  3.36page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Diedrich_Schr%C3%B6der&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Anton_Stark&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 270/126497 [02:08<9:44:33,  3.60page/s] 

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Gottfried_K%C3%B6ster&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 272/126497 [02:08<8:57:07,  3.92page/s]

Error while fetching response from https://www.uni-konstanz.de/en/university/about-the-university-of-konstanz/profile/50-years-university-of-konstanz/50-years-of-university-history/ (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hermann_Schmitt-Vockenhausen&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 274/126497 [02:10<12:41:37,  2.76page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hannsheinz_Bauer&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hermann_Stahlberg&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Heinz_Frehsee&action=edit&redlink=1 (response status 404)Error while fetching response from https://en.wikipedia.org/w/index.php?title=Wolfgang_Pohle&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Roelf_Heyen&action=edit&redlink=1 (response status 404)

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Horst_Seefeld&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 285/126497 [02:14<11:46:52,  2.98page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Helmut_Lenders&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Friedel_Schirmer_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=G%C3%BCnther_Eckerland&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Gerd_Ritgen&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 287/126497 [02:15<14:43:45,  2.38page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Bardens&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Udo_Hein&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Batz&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Adolf_Scheu&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=G%C3%BCnther_Wuttke&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 294/126497 [02:17<11:27:24,  3.06page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Heinrich_Franke&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 300/126497 [02:19<11:52:59,  2.95page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Manfred_Wende&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Lambert_Huys&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Erwin_Schoettle&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Claus_Arndt&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 309/126497 [02:25<17:39:09,  1.99page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Alphons_Horten&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Karl-Heinz_Schmitz&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Kurt_H%C3%A4rzschel&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hermann_Hansing&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 312/126497 [02:40<52:27:53,  1.50s/page]

Request exception encountered at https://search.amphilsoc.org/memhist/search?creator=Ralf+Dahrendorf&title=&subject=&subdiv=&mem=&year=&year-max=&dead=&keyword=&smode=advanced: HTTPSConnectionPool(host='search.amphilsoc.org', port=443): Read timed out. (read timeout=15)


Crawling Progress:   0%|          | 318/126497 [02:41<35:22:56,  1.01s/page]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Maria_Jacobi&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Wilhelm_Maybaum&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Ulrich_D%C3%BCbber&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Albert_T%C3%B6njes&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Rudolf_Werner&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Wissebach&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 324/126497 [02:44<27:09:46,  1.29page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Herbert_Kriedemann&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Vera_Dahrendorf&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Rudi_Lotze&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Alex_H%C3%B6sl&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 326/126497 [02:45<26:20:53,  1.33page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Detlef_Haase&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Alwin_Kulawig&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 331/126497 [02:46<20:32:50,  1.71page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Kurt_Wawrzik&action=edit&redlink=1 (response status 404)Error while fetching response from https://en.wikipedia.org/w/index.php?title=Josef_Mick&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hansheinrich_Schmidt&action=edit&redlink=1 (response status 404)

Error while fetching response from https://api.semanticscholar.org/CorpusID:228893228 (response status 202)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Erika_Wolf&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Josef_L%C3%B6bbert&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Ferdinand_Schmidt_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedi

Crawling Progress:   0%|          | 336/126497 [02:47<16:39:44,  2.10page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Ernst_Haar&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Otto_Wittmann&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hubert_Weber_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Felix_von_Eckardt&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 343/126497 [02:49<13:28:04,  2.60page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Herbert_Prochazka&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Werner_Wilhelm&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 350/126497 [02:53<14:27:24,  2.42page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Georg_Schlaga&action=edit&redlink=1 (response status 404)Error while fetching response from https://en.wikipedia.org/w/index.php?title=Wolfgang_Vogt&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Josef_Rommerskirchen&action=edit&redlink=1 (response status 404)

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Gerhard_Koch&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=G%C3%B6ke_Frerichs&action=edit&redlink=1 (response status 404)Error while fetching response from https://en.wikipedia.org/w/index.php?title=Franz_Gleissner_(politician)&action=edit&redlink=1 (response status 404)



Crawling Progress:   0%|          | 355/126497 [02:54<13:14:31,  2.65page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Friedhelm_Farthmann&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hellmut_Sieglerschmidt&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 359/126497 [02:56<13:43:24,  2.55page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Erwin_K._Scheuch&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Werner_Mertes&action=edit&redlink=1 (response status 404)Error while fetching response from https://en.wikipedia.org/w/index.php?title=Rodrigo_Ur%C3%ADa_Gonz%C3%A1lez&action=edit&redlink=1 (response status 404)



Crawling Progress:   0%|          | 364/126497 [02:58<12:35:08,  2.78page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Helmut_Kater&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Oscar_Schneider&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Rudolf_Opitz_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=G%C3%BCnter_Slotta&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 366/126497 [03:01<22:02:33,  1.59page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Alfons_Bayerl&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Bruno_Wiefel&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Lothar_Krall&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 372/126497 [03:02<15:38:40,  2.24page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=German_Otto_Stehle&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Martin_Hirsch_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Wilhelm_Michels&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Alfred_Ollesch&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Elfriede_Seppi&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 378/126497 [03:05<14:30:50,  2.41page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Heinz_Pensky&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Ernst_Schellenberg&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 383/126497 [03:06<12:33:43,  2.79page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Walter_Picard&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Franz_Lenze&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Wolfgang_Rutschke&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 388/126497 [03:09<14:34:11,  2.40page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Erich_Wolfram&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Helmuth_Becker&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Heinrich_K%C3%B6ppler&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Reinhard_B%C3%BChling&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 393/126497 [03:10<12:44:23,  2.75page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Manfred_Schulte&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=G%C3%BCnter_Volmer&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Luis_S%C3%A1nchez_Agesta&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Friedhelm_Halfmeier&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Ellen_Lauterbach&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 397/126497 [03:11<12:23:55,  2.83page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Carl_Reinhard&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Rolf_Meinecke&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Fritz_Rinderspacher&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Karl_Miltner&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Heinrich_Draeger&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 406/126497 [03:13<10:13:21,  3.43page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Roser_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Peter_B%C3%BCchner&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 410/126497 [03:15<11:17:19,  3.10page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Friedrich_Sch%C3%A4fer&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Johann_Wuwer&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Walter_Fritsch_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Marie-Elisabeth_Klee&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 412/126497 [03:16<12:28:13,  2.81page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Udo_Fiebig&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Gisbert_Kley&action=edit&redlink=1 (response status 404)
Error while fetching response from https://www.birmingham.ac.uk/schools/historycultures/departments/history/news/2021/ralf-dahrendorf-prize-children-born-of-war.aspx (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Edith_Krappe&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Joachim_Raffert&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 416/126497 [03:17<12:12:09,  2.87page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hubert_Lemper&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Friedrich_Schonhofen&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Erich_Meinike&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Lemp&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Rudolf_Kaffka&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Erich_Henke&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 423/126497 [03:20<12:33:52,  2.79page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Maria_Henze&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 429/126497 [03:22<13:21:15,  2.62page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Erwin_Folger&action=edit&redlink=1 (response status 404)Error while fetching response from https://en.wikipedia.org/w/index.php?title=Eberhard_Pohlmann&action=edit&redlink=1 (response status 404)

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hermann_D%C3%BCrr&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Peter_W%C3%BCrtz&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 432/126497 [03:25<16:19:32,  2.14page/s]

Error while fetching response from http://www.emeraldinsight.com/Insight/viewContentItem.do;jsessionid=9F328F753CC45ECD41DD5147379E5983?contentType=Article&hdAction=lnkpdf&contentId=1702699&history=true (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Heinrich_Gewandt&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 437/126497 [03:26<14:38:56,  2.39page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Martin_Schmidt_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Kurt_Matthes&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 440/126497 [03:28<16:22:09,  2.14page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Richard_Tambl%C3%A9&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Gerhard_Kienbaum&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Albert_Leicht&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 443/126497 [03:30<17:27:42,  2.01page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Gerd_Springorum&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Lothar_Haase&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Karl_Gatzen&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 446/126497 [03:35<25:51:22,  1.35page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Georg_Kotowski&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 450/126497 [03:36<21:32:07,  1.63page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Kurt_Schober&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hermann_Haage&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=G%C3%BCnter_Biermann&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Herbert_Gruhl&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Franz_Varelmann&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=August_Hanz&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 456/126497 [03:38<18:35:59,  1.88page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Paul_R%C3%B6hner&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Winfried_Pinger&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Eugen_Glombig&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Paul_Gerlach&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 461/126497 [03:40<15:29:33,  2.26page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Georg_Neemann&action=edit&redlink=1 (response status 404)Error while fetching response from https://en.wikipedia.org/w/index.php?title=Horst_Schmidt&action=edit&redlink=1 (response status 404)



Crawling Progress:   0%|          | 463/126497 [03:40<13:47:47,  2.54page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Dieter_Hussing&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Otto_Schmidt_(politician)&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 475/126497 [03:43<11:14:05,  3.12page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans_Evers&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Maria_Stommel&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Franz_Marx_(politician)&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Franz_Seume&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Walther_Hellige&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hans-Ulrich_Brand&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Erwin_H%C3%A4ussler&action=edit&redlink=1 (response status 404)
Error while fetching response f

Crawling Progress:   0%|          | 482/126497 [03:59<44:03:47,  1.26s/page]

Request exception encountered at http://www.jafi.org.il/education/hasbara/headlines/a4-4.html: HTTPConnectionPool(host='www.jafi.org.il', port=80): Max retries exceeded with url: /education/hasbara/headlines/a4-4.html (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x11e9371f0>, 'Connection to www.jafi.org.il timed out. (connect timeout=15)'))
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Gustav_Stein&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Klaus_Schmid-Burgk&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Kurt_Spitzm%C3%BCller&action=edit&redlink=1 (response status 404)Error while fetching response from https://en.wikipedia.org/w/index.php?title=Manfred_Luda&action=edit&redlink=1 (response status 404)



Crawling Progress:   0%|          | 489/126497 [04:00<29:30:28,  1.19page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hugo_Hammans&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Hermann_Schmidt&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Siegfried_Meister_(politician)&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 494/126497 [04:03<25:33:30,  1.37page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Georg_Schulhoff&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 498/126497 [04:05<23:30:47,  1.49page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Willi_B%C3%A4uerle&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Klaus_Richter&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Albert_Schedl&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 502/126497 [04:06<20:14:51,  1.73page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Josef_R%C3%B6sing&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Otto_Freiherr_von_Fircks&action=edit&redlink=1 (response status 404)
Error while fetching response from https://en.wikipedia.org/w/index.php?title=Josef_M%C3%BCller_(CDU_politician)&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 509/126497 [04:08<15:58:33,  2.19page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Heinz_Eyrich&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 517/126497 [04:11<14:10:32,  2.47page/s]

Error while fetching response from https://en.wikipedia.org/w/index.php?title=Peter_S%C3%A4ckl&action=edit&redlink=1 (response status 404)


Crawling Progress:   0%|          | 553/126497 [04:51<25:57:20,  1.35page/s]

In [19]:
def print_first_10_urls(db_name=DB_NAME):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()

    cursor.execute("SELECT title, url FROM documents LIMIT 20")
    rows = cursor.fetchall()
    for row in rows:
        print(row[0], row[1])

    conn.close()

# Call the function to print the first 10 URLs
print_first_10_urls()

Welcome to Tübingen - City of Tuebingen https://www.tuebingen.de/en/
Tübingen - Wikipedia https://en.wikipedia.org/wiki/T%C3%BCbingen
Culture and Leisure - City of Tuebingen https://www.tuebingen.de/en/3515.html
Welcome to Tübingen - City of Tuebingen https://www.tuebingen.de/en
Portrait of the City - City of Tuebingen https://www.tuebingen.de/en/1815.html
Imprint - City of Tuebingen https://www.tuebingen.de/en/1821.html
Immigration office - City of Tuebingen https://www.tuebingen.de/en/42067.html
European Elections 2024 - City of Tuebingen https://www.tuebingen.de/en/42070.html
City and Guests - City of Tuebingen https://www.tuebingen.de/en/3494.html
Ralf Dahrendorf - Wikipedia https://en.wikipedia.org/wiki/Ralf_Dahrendorf
Tübingen - Wikipedia https://en.wikipedia.org/wiki/T%C3%BCbingen#cite_ref-14
Johann Friedrich Cotta - Wikipedia https://en.wikipedia.org/wiki/Johann_Friedrich_Cotta
Pfrondorf - Wikipedia https://en.wikipedia.org/wiki/Pfrondorf
Tübingen - Wikipedia https://en.wikiped

In [21]:
def get_total_indexed_docs(db_name=DB_NAME):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()

    cursor.execute("SELECT count(*) FROM documents")
    total_count = cursor.fetchone()
    print(f"Total number of documents indexed: {total_count[0]}")

    conn.close()

get_total_indexed_docs()

Total number of documents indexed: 489


In [22]:
def count_remaining_frontier(db_name=DB_NAME):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()

    cursor.execute("SELECT count(*) FROM frontier WHERE crawled = 0")
    total_count = cursor.fetchone()
    print(f"Total number of URLs remaining in the frontier: {total_count[0]}")

    conn.close()

count_remaining_frontier()

Total number of URLs remaining in the frontier: 126406


In [5]:
#Add a document to the index. You need (at least) two parameters:
	#doc: The document to be indexed.
	#index: The location of the local index storing the discovered documents.
def index_doc(doc, index_path):
    conn = sqlite3.connect(index_path)
    cursor = conn.cursor()
    cursor.execute('''
    INSERT OR IGNORE INTO documents (url, title, content, outgoing_links, timestamp)
    VALUES (?, ?, ?, ?, ?)
    ''', (doc['url'], doc['title'], doc['content'], ','.join(doc['outgoing_links']), doc['timestamp']))
    conn.commit()
    
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

def is_relevant(content):
    # Check for simple keyword presence
    if not any(word in content for word in ['Tübingen', 'Tubingen', 'Tuebingen', 'tübingen', 'tubingen', 'tuebingen']):
        return False
    
    # Perform NER and check for geographic relevance
    doc = nlp(content)
    for ent in doc.ents:
        if ent.label_ == "GPE" and "tübingen" in ent.text.lower():
            return True
    return False

def index_doc(doc, index_path):
    if not is_relevant(doc['content']):
        #print("Skipping non-relevant document.")
        return

    conn = sqlite3.connect(index_path)
    cursor = conn.cursor()
    cursor.execute('''
    INSERT OR IGNORE INTO documents (url, title, content, outgoing_links, timestamp)
    VALUES (?, ?, ?, ?, ?)
    ''', (doc['url'], doc['title'], doc['content'], ','.join(doc['outgoing_links']), doc['timestamp']))
    conn.commit()
    conn.close()

    conn.close()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as Service
from selenium.webdriver.chrome.options import Options
#from webdriver_manager.chrome import ChromeDriverManager
import time
from bs4 import BeautifulSoup
import datetime
from langdetect import detect, LangDetectException
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def crawl_page(url, index_path):
    options = Options()
    options.headless = True
    options.add_argument('--no-sandbox')
    service = Service("/opt/homebrew/Caskroom/chromedriver/126.0.6478.126/chromedriver-mac-arm64/chromedriver")
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url)
        time.sleep(3)  # Wait for the page to load

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        title = soup.title.string if soup.title else ""
        content = soup.get_text(separator=' ', strip=True)

        # Check if the page contains the word "Tübingen"
        if not any(word in content for word in ['Tübingen', 'Tubingen', 'Tuebingen', 'tübingen', 'tubingen', 'tuebingen']):
            #print(f"Skipping url {url} Page does not contain Tübingen...")
            return None

        # Check if English content is present
        if not any(word in content for word in ['the', 'and', 'is', 'in']):
            #print(f"Skipping url {url} Page does not contain English content")
            return None

        # Language detection
        try:
            if detect(content) != 'en':
                #print(f"Skipping url {url} Page does not contain English content")
                return None
        except LangDetectException:
            return None

        links = [link.get_attribute("href") for link in driver.find_elements(By.TAG_NAME, "a") if link.get_attribute("href") and "tuebingen" in link.get_attribute("href").lower()]
        doc = {
            'url': url,
            'title': title,
            'content': content,
            'outgoing_links': links,
            'timestamp': datetime.datetime.now().isoformat()
        }

    except (StaleElementReferenceException, TimeoutException, NoSuchElementException) as e:
        print(f"Exception {type(e).__name__} encountered at {url}: {e}")
        return None
    except Exception as e:
        print(f"Unexpected exception encountered at {url}: {e}")
        return None
    finally:
        driver.quit()

    return doc

def crawl(frontier, index_path):
    conn = sqlite3.connect(index_path)
    cursor = conn.cursor()

    # Calculate total number of URLs to be crawled
    cursor.execute("SELECT COUNT(*) FROM frontier WHERE crawled = 0")
    total_to_crawl = cursor.fetchone()[0]

    with tqdm(total=total_to_crawl, desc="Crawling Progress", unit="page") as pbar:
        while True:
            cursor.execute("SELECT url FROM frontier WHERE crawled = 0 LIMIT 10")
            rows = cursor.fetchall()
            if not rows:
                break

            urls = [row[0] for row in rows]
            with ThreadPoolExecutor(max_workers=2) as executor:
                results = executor.map(lambda url: crawl_page(url, index_path), urls)

            for url, doc in zip(urls, results):
                if doc:
                    cursor.execute("SELECT 1 FROM documents WHERE url = ? LIMIT 1", (doc['url'],))
                    if cursor.fetchone() is None:
                        index_doc(doc, index_path)
                        pbar.update(1)
                cursor.execute("UPDATE frontier SET crawled = 1 WHERE url = ?", (url,))
                conn.commit()
                if doc:
                    for link in doc['outgoing_links']:
                        cursor.execute("INSERT OR IGNORE INTO frontier (url) VALUES (?)", (link,))
                        conn.commit()
    
    conn.close()

def initialize_frontier(initial_urls, db_name="crawler.db"):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    for url in initial_urls:
        cursor.execute("INSERT OR IGNORE INTO frontier (url) VALUES (?)", (url,))
    conn.commit()
    conn.close()

def calculate_incoming_links(db_name="crawler.db"):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()

    # Create a temporary table to store incoming links
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS incoming_links (
        url TEXT PRIMARY KEY,
        incoming_count INTEGER DEFAULT 0
    )''')

    # Clear any existing data
    cursor.execute("DELETE FROM incoming_links")

    # Retrieve all documents and their outgoing links
    cursor.execute("SELECT url, outgoing_links FROM documents")
    rows = cursor.fetchall()

    for row in rows:
        url, outgoing_links = row
        outgoing_links_list = outgoing_links.split(',')

        for link in outgoing_links_list:
            cursor.execute('''
            INSERT INTO incoming_links (url, incoming_count)
            VALUES (?, 1)
            ON CONFLICT(url) DO UPDATE SET incoming_count = incoming_count + 1
            ''', (link,))

    conn.commit()
    conn.close()

# Initialize and run the crawler
initial_urls = [
    "https://www.tuebingen.de/en/",
    #"https://en.wikipedia.org/wiki/T%C3%BCbingen",
    #"https://www.uni-tuebingen.de/en.html"
]

setup_database()
initialize_frontier(initial_urls)
crawl("crawler.db", "crawler.db")
calculate_incoming_links()

ModuleNotFoundError: No module named 'selenium'

In [7]:
import requests
from bs4 import BeautifulSoup
import datetime
from langdetect import detect, LangDetectException

def crawl_page(url, index_path):
    try:
        response = requests.get(url, timeout=10)  # Fetch the web page
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.title.string if soup.title else ""
        content = ' '.join(soup.stripped_strings)  # Using stripped_strings to clean up the text

        # Check if the page contains the word "Tübingen"
        if not any(word in content.lower() for word in ['tübingen', 'tubingen', 'tuebingen']):
            return None

        # Check if English content is present
        if not any(word in content.lower() for word in ['the', 'and', 'is', 'in']):
            return None

        # Language detection
        try:
            if detect(content) != 'en':
                return None
        except LangDetectException:
            return None

        # Extract outgoing links that contain "tuebingen"
        links = [a['href'] for a in soup.find_all('a', href=True) if "tuebingen" in a['href'].lower()]

        doc = {
            'url': url,
            'title': title,
            'content': content,
            'outgoing_links': links,
            'timestamp': datetime.datetime.now().isoformat()
        }

    except requests.RequestException as e:
        print(f"Request exception encountered at {url}: {e}")
        return None
    except Exception as e:
        print(f"Unexpected exception encountered at {url}: {e}")
        return None

    return doc

ModuleNotFoundError: No module named 'bs4'

In [4]:
def print_first_10_urls(db_name="crawler.db"):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()

    cursor.execute("SELECT title, url FROM documents LIMIT 20")
    rows = cursor.fetchall()
    for row in rows:
        print(row[0], row[1])

    conn.close()

# Call the function to print the first 10 URLs
print_first_10_urls()

Welcome to Tübingen - City of Tuebingen https://www.tuebingen.de/en/
Tübingen - Wikipedia https://en.wikipedia.org/wiki/T%C3%BCbingen
Welcome to Tübingen - City of Tuebingen https://www.tuebingen.de/en
Immigration office - City of Tuebingen https://www.tuebingen.de/en/42067.html
European Elections 2024 - City of Tuebingen https://www.tuebingen.de/en/42070.html
Portrait of the City - City of Tuebingen https://www.tuebingen.de/en/1815.html
City and Guests - City of Tuebingen https://www.tuebingen.de/en/3494.html
Culture and Leisure - City of Tuebingen https://www.tuebingen.de/en/3515.html
Imprint - City of Tuebingen https://www.tuebingen.de/en/1821.html
Welcome to Tübingen - City of Tuebingen https://www.tuebingen.de/en/#
File:Altstadt-tuebingen-1.jpg - Wikipedia https://en.wikipedia.org/wiki/File:Altstadt-tuebingen-1.jpg
File:Wappen Tuebingen.svg - Wikipedia https://en.wikipedia.org/wiki/File:Wappen_Tuebingen.svg
File:TuebingenNeckar.jpg - Wikipedia https://en.wikipedia.org/wiki/File:Tu

In [4]:
import math
from collections import Counter

def fetch_documents(index_path):
    conn = sqlite3.connect(index_path)
    cursor = conn.cursor()
    cursor.execute("SELECT content FROM documents")
    documents = [doc[0] for doc in cursor.fetchall()]
    conn.close()
    return documents

def bm25(documents, query, k1=1.5, b=0.75):
    # Tokenize documents and query
    tokenized_docs = [doc.lower().split() for doc in documents]
    tokenized_query = query.lower().split()

    # Calculate document frequencies
    df = {}
    for word in tokenized_query:
        df[word] = sum(word in doc for doc in tokenized_docs)

    # Calculate IDF
    N = len(documents)
    idf = {word: math.log((N - df[word] + 0.5) / (df[word] + 0.5)) for word in df}

    # Compute BM25 scores
    scores = []
    avgdl = sum(len(doc) for doc in tokenized_docs) / N
    for index, doc in enumerate(tokenized_docs):
        doc_len = len(doc)
        f = Counter(doc)
        score = sum(idf[word] * f[word] * (k1 + 1) / (f[word] + k1 * (1 - b + b * doc_len / avgdl)) for word in tokenized_query if word in f)
        scores.append((index, score))
    
    scores.sort(key=lambda x: x[1], reverse=True)
    return scores

documents = fetch_documents("crawler.db")
query = "Tuebingen"
results = bm25(documents, query)
print(results)

[(642, 4.775923434800474), (640, 4.772457630479728), (908, 4.767770040902589), (906, 4.764316055753173), (517, 4.730409989359467), (515, 4.726479081441341), (188, 4.054680750861058), (1464, 4.054680750861058), (1465, 4.054680750861058), (1466, 4.054680750861058), (1467, 4.054680750861058), (1468, 4.054680750861058), (1469, 4.054680750861058), (1470, 4.054680750861058), (1471, 4.054680750861058), (1472, 4.054680750861058), (1473, 4.054680750861058), (1474, 4.054680750861058), (1475, 4.054680750861058), (6, 3.9613292950762737), (33, 3.9613292950762737), (764, 3.8941387383641075), (753, 3.806703113051436), (111, 3.744614494818716), (749, 3.744614494818716), (752, 3.744614494818716), (763, 3.744614494818716), (1596, 3.679187373316038), (201, 3.589702882214677), (1044, 3.5636051636334285), (1048, 3.56168557335207), (914, 3.5532638862560453), (923, 3.454863322129475), (150, 3.4530590656906055), (22, 3.418466973778695), (169, 3.418466973778695), (37, 3.405789140766824), (212, 3.40578914076682

In [1]:
import torch
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

def semantic_search(documents, query):
    doc_embeddings = model.encode(documents, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_embedding, doc_embeddings)[0]
    ranked_results = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
    return ranked_results

documents = fetch_documents("crawler.db")
query = "Example search query"
results = semantic_search(documents, query)
print(results)

  from .autonotebook import tqdm as notebook_tqdm


AttributeError: module 'torch.backends' has no attribute 'mps'

# 2. Query Processing 
Process a textual query and return the 100 most relevant documents from your index. Please incorporate **at least one retrieval model innovation** that goes beyond BM25 or TF-IDF. Please allow for queries to be entered either individually in an interactive user interface (see also #3 below), or via a batch file containing multiple queries at once. The batch file will be formatted to have one query per line, listing the query number, and query text as tab-separated entries. An example of the batch file for the first two queries looks like this:

```
1   tübingen attractions
2   food and drinks
```

In [None]:
#Retrieve documents relevnt to a query. You need (at least) two parameters:
	#query: The user's search query
	#index: The location of the local index storing the discovered documents.
def retrieve(query, index):
    #TODO: Implement me
	pass

# 3. Search Result Presentation
Once you have a result set, we want to return it to the searcher in two ways: a) in an interactive user interface. For this user interface, please think of **at least one innovation** that goes beyond the traditional 10-blue-links interface that most commercial search engines employ. b) as a text file used for batch performance evaluation. The text file should be formatted to produce one ranked result per line, listing the query number, rank position, document URL and relevance score as tab-separated entries. An example of the first three lines of such a text file looks like this:

```
1   1   https://www.tuebingen.de/en/3521.html   0.725
1   2   https://www.komoot.com/guide/355570/castles-in-tuebingen-district   0.671
1   3   https://www.unimuseum.uni-tuebingen.de/en/museum-at-hohentuebingen-castle   0.529
...
1   100 https://www.tuebingen.de/en/3536.html   0.178
2   1   https://www.tuebingen.de/en/3773.html   0.956
2   2   https://www.tuebingen.de/en/4456.html   0.797
...
```

In [None]:
#TODO: Implement an interactive user interface for part a of this exercise.

#Produce a text file with 100 results per query in the format specified above.
def batch(results): 
    #TODO: Implement me.    
    pass

# 4. Performance Evaluation 
We will evaluate the performance of our search systems on the basis of five queries. Two of them are avilable to you now for engineering purposes:
- `tübingen attractions`
- `food and drinks`

The remaining three queries will be given to you during our final session on July 23rd. Please be prepared to run your systems and produce a single result file for all five queries live in class. That means you should aim for processing times of no more than ~1 minute per query. We will ask you to send carsten.eickhoff@uni-tuebingen.de that file.

# Grading
Your final projects will be graded along the following criteria:
- 25% Code correctness and quality (to be delivered on this sheet)
- 25% Report (4 pages, PDF, explanation and justification of your design choices)
- 25% System performance (based on how well your system performs on the 5 queries relative to the other teams in terms of nDCG)
- 15% Creativity and innovativeness of your approach (in particular with respect to your search system #2 and user interface #3 innovations)
- 10% Presentation quality and clarity

# Permissible libraries
You can use any general-puprose ML and NLP libraries such as scipy, numpy, scikit-learn, spacy, nltk, but please stay away from dedicated web crawling or search engine toolkits such as scrapy, whoosh, lucene, terrier, galago and the likes. Pretrained models are fine to use as part of your system, as long as they have not been built/trained for retrieval. 
