In [6]:
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor, wait
import multiprocessing
from queue import Queue, Empty
from pathlib import Path
import os
import pickle
import requests
from bs4 import BeautifulSoup, Comment
import json


## Develop a simple multithreaded web crawler (pg 36-41)

In [7]:
class MultiThreadedCrawler:
    def __init__(self, base_url, depth):
        self.base_url = base_url
        extracted_url = urlparse(self.base_url)
        parent = extracted_url.path[: extracted_url.path.rfind("/") + 1]
        self.root_url = f"{extracted_url.scheme}://{extracted_url.netloc}{parent}"
        self.pool = ThreadPoolExecutor(max_workers=multiprocessing.cpu_count())
        self.to_crawl = Queue()
        self.to_crawl.put({self.base_url: depth})
        self.stored_folder = Path(os.path.abspath("")) / "crawled/"
        self.pickle_name="url_list.pickle"
        self.pending_crawl = set()
        self.failed_crawl = set()
        self.depth = depth
        print(self.stored_folder)
        if not Path(self.stored_folder).exists():
            Path.mkdir(self.stored_folder)
        if Path(self.stored_folder / self.pickle_name).exists():
            with open(self.stored_folder / self.pickle_name, "rb") as f:
                self.crawled_pages = pickle.load(f)
            print(self.crawled_pages)
        else:
            self.crawled_pages = set()

    def extract_page(self, obj):
        if obj.result():
            result, url, depth = obj.result()
            self.pending_crawl.remove(url)
            if result and result.status_code == 200:
                self.crawled_pages.add(url)
                url_lists = self.parse_links(result.text, depth)
                self.parse_contents(url, result.text, url_lists)

    def get_page(self, url, depth):
        try:
            res = requests.get(url, timeout=(3, 30))
            return res, url, depth
        except requests.RequestException as e:
            self.failed_crawl.add(url)
            print(e)
            return

    def parse_links(self, html, depth):
        soup = BeautifulSoup(html, "html.parser")
        links = soup.find_all("a", href=True)
        url_lists = []
        for link in links:
            url = link["href"]
            url = urljoin(self.root_url, url)
            if url not in self.crawled_pages.union(self.pending_crawl).union(self.failed_crawl) and depth >= 0 and r"." not in url.split(r"/")[-1]:
                print("Adding@{}: {}".format(depth, url))
                self.to_crawl.put({url: depth})
            url_lists.append(url)
        return url_lists

    def parse_contents(self, url, html, url_lists):
        def tag_visible(element):
            if element.parent.name in [
                "style",
                "script",
                "head",
                "title",
                "meta",
                "[document]",
            ]:
                return False
            if isinstance(element, Comment):
                return False
            return True

        try:
            soup = BeautifulSoup(html, "html.parser")
            texts = soup.findAll(string=True)
            visible_texts = filter(tag_visible, texts)
            title = ""
            if soup.find("title"):
                title = soup.find("title").string.strip()
            text = " ".join(t.strip() for t in visible_texts).strip()
            with open(
                self.stored_folder / (str(hash(url)) + ".json"), "w", encoding="utf-8"
            ) as f:
                json.dump(
                    {"url": url, "title": title, "text": text, "url_lists": url_lists},
                    f,
                    ensure_ascii=False,
                    indent=4,
                )
        except:
            pass

    def run_scraper(self):
        while True:
            try:
                target = self.to_crawl.get(timeout=30)
                url, depth = [(k, target[k]) for k in target][0]
                if (url not in self.crawled_pages.union(self.pending_crawl).union(self.failed_crawl) and depth >= 0) or depth == self.depth:
                    self.pending_crawl.add(url)
                    job = self.pool.submit(self.get_page, url, depth - 1)
                    job.add_done_callback(self.extract_page)
            except Empty:
                with open(self.stored_folder / self.pickle_name, "wb") as f:
                    pickle.dump(self.crawled_pages, f, pickle.HIGHEST_PROTOCOL)
                with open(self.stored_folder / self.pickle_name, "rb") as f:
                    print(pickle.load(f))
                break
            except Exception as e:
                print(e)
                continue


In [8]:
s = MultiThreadedCrawler(r"https://www.med.cmu.ac.th/web/", 4)
s.run_scraper()


/home/mansmooth/ir-py/hands-on/06/crawled
Adding@3: https://www.facebook.com/medcmuth
Adding@3: https://twitter.com/medcmuofficial
Adding@3: https://www.youtube.com/channel/UCfTy3lgVupz8SJpw4Cc-EQw
Adding@3: https://www.instagram.com/medcmuth/
Adding@3: https://line.me/R/ti/p/@medcmu
Adding@3: https://mail.cmu.ac.th/
Adding@3: https://vpn.med.cmu.ac.th/
Adding@3: https://www.med.cmu.ac.th/web/covid19/
Adding@3: https://www.med.cmu.ac.th/web/med-donate/
Adding@3: https://www.med.cmu.ac.th/main/
Adding@3: https://www.med.cmu.ac.th/web/linkweb/
Adding@3: https://www.med.cmu.ac.th/en/
Adding@3: https://www.med.cmu.ac.th/web/about-medcmu/
Adding@3: javascript:void(0);
Adding@3: https://www.med.cmu.ac.th/web/about-medcmu/the-history-of-med-cmu/
Adding@3: https://www.med.cmu.ac.th/web/about-medcmu/timeline-of-discovery/
Adding@3: https://www.med.cmu.ac.th/web/about-medcmu/message-from-the-dean/
Adding@3: https://www.med.cmu.ac.th/web/about-medcmu/leadership/
Adding@3: javascript:void(0);
Addi

KeyboardInterrupt: 

Adding@0: https://w2.med.cmu.ac.th/omics/staff-th/
Adding@0: https://w2.med.cmu.ac.th/omics/dumnoensun-pruksakorn/
Adding@0: https://w2.med.cmu.ac.th/omics/content-th/
Adding@0: https://w2.med.cmu.ac.th/omics/guildline/
Adding@0: https://w2.med.cmu.ac.th/omics/sru-start-your-biobanking-with-sru/
Adding@0: https://w2.med.cmu.ac.th/omics/sru-the-first-official-human-biobank-in-cmu/
Adding@0: https://w2.med.cmu.ac.th/omics/sru-3-what-you-need-to-know/
Adding@0: https://w2.med.cmu.ac.th/omics/content-th/
Adding@0: https://w2.med.cmu.ac.th/omics/sru-start-your-biobanking-with-sru/
Adding@0: https://www.med.cmu.ac.th/web/#1653651194383-5fd02870-ae4e
No connection adapters were found for 'tel:053936606-7'
Adding@0: https://www.med.cmu.ac.th/web/#vc_images-carousel-1-1707726036
Adding@0: https://www.med.cmu.ac.th/web/#vc_images-carousel-1-1707726036
Adding@0: https://w2.med.cmu.ac.th/omics/clc-genomics-workbench-20-qiagen/
Adding@0: https://w2.med.cmu.ac.th/omics/workshop-gut-microbiota-2/
Add

In [9]:
with open(s.stored_folder / s.pickle_name, "wb") as f:
    pickle.dump(s.crawled_pages, f, pickle.HIGHEST_PROTOCOL)

len(s.crawled_pages)


3223

Adding@0: https://w2.med.cmu.ac.th/trauma-center/event/asset-2566/
Adding@0: https://w2.med.cmu.ac.th/trauma-center/event/asset-2566/
Adding@0: https://w2.med.cmu.ac.th/trauma-center/event/asset-2566/
Adding@0: https://w2.med.cmu.ac.th/trauma-center/event/atls-2-2566-2/
Adding@0: https://w2.med.cmu.ac.th/trauma-center/event/atls-2-2566-2/
Adding@0: https://w2.med.cmu.ac.th/trauma-center/event/atls-2-2566-2/
Adding@0: https://w2.med.cmu.ac.th/trauma-center/event/atls-2-2566/
Adding@0: https://w2.med.cmu.ac.th/trauma-center/event/atls-2-2566/
Adding@0: https://w2.med.cmu.ac.th/trauma-center/event/atls-2-2566/
Adding@0: https://w2.med.cmu.ac.th/trauma-center/event/atls1-2566/
Adding@0: https://w2.med.cmu.ac.th/trauma-center/event/atls1-2566/
Adding@0: https://w2.med.cmu.ac.th/trauma-center/event/atls1-2566/
Adding@0: https://w2.med.cmu.ac.th/trauma-center/event/%e0%b8%8b%e0%b9%89%e0%b8%ad%e0%b8%a1%e0%b9%81%e0%b8%9c%e0%b8%99%e0%b8%af-%e0%b8%9b%e0%b8%b5-2566/
Adding@0: https://w2.med.cmu.ac