In [3]:
import hashlib
import json
import multiprocessing
import os
import pickle
import threading
import time
from concurrent.futures import ThreadPoolExecutor, wait
from pathlib import Path
from queue import Queue, Empty
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup, Comment


## Develop a simple multithreaded web crawler (pg 36-41)

In [4]:
class MultiThreadedCrawler:
    """Depth-limited web crawler that fetches pages on a thread pool.

    Pages are downloaded by ``get_page`` on worker threads; each finished
    download is handed to ``extract_page``, which queues newly discovered
    links and writes the page's title, visible text and outgoing links to a
    JSON file inside ``stored_folder``. The set of successfully crawled URLs
    is pickled to ``stored_folder / pickle_name`` when ``run_scraper`` ends
    and is reloaded by the next instance, so finished pages are not fetched
    again.
    """

    def __init__(self, base_url, depth):
        """Set up the queue, worker pool and on-disk state.

        Args:
            base_url: URL at which the crawl starts.
            depth: Depth budget attached to ``base_url``; every hop away from
                it lowers the budget by one and links are no longer queued
                once it drops below zero.
        """
        self.base_url = base_url
        extracted_url = urlparse(self.base_url)
        # Directory part of the start URL (everything up to the last "/").
        parent = extracted_url.path[: extracted_url.path.rfind("/") + 1]
        self.root_url = f"{extracted_url.scheme}://{extracted_url.netloc}{parent}"
        self.pool = ThreadPoolExecutor(max_workers=multiprocessing.cpu_count())
        self.to_crawl = Queue()
        self.to_crawl.put({self.base_url: depth})
        self.stored_folder = Path(os.path.abspath("")) / "crawled/"
        self.pickle_name = "url_list.pickle"
        self.pending_crawl = set()
        self.failed_crawl = set()
        self.depth = depth
        self.job_counter = 0
        # job_counter is incremented by run_scraper and decremented by
        # done-callbacks that may run on worker threads; "+=" / "-=" are not
        # atomic, so every update goes through this lock.
        self._counter_lock = threading.Lock()
        print(self.stored_folder)
        self.stored_folder.mkdir(parents=True, exist_ok=True)
        pickle_path = self.stored_folder / self.pickle_name
        if pickle_path.exists():
            # NOTE: pickle.load can execute arbitrary code. This is only
            # acceptable because the file is written by this crawler itself;
            # never point it at a pickle from an untrusted source.
            with open(pickle_path, "rb") as f:
                self.crawled_pages = pickle.load(f)
            print(self.crawled_pages)
        else:
            self.crawled_pages = set()

    def _is_known(self, url):
        """Return True if ``url`` was already crawled, is in flight, or failed."""
        # Three O(1) membership tests instead of building a merged set per call.
        return (
            url in self.crawled_pages
            or url in self.pending_crawl
            or url in self.failed_crawl
        )

    def _page_filename(self, url):
        """Return a file name for ``url`` that is stable across interpreter runs.

        The built-in ``hash`` is salted per process for ``str``, so it cannot
        be used to name files that must be found again after a restart.
        """
        return hashlib.sha256(url.encode("utf-8")).hexdigest() + ".json"

    def extract_page(self, obj):
        """Done-callback: process one finished ``get_page`` future.

        Args:
            obj: Object exposing ``result()`` that returns either ``None``
                (fetch failed) or a ``(response, url, depth)`` tuple.

        Any error raised while reading or processing the result is printed
        rather than propagated, and ``job_counter`` is always decremented so
        ``run_scraper`` cannot wait forever on a job that crashed.
        """
        try:
            outcome = obj.result()
            if outcome:
                result, url, depth = outcome
                self.pending_crawl.discard(url)
                if result and result.status_code == 200:
                    self.crawled_pages.add(url)
                    # Relative links must be resolved against the page that
                    # contains them (after redirects), not the crawl root.
                    url_lists = self.parse_links(
                        result.text, depth, page_url=result.url or url
                    )
                    self.parse_contents(url, result.text, url_lists)
        except Exception as e:
            print(e)
        finally:
            with self._counter_lock:
                self.job_counter -= 1

    def get_page(self, url, depth):
        """Download ``url``.

        Returns:
            ``(response, url, depth)`` on success, or ``None`` when
            ``requests`` raises a ``RequestException``; in that case the URL
            is recorded in ``failed_crawl`` and the error is printed.
        """
        try:
            res = requests.get(url, timeout=(3, 30))
            return res, url, depth
        except requests.RequestException as e:
            self.failed_crawl.add(url)
            # The fetch is over, so the URL is no longer in flight.
            self.pending_crawl.discard(url)
            print(e)
            return None

    def parse_links(self, html, depth, page_url=None):
        """Collect the links of a page and queue the ones still worth crawling.

        Args:
            html: HTML source of the page.
            depth: Remaining depth budget to attach to queued links; nothing
                is queued when it is negative.
            page_url: URL of the page ``html`` came from, used as the base for
                relative links. Falls back to ``root_url`` when omitted.

        Returns:
            Every ``<a href>`` target on the page as an absolute URL, in
            document order (including ones that were not queued).
        """
        soup = BeautifulSoup(html, "html.parser")
        base = page_url or self.root_url
        url_lists = []
        for link in soup.find_all("a", href=True):
            url = urljoin(base, link["href"])
            if (
                depth >= 0
                and not self._is_known(url)
                # Skip targets whose last path segment contains a dot.
                and r"." not in url.split(r"/")[-1]
            ):
                print("Adding@{}: {}".format(depth, url))
                self.to_crawl.put({url: depth})
            url_lists.append(url)
        return url_lists

    def parse_contents(self, url, html, url_lists):
        """Write the page's title, visible text and links to a JSON file.

        Best effort: a failure to parse or write is printed and the page is
        skipped; it never interrupts the crawl.
        """

        def tag_visible(element):
            # Text inside these containers is not rendered to the reader.
            if element.parent.name in [
                "style",
                "script",
                "head",
                "title",
                "meta",
                "[document]",
            ]:
                return False
            if isinstance(element, Comment):
                return False
            return True

        try:
            soup = BeautifulSoup(html, "html.parser")
            texts = soup.find_all(string=True)
            visible_texts = filter(tag_visible, texts)
            title_tag = soup.find("title")
            # get_text() copes with an empty or nested <title>, where
            # ``.string`` is None and ``.string.strip()`` would raise.
            title = title_tag.get_text(strip=True) if title_tag is not None else ""
            text = " ".join(t.strip() for t in visible_texts).strip()
            with open(
                self.stored_folder / self._page_filename(url), "w", encoding="utf-8"
            ) as f:
                json.dump(
                    {"url": url, "title": title, "text": text, "url_lists": url_lists},
                    f,
                    ensure_ascii=False,
                    indent=4,
                )
        except Exception as e:
            print(f"Could not store {url}: {e}")

    def run_scraper(self):
        """Run the crawl until the queue stays empty and all jobs are done.

        Blocks up to 30 seconds for the next queued URL. On timeout it waits
        for in-flight jobs; if those jobs queued more links the crawl resumes,
        otherwise the crawled-URL set is pickled, echoed back, and the method
        returns.
        """
        while True:
            try:
                target = self.to_crawl.get(timeout=30)
                url, depth = next(iter(target.items()))
                # The start URL (depth == self.depth) is always fetched, even
                # if a previous run already recorded it.
                if (not self._is_known(url) and depth >= 0) or depth == self.depth:
                    self.pending_crawl.add(url)
                    with self._counter_lock:
                        self.job_counter += 1
                    try:
                        job = self.pool.submit(self.get_page, url, depth - 1)
                    except Exception:
                        # Nothing was scheduled: undo the bookkeeping so the
                        # drain loop below does not wait on a phantom job.
                        with self._counter_lock:
                            self.job_counter -= 1
                        self.pending_crawl.discard(url)
                        raise
                    job.add_done_callback(self.extract_page)
            except Empty:
                while self.job_counter > 0:
                    time.sleep(1)
                    print(f"{self.job_counter = }")
                # Jobs that finished during the drain may have queued links.
                if not self.to_crawl.empty():
                    continue
                with open(self.stored_folder / self.pickle_name, "wb") as f:
                    pickle.dump(self.crawled_pages, f, pickle.HIGHEST_PROTOCOL)
                with open(self.stored_folder / self.pickle_name, "rb") as f:
                    print(pickle.load(f))
                break
            except Exception as e:
                print(e)
                continue


In [5]:
# Crawl the CAMT Thai landing page with a depth budget of 3.
START_URL = r"https://www.camt.cmu.ac.th/index.php/th/"
MAX_DEPTH = 3

s = MultiThreadedCrawler(base_url=START_URL, depth=MAX_DEPTH)
s.run_scraper()


/home/mansmooth/ir-py/hands-on/06/crawled


Adding@2: https://service.camt.cmu.ac.th/studentForm/
Adding@2: https://docs.google.com/forms/d/e/1FAIpQLSed6GzXMcp02eiQybekUX0FNg8BR6e46SqIZE1Vxog-szqaiw/viewform
Adding@2: https://www.camt.cmu.ac.th/index.php/en/
Adding@2: https://www.camt.cmu.ac.th/index.php/th/component/users/?view=remind&Itemid=101
Adding@2: https://www.camt.cmu.ac.th/index.php/th/component/users/?view=reset&Itemid=101
Adding@2: https://ditc.camt.cmu.ac.th/
Adding@2: http://meetingroom.camt.cmu.ac.th/
Adding@2: https://smartoffice.camt.cmu.ac.th/v1r
Adding@2: https://service.camt.cmu.ac.th/onestop
Adding@2: https://service.camt.cmu.ac.th/studentForm/
Adding@2: https://lib.camt.cmu.ac.th/
Adding@2: https://cmu.to/CAMT-SLRF
Adding@2: https://pandit.camt.cmu.ac.th/
Adding@2: https://service.camt.cmu.ac.th/iso/
Adding@2: https://service.camt.cmu.ac.th/gifted
Adding@2: https://web.microsoftstream.com/channel/a9f73640-9088-4ee8-b1eb-098e15ca8a68
Adding@2: https://cmu.to/camt-files
Adding@2: https://www.camt.cmu.ac.th/
A

In [6]:
# Persist the crawled-URL set again, then show how many pages it holds.
pickle_path = s.stored_folder / s.pickle_name
with pickle_path.open("wb") as pickle_file:
    pickle.dump(s.crawled_pages, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

len(s.crawled_pages)


359