In [119]:
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor, wait
import multiprocessing
from queue import Queue, Empty
from pathlib import Path
import os
import pickle
import requests
from bs4 import BeautifulSoup, Comment
import json


## Develop a simple multithreaded web crawler (pg 36-41)

In [120]:
class MultiThreadedCrawler:
    def __init__(self, base_url, depth):
        self.base_url = base_url
        extracted_url = urlparse(self.base_url)
        parent = extracted_url.path[: extracted_url.path.rfind("/") + 1]
        self.root_url = f"{extracted_url.scheme}://{extracted_url.netloc}{parent}"
        self.pool = ThreadPoolExecutor(max_workers=multiprocessing.cpu_count())
        self.to_crawl = Queue()
        self.to_crawl.put({self.base_url: depth})
        self.stored_folder = Path(os.path.abspath("")) / "crawled/"
        self.pickle_name=f"{str(hash(self.root_url))}@{depth}.pickle"
        self.pending_crawl = set()
        self.depth = depth
        print(self.stored_folder)
        if not Path(self.stored_folder).exists():
            Path.mkdir(self.stored_folder)
        if Path(self.stored_folder / self.pickle_name).exists():
            with open(self.stored_folder / self.pickle_name, "rb") as f:
                self.crawled_pages = pickle.load(f)
            print(self.crawled_pages)
        else:
            self.crawled_pages = set()

    def extract_page(self, obj):
        if obj.result():
            result, url, depth = obj.result()
            self.pending_crawl.remove(url)
            if result and result.status_code == 200:
                self.crawled_pages.add(url)
                url_lists = self.parse_links(result.text, depth)
                self.parse_contents(url, result.text, url_lists)

    def get_page(self, url, depth):
        try:
            res = requests.get(url, timeout=(3, 30))
            return res, url, depth
        except requests.RequestException as e:
            print(e)
            return

    def parse_links(self, html, depth):
        soup = BeautifulSoup(html, "html.parser")
        links = soup.find_all("a", href=True)
        url_lists = []
        for link in links:
            url = link["href"]
            url = urljoin(self.root_url, url)
            if url not in self.crawled_pages.union(self.pending_crawl) and depth >= 0 and r"." not in url.split(r"/")[-1]:
                print("Adding@{}: {}".format(depth, url))
                self.to_crawl.put({url: depth})
            url_lists.append(url)
        return url_lists

    def parse_contents(self, url, html, url_lists):
        def tag_visible(element):
            if element.parent.name in [
                "style",
                "script",
                "head",
                "title",
                "meta",
                "[document]",
            ]:
                return False
            if isinstance(element, Comment):
                return False
            return True

        try:
            soup = BeautifulSoup(html, "html.parser")
            texts = soup.findAll(string=True)
            visible_texts = filter(tag_visible, texts)
            title = ""
            if soup.find("title"):
                title = soup.find("title").string.strip()
            text = " ".join(t.strip() for t in visible_texts).strip()
            with open(
                self.stored_folder / (str(hash(url)) + ".json"), "w", encoding="utf-8"
            ) as f:
                json.dump(
                    {"url": url, "title": title, "text": text, "url_lists": url_lists},
                    f,
                    ensure_ascii=False,
                    indent=4,
                )
        except:
            pass

    def run_scraper(self):
        while True:
            try:
                target = self.to_crawl.get(timeout=10)
                url, depth = [(k, target[k]) for k in target][0]
                if (url not in self.crawled_pages.union(self.pending_crawl) and depth >= 0) or depth == self.depth:
                    self.pending_crawl.add(url)
                    job = self.pool.submit(self.get_page, url, depth - 1)
                    job.add_done_callback(self.extract_page)
            except Empty:
                self.pool.shutdown(wait=True)
                with open(self.stored_folder / self.pickle_name, "wb") as f:
                    pickle.dump(self.crawled_pages, f, pickle.HIGHEST_PROTOCOL)
                with open(self.stored_folder / self.pickle_name, "rb") as f:
                    print(pickle.load(f))
                break
            except Exception as e:
                print(e)
                continue


In [121]:
s = MultiThreadedCrawler("https://camt.cmu.ac.th/index.php/en/", 3)
s.run_scraper()


/home/mansmooth/ir-py/hands-on/06/crawled
Adding@2: https://service.camt.cmu.ac.th/studentForm/
Adding@2: https://service.camt.cmu.ac.th/complaint/
Adding@2: https://camt.cmu.ac.th/index.php/th/
Adding@2: https://camt.cmu.ac.th/index.php/en/component/users/?view=remind&Itemid=101
Adding@2: https://camt.cmu.ac.th/index.php/en/component/users/?view=reset&Itemid=101
Adding@2: https://ditc.camt.cmu.ac.th/
Adding@2: http://meetingroom.camt.cmu.ac.th/
Adding@2: https://smartoffice.camt.cmu.ac.th/v1r
Adding@2: https://service.camt.cmu.ac.th/onestop
Adding@2: https://service.camt.cmu.ac.th/studentForm/
Adding@2: https://lib.camt.cmu.ac.th/
Adding@2: https://cmu.to/CAMT-SLRF
Adding@2: https://pandit.camt.cmu.ac.th/
Adding@2: https://service.camt.cmu.ac.th/iso/
Adding@2: https://service.camt.cmu.ac.th/gifted
Adding@2: https://web.microsoftstream.com/channel/a9f73640-9088-4ee8-b1eb-098e15ca8a68
Adding@2: https://cmu.to/camt-files
Adding@2: https://camt.cmu.ac.th/
Adding@2: https://camt.cmu.ac.th/

## Develop a simple web indexer (pg 43)

In [122]:
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import numpy as np
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


In [123]:
class BM25(object):
    def __init__(self, fitted_vectorizer: TfidfVectorizer, b=0.75, k1=1.6):
        self.fitted_vectorizer = fitted_vectorizer
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """Fit IDF to documents X"""
        self.y = super(TfidfVectorizer, self.fitted_vectorizer).transform(X)
        self.avdl = self.y.sum(1).mean()

    def transform(self, q):
        """Calculate BM25 between query q and documents X"""
        b, k1, avdl = self.b, self.k1, self.avdl

        # apply CountVectorizer
        len_y = self.y.sum(1).A1
        (q,) = super(TfidfVectorizer, self.fitted_vectorizer).transform(q)
        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing
        y = self.y.tocsc()[:, q.indices]
        denom = y + (k1 * (1 - b + b * len_y / avdl))[:, None]
        idf = self.fitted_vectorizer._tfidf.idf_[None, q.indices] - 1.0
        numer = y.multiply(np.broadcast_to(idf, y.shape)) * (k1 + 1)
        return (numer / denom).sum(1).A1


In [124]:
def custom_preprocessor(s: str):
    lemmatizer = WordNetLemmatizer()
    s = re.sub(r"[^A-Za-z]", " ", s)
    s = re.sub(r"\s+", " ", s)
    s = re.sub(" +", " ", s)
    s = s.lower()
    s = word_tokenize(s)
    s = set(s).difference(set(stopwords.words("english")))
    s = [word for word in s if len(word) > 2]
    s = [lemmatizer.lemmatize(w) for w in s]
    s = " ".join(s)
    return s


In [125]:
class WebIndexer:
    def __init__(self):
        self.crawled_folder = Path(os.path.abspath('')) / 'crawled/'
        self.stored_file = 'indexer/manual_indexer.pickle'
        if not Path(self.stored_file).parent.exists():
            Path.mkdir(Path(self.stored_file).parent)
        if os.path.isfile(self.stored_file):
            with open(self.stored_file, 'rb') as f:
                cached_dict = pickle.load(f)
            self.__dict__.update(cached_dict)
        else:
            self.run_indexer()
    def run_indexer(self):
        documents = []
        for file in os.listdir(self.crawled_folder):
            if file.endswith(".json"):
                j = json.load(open(os.path.join(self.crawled_folder, file)))
                documents.append(j)
        self.documents = pd.DataFrame.from_dict(documents)
        tfidf_vectorizor = TfidfVectorizer(preprocessor=custom_preprocessor, stop_words=stopwords.words('english'))
        tfidf_vectorizor.fit(self.documents.apply(lambda s: ' '.join(s[['title', 'text']]), axis=1))
        self.bm25 = BM25(tfidf_vectorizor)
        self.bm25.fit(self.documents.apply(lambda s: ' '.join(s[['title', 'text']]), axis=1))
        with open(self.stored_file, 'wb') as f:
            pickle.dump(self.__dict__, f)
    def search(self, query):
        scores = self.bm25.transform([query])
        df = pd.DataFrame(scores, columns=["score"])
        return self.documents.join(df)



## Quick workout #1: Search using a query ‘school’ with BM25. (pg 44)

In [127]:
wi = WebIndexer()
wi.search("School").sort_values("score", ascending=False).head(20)


Unnamed: 0,url,title,text,url_lists,score
214,https://service.camt.cmu.ac.th/gifted,Gift School 2023,<< คลิกที่นี่ >> ระบบรับสมัคร Gifted School | ...,[https://service.camt.cmu.ac.th/gifted/gifted/...,4.376591
175,https://w3.grad.cmu.ac.th/,"Graduate School, Chiang Mai University",,[],4.376591
26,https://www.grad.cmu.ac.th/,"Graduate School, Chiang Mai University",MIdS : Multidisciplinary and Interdisciplinary...,"[https://cmu.to/admission/, https://w3.grad.cm...",4.27399
133,https://service.camt.cmu.ac.th/gifted/gifted/i...,Gift School 2566,Login Form เลขบัตรประชาชน 13 หลัก ดู P...,[],4.234285
25,https://pandit.camt.cmu.ac.th/home/rule,"CAMT : College of Arts, Media and Technology",Home Regulations Knowledge Lecturers Struc...,"[https://pandit.camt.cmu.ac.th/home/index, htt...",3.299405
171,http://cmuir.cmu.ac.th/,CMU Intellectual Repository: Home,Skip navigation Home Browse C...,"[https://camt.cmu.ac.th/index.php/en/#content,...",3.056056
152,https://support.google.com/a?p=privpol_admin&h...,Who is my administrator? - Google Workspace Ad...,Skip to main content Google Workspace Admin...,[https://camt.cmu.ac.th/index.php/en/#hcfe-con...,2.846137
64,https://support.google.com/groups/answer/81275...,Report abuse or legal issue - Google Groups Help,Skip to main content Google Groups Help Sig...,[https://camt.cmu.ac.th/index.php/en/#hcfe-con...,2.663204
71,https://support.google.com/googleplay?p=privpo...,How to rate & review apps in the Google Play S...,Skip to main content Google Play Help Sign ...,[https://camt.cmu.ac.th/index.php/en/#hcfe-con...,2.60238
285,https://www.mids.cmu.ac.th/,MIdS : (M)ultidisciplinary and (I)nter(d)iscip...,Ask a Question mids@cmu.ac.th ภาษาไทย ...,"[javascript:;, javascript:;, https://camt.cmu....",2.558554
