In [103]:
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import numpy as np
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import pickle
import os
import json
from pathlib import Path
from flask import Flask, request
from elasticsearch import Elasticsearch
import time
from sklearn.preprocessing import minmax_scale
import re


class PR:
    def __init__(self, alpha):
        self.crawled_folder = Path(os.path.abspath("")) / "crawled/"
        self.alpha = alpha
        self.url_extract()

    def url_extract(self):
        url_maps = {}
        all_urls = set([])
        for file in os.listdir(self.crawled_folder):
            if file.endswith(".json"):
                try:
                    j = json.load(open(os.path.join(self.crawled_folder, file)))
                    all_urls.add(j["url"])
                    all_urls.update(set(j["url_lists"]))
                    url_maps[j["url"]] = list(set(j["url_lists"]))
                except json.JSONDecodeError:
                    print(file)
        all_urls = list(all_urls)
        self.url_maps = url_maps
        self.all_urls = all_urls

    def pr_calc(self):
        url_maps, all_urls = self.url_maps, self.all_urls
        print(f"{len(all_urls)=}")
        url_idx = {v: i for (i, v) in enumerate(all_urls)}
        size = len(all_urls)
        url_matrix = sparse.lil_array((size, size), dtype=int)
        for url in url_maps:
            if len(url_maps[url]) > 0 and len(all_urls) > 0:
                url_matrix[
                    url_idx[url], [url_idx[sub_url] for sub_url in url_maps[url]]
                ] = 1
        # return url_matrix
        print(f"bytes@prepad: {url_matrix.data.nbytes}")
        rows = np.where(url_matrix.sum(1) == 0)[0]
        url_matrix[rows, :] = np.ones(size, int)
        print(f"bytes@postpad: {url_matrix.data.nbytes}")
        url_matrix = url_matrix * sparse.coo_array(1 / url_matrix.sum(axis=1)).T
        print(f"bytes@multiply: {url_matrix.data.nbytes}")

        x0 = np.repeat(1 / len(all_urls), len(all_urls)).T
        v = np.repeat(1 / len(all_urls), len(all_urls)).T

        prev_Px = x0
        Px = self.alpha * x0 @ url_matrix + (1 - self.alpha) * v
        i = 0
        while any(abs(np.asarray(prev_Px).flatten() - np.asarray(Px).flatten()) > 1e-8):
            i += 1
            prev_Px = Px
            Px = self.alpha * Px @ url_matrix + (1 - self.alpha) * v

        print(
            "Converged in {0} iterations: {1}".format(
                i, np.around(np.asarray(Px).flatten().astype(float), 5)
            )
        )

        self.pr_result = pd.Series(minmax_scale(Px), index=all_urls)


class BM25(object):
    def __init__(self, fitted_vectorizer: TfidfVectorizer, b=0.75, k1=1.6):
        self.fitted_vectorizer = fitted_vectorizer
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """Fit IDF to documents X"""
        self.y = super(TfidfVectorizer, self.fitted_vectorizer).transform(X)
        self.avdl = self.y.sum(1).mean()

    def transform(self, q):
        """Calculate BM25 between query q and documents X"""
        b, k1, avdl = self.b, self.k1, self.avdl

        # apply CountVectorizer
        len_y = self.y.sum(1).A1
        (q,) = super(TfidfVectorizer, self.fitted_vectorizer).transform(q)
        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing
        y = self.y.tocsc()[:, q.indices]
        denom = y + (k1 * (1 - b + b * len_y / avdl))[:, None]
        idf = self.fitted_vectorizer._tfidf.idf_[None, q.indices] - 1.0
        numer = y.multiply(np.broadcast_to(idf, y.shape)) * (k1 + 1)
        return (numer / denom).sum(1).A1


def custom_preprocessor(s: str):
    lemmatizer = WordNetLemmatizer()
    s = re.sub(r"[^A-Za-z]", " ", s)
    s = re.sub(r"\s+", " ", s)
    s = re.sub(" +", " ", s)
    s = s.lower()
    s = word_tokenize(s)
    s = set(s).difference(set(stopwords.words("english")))
    s = [word for word in s if len(word) > 2]
    s = [lemmatizer.lemmatize(w) for w in s]
    s = " ".join(s)
    return s


class ManualIndexer:
    def __init__(self):
        self.crawled_folder = Path(os.path.abspath("")) / "crawled/"
        self.stored_file = "indexer/manual_indexer.pickle"
        if not Path(self.stored_file).parent.exists():
            Path.mkdir(Path(self.stored_file).parent)
        if os.path.isfile(self.stored_file):
            with open(self.stored_file, "rb") as f:
                cached_dict = pickle.load(f)
            self.__dict__.update(cached_dict)
        else:
            self.run_indexer()

    def run_indexer(self):
        documents = []
        for file in os.listdir(self.crawled_folder):
            if file.endswith(".json"):
                j = json.load(open(os.path.join(self.crawled_folder, file)))
                documents.append(j)
        self.documents = pd.DataFrame.from_dict(documents)
        tfidf_vectorizor = TfidfVectorizer(
            preprocessor=custom_preprocessor, stop_words=stopwords.words("english")
        )
        tfidf_vectorizor.fit(
            self.documents.apply(lambda s: " ".join(s[["title", "text"]]), axis=1)
        )
        self.bm25 = BM25(tfidf_vectorizor)
        self.bm25.fit(
            self.documents.apply(lambda s: " ".join(s[["title", "text"]]), axis=1)
        )
        self.pr = PR(0.8)
        self.pr.pr_calc()
        with open(self.stored_file, "wb") as f:
            pickle.dump(self.__dict__, f)

    def search(self, query):
        scores = minmax_scale(self.bm25.transform([query]))
        df = pd.DataFrame(scores, columns=["score"])
        result_df = self.documents.join(df)
        result_df["score"] = result_df.apply(
            lambda x: self.pr.pr_result[x["url"]] * x["score"], axis=1
        )
        result_df = result_df.sort_values("score", ascending=False).head(20)
        result_df["text_highlight"] = result_df["text"].apply(
            lambda x: [
                f"...{x[span[0] - 24: span[0]]}<b>{x[span[0] : span[1]]}</b>{x[span[1]: span[1] + 24]}..."
                for span in [
                    word_span
                    for word_spans in [
                        [
                            m.span()
                            for m in re.finditer(rf"\b{q_word}\b", x, re.IGNORECASE)
                        ]
                        for q_word in query.split()
                    ]
                    for word_span in word_spans
                ]
            ]
        )
        return result_df


In [104]:
manual_indexer = ManualIndexer()


len(all_urls)=7044
bytes@prepad: 56352
bytes@postpad: 56352
bytes@multiply: 377642016
Converged in 16 iterations: [0.00013 0.00014 0.00014 ... 0.00013 0.00014 0.00014]


In [102]:
query = "graduate school"
manual_indexer.search(query)


Unnamed: 0,url,title,text,url_lists,score,text_highlight
209,https://service.camt.cmu.ac.th/iso/home/lecturers,"ISO 30401:2018 - College of Arts, Media and Te...",Power BI Home Our Program Context manual ...,[https://www.camt.cmu.ac.th/index.php/th/index...,0.197608,[]
75,https://www.grad.cmu.ac.th/,"Graduate School, Chiang Mai University",MIdS : Multidisciplinary and Interdisciplinary...,"[https://cmu.to/admission/, https://w3.grad.cm...",0.069104,[...rsity Admission 2024 <b>Graduate</b> Sc...
82,https://pandit.camt.cmu.ac.th/home/rule,"CAMT : College of Arts, Media and Technology",Home Regulations Knowledge Lecturers Struc...,"[https://pandit.camt.cmu.ac.th/home/index, htt...",0.051672,[...iversity Regulations on <b>Graduate</b> St...
19,https://accounts.google.com/TOS?loc=TH&hl=en-US,Google Terms of Service – Privacy & Terms – Go...,Privacy & Terms Sign in Overview Privacy Polic...,"[https://www.camt.cmu.ac.th/, https://www.goog...",0.034935,"[...oration, non-profit, or <b>school</b>) and..."
90,https://w3.grad.cmu.ac.th/,"Graduate School, Chiang Mai University",,[],0.032362,[]
303,http://cmuir.cmu.ac.th/,CMU Intellectual Repository: Home,Skip navigation Home Browse C...,[https://www.camt.cmu.ac.th/index.php/th/#cont...,0.032304,[...se its collections. <b>Graduate</b> Sc...
342,https://service.camt.cmu.ac.th/gifted,Gift School 2023,monk4d https://164.90.134.40/ https://159.22...,"[https://monk4dmain.com/, https://164.90.134.4...",0.030293,[... >> ระบบรับสมัคร Gifted <b>School</b> | Pr...
213,https://www.mids.cmu.ac.th/,MIdS : (M)ultidisciplinary and (I)nter(d)iscip...,Ask a Question mids@cmu.ac.th ภาษาไทย ...,"[javascript:;, javascript:;, https://www.camt....",0.016306,[... View More <b>Graduate</b> Sc...
144,https://cmu.to/admission/,การรับนักเรียน/บุคคล เข้าศึกษาต่อมหาวิทยาลัยเช...,การรับสมัครคัดเลือกบุคคลเข้าศึกษาต่อมหาวิทยาลั...,"[https://admission.grad.cmu.ac.th/, https://ww...",0.01569,[... ระดับบัณฑิตศึกษา / <b>Graduate</b> Le...
158,https://www.google.com/accounts/TOS,Google Terms of Service – Privacy & Terms – Go...,Privacy & Terms Sign in Overview Privacy Polic...,"[https://www.camt.cmu.ac.th/, https://www.goog...",0.012498,"[...oration, non-profit, or <b>school</b>) and..."


In [107]:
scores = minmax_scale(manual_indexer.bm25.transform([query]))
df = pd.DataFrame(scores, columns=["score"])
result_df = manual_indexer.documents.join(df)
result_df.set_index("url")["score"]


url
https://www.reg.cmu.ac.th/webreg/th/links/                                                                                                                                                                           0.0
https://www1.reg.cmu.ac.th/degreeverification/                                                                                                                                                                       0.0
https://www.youtube.com/channel/UCBmwzQnSoj9b6HzNmFrg_yw                                                                                                                                                             0.0
https://www.camt.cmu.ac.th/index.php/th/#dtm                                                                                                                                                                         0.0
https://transparency.google/?hl=en_US                                                                                           