# Libraries

In [17]:
from concurrent.futures import ThreadPoolExecutor
import json
import multiprocessing
import os
import pickle
from queue import Empty
from urllib.parse import urljoin, urlparse
import os
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from ordered_set import OrderedSet
import re
from sklearn.feature_extraction.text import CountVectorizer
import itertools
from string import ascii_lowercase
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from pathlib import Path
from bs4 import BeautifulSoup, Comment
import requests


## Customised BM25

In [18]:
from scipy import sparse
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

class BM25(object):
    def __init__(self, vectorizer, b=0.75, k1=1.6):
        self.vectorizer = vectorizer
        self.b = b
        self.k1 = k1
    def fit(self , x):
        self.vectorizer.fit(x)
        self.y = super(TfidfVectorizer, self.vectorizer).transform(x)
        self.avdl = self.y.sum(1).mean()
    def transform(self , q):
        b, k1, avdl = self.b , self.k1, self.avdl
    
        len_y = self.y.sum(1).A1
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(q)

        y = self.y.tocsc()[:, q.indices]
        denom = y + (k1 * (1 - b + b * len_y / avdl))[:, None]
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1
        numer = y.multiply(np.broadcast_to(idf, y.shape)) * (k1+1)
        return (numer / denom).sum(1).A1

# A simple multithreaded web crawler (P.43)

In [19]:
class MultiThreadCrawler:
    def __init__(self, base_url, depth):
        self.base_url = base_url
        extracted_url = urlparse(base_url)
        parent = extracted_url.path[:extracted_url.path.rfind("/") +  1]
        self.root_url = '{}://{}{}'.format(extracted_url.scheme, extracted_url.netloc, parent)
        self.pool = ThreadPoolExecutor(max_workers= multiprocessing.cpu_count() - 1)
        self.to_crawl = multiprocessing.Queue()
        self.to_crawl.put({self.base_url: depth})
        self.stored_folder = Path(os.path.abspath('')).parent / 'crawled/'

        if not Path(self.stored_folder).exists(): 
            Path.mkdir(self.stored_folder)

        if Path(self.stored_folder / 'url_list.pickle').exists():
            with open(self.stored_folder / 'url_list.pickle', 'rb') as f:
                self.crawled_pages = pickle.load(f)
            print(self.crawled_pages)
        else:
            self.crawled_pages = set([])
    
    def extract_page(self, obj):
        if obj.result():
            result, url, depth = obj.result()
            if result and result.status_code == 200 :
                url_lists = self.parse_links(result.text, depth)
                self.parse_contents(url, result.text, url_lists)

    def get_page(self, url, depth): 
        try:
            res = requests.get(url, timeout=(3,30))
            return res, url, depth
        except requests.RequestException:
            return
    
    def parse_links(self, html, depth):
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        url_lists = []
        for link in links:
            url = link['href']
            url = urljoin(self.root_url, url)
            if not url.endswith("/"):
                url += "/"
            if depth >= 0 and '..' not in url and url not in self.crawled_pages:
                print("Adding {}".format(url))
                self.to_crawl.put({url: depth})
            url_lists.append(url)
        return url_lists
    
    def parse_contents(self, url, html, url_lists):
        def tag_visible(element):
            if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
                return False
            if isinstance(element, Comment):
                return False
            return True
        
        try: 
            soup = BeautifulSoup(html, 'html.parser')
            texts = soup.findAll(string=True)
            visible_texts = filter(tag_visible, texts)
            title = soup.find('title').string.strip()
            text = u" ".join(t.strip() for t in visible_texts).strip()  

            with open(self.stored_folder / (str(hash(url)) + '.txt'), 'w', encoding='utf-8') as f: 
                json.dump({
                    'url': url,
                    'title': title,
                    'text': text,
                    'url_lists': url_lists
                }, f, ensure_ascii=False)
        except:
            pass

    
    def run_scraper(self):
        while True:
            try:
                target = self.to_crawl.get(timeout=10)
                url, depth = [(k, target[k]) for k in target][0]
                if url not in self.crawled_pages:
                    self.crawled_pages.add(url)
                    job = self.pool.submit(self.get_page, url , depth - 1)
                    job.add_done_callback(self.extract_page)
            except Empty:
                with open(self.stored_folder / 'url_list.pickle', 'wb') as f :
                    pickle.dump(self.crawled_pages, f , pickle.HIGHEST_PROTOCOL)
                with open(self.stored_folder/ 'url_list.pickle', 'rb') as f :
                    print(pickle.load(f))
                break
            except Exception as e :
                print(e)
                continue


In [20]:
if __name__ == '__main__':
    s = MultiThreadCrawler("https://camt.cmu.ac.th/index.php/en/", 2)
    s.run_scraper()

{'https://camt.cmu.ac.th/images/gallery_in_article/2024013101/DSC00818.jpg/', 'https://camt.cmu.ac.th/images/gallery_in_article/2024013101/DSC00850.jpg/', 'https://camt.cmu.ac.th/images/gallery_in_article/2024012404/DSC09733.jpg/', 'https://camt.cmu.ac.th/index.php/en/all-news-groups/54-camt-announcement/1111-cancel_midterm_266_954491.html/', 'https://camt.cmu.ac.th/index.php/th/major/bachelor/bachelor-dii/', 'https://camt.cmu.ac.th/images/gallery_in_article/2024020701/DSC02345.jpg/', 'https://camt.cmu.ac.th/index.php/th/หัวข้อกลุ่มข่าวทั้งหมด/24-ข่าวทั่วไป/1142-camt-จัดกิจกรรม-bitdev-game-developer-meetup-2024-พูดคุยแลกเปลี่ยนประสบการณ์จากบริษัทเกม.html/', 'https://camt.cmu.ac.th/index.php/en/view_entry.php?id=170972&area=1&day=12&month=2&year=2024/', 'https://admission.grad.cmu.ac.th/admissions/indexth.php?p=107&id=10582/', 'https://camt.cmu.ac.th/index.php/en/day.php?area=1&room=1/', 'https://www.grad.cmu.ac.th/index.php?lang=en/', 'https://camt.cmu.ac.th/attachments/article/1162/รั

{'https://camt.cmu.ac.th/images/gallery_in_article/2024013101/DSC00818.jpg/', 'https://camt.cmu.ac.th/images/gallery_in_article/2024013101/DSC00850.jpg/', 'https://camt.cmu.ac.th/images/gallery_in_article/2024012404/DSC09733.jpg/', 'https://camt.cmu.ac.th/index.php/en/all-news-groups/54-camt-announcement/1111-cancel_midterm_266_954491.html/', 'https://camt.cmu.ac.th/index.php/th/major/bachelor/bachelor-dii/', 'https://camt.cmu.ac.th/images/gallery_in_article/2024020701/DSC02345.jpg/', 'https://camt.cmu.ac.th/index.php/th/หัวข้อกลุ่มข่าวทั้งหมด/24-ข่าวทั่วไป/1142-camt-จัดกิจกรรม-bitdev-game-developer-meetup-2024-พูดคุยแลกเปลี่ยนประสบการณ์จากบริษัทเกม.html/', 'https://camt.cmu.ac.th/index.php/en/view_entry.php?id=170972&area=1&day=12&month=2&year=2024/', 'https://admission.grad.cmu.ac.th/admissions/indexth.php?p=107&id=10582/', 'https://camt.cmu.ac.th/index.php/en/day.php?area=1&room=1/', 'https://www.grad.cmu.ac.th/index.php?lang=en/', 'https://camt.cmu.ac.th/attachments/article/1162/รั

# Quick workout #1 (P.44)

In [21]:
def custom_preprocessor(s):
    ps = PorterStemmer()
    s = re.sub(r'[^A-Za-z]', ' ', s)
    s = re.sub(r'\s+', ' ', s)
    s = word_tokenize(s)
    s = [word for word in s if len(word)>2]
    s = [ps.stem(w) for w in s]
    s = ' '.join(s)
    return s

In [22]:
class Indexer:
    def __init__(self):
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/'
        self.stored_file = 'src/resource/manual_indexer.pkl'
            
        if os.path.isfile(self.stored_file):
            with open(self.stored_file, 'rb') as f:
                cached_dict = pickle.load(f)
            self.__dict__.update(cached_dict)
        else:
            self.run_indexer()

    def run_indexer(self):
        documents = []
        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                j = json.load(open(os.path.join(self.crawled_folder, file)))
                documents.append(j)
                    
        self.documents = pd.DataFrame.from_dict(documents)
            
        tfidf_vectorizer = TfidfVectorizer(preprocessor=custom_preprocessor,stop_words=stopwords.words('english'))
        self.bm25 = BM25(tfidf_vectorizer)
        self.bm25.fit(self.documents.apply(lambda s: ' '.join(s[['title', 'text']]), axis=1))
            
        with open(self.stored_file, 'wb') as f:
            pickle.dump(self.__dict__, f)
    
    def bm25_search(self, keyword):
        scores = self.bm25.transform(keyword)
        return self.documents.join(pd.DataFrame(scores, columns=["score"])).sort_values("score", ascending=False).head(5)


In [23]:
indextor = Indexer()
keyword = "school"

indextor.bm25_search(keyword)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,url,title,text,url_lists,score
132,https://go.camt.cmu.ac.th/index.php/th/2019-05...,Gifted School 2020,"Choose your language ไทย li dir=""ltr"" ...","[http://www.go-camt.com/index.php/th/, http://...",4.983037
30,https://www.grad.cmu.ac.th/index.php?lang=en/,"Graduate School, Chiang Mai University",MIdS : Multidisciplinary and Interdisciplinary...,"[https://cmu.to/admission/, https://w3.grad.cm...",4.908884
120,https://service.camt.cmu.ac.th/gifted/,Gift School 2023,<< คลิกที่นี่ >> ระบบรับสมัคร Gifted School | ...,[https://service.camt.cmu.ac.th/gifted/gifted/...,4.755403
62,https://go.camt.cmu.ac.th/index.php/th/major/g...,การจัดการความรู้และนวัตกรรม ป.โท,Choose your language ไทย English (UK)...,"[https://go.camt.cmu.ac.th/index.php/th/, http...",3.883824
178,https://go.camt.cmu.ac.th/index.php/th/major/g...,การจัดการความรู้และนวัตกรรม ป.เอก,Choose your language ไทย English (UK)...,"[https://go.camt.cmu.ac.th/index.php/th/, http...",3.815028


# A simple indexer (P.50)

In [24]:
from elasticsearch import Elasticsearch 
es = Elasticsearch(
    "https://localhost:9200",
    basic_auth=("elastic", "gKzjmdkd=QKeITShRujT"),
    ca_certs="../http_ca.crt"
)
es.info()

ObjectApiResponse({'name': '0f389090e4cd', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'QiDNW7YgRQmSYLxT3deyQQ', 'version': {'number': '8.11.4', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'da06c53fd49b7e676ccf8a32d6655c5155c16d81', 'build_date': '2024-01-08T10:05:08.438562403Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [25]:
class Indexer:
    def __init__(self):
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/'
        with open(self.crawled_folder / 'url_list.pickle', 'rb') as f:
            self.file_mapper = pickle.load(f)
        self.es_client = Elasticsearch(
            "https://localhost:9200",
            basic_auth=("elastic", "gKzjmdkd=QKeITShRujT"),
            ca_certs="../http_ca.crt"
        )
        
    def run_indexer(self):
        self.es_client.options(ignore_status=400).indices.create(index='simple')
        self.es_client.options(ignore_status=[400, 404]).indices.delete(index='simple')
        
        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                with open(os.path.join(self.crawled_folder, file)) as f:
                    j = json.load(f)
                    j['id'] = j['url']
                    # print(j)
                    self.es_client.index(index='simple', document=j)


camt

In [26]:
s = Indexer()
s.run_indexer()
query = {'bool': {'must': [{'match': {'text': 'camt'}}]}}
results = s.es_client.search(index='simple', body={'query': query})
print("Got %d Hits:" % results['hits']['total']['value'])
for hit in results['hits']['hits']:
    print("The title is '{0} ({1})'.".format(hit["_source"]['title'], hit["_source"]['url']))

Got 142 Hits:
The title is 'ประกาศ (https://camt.cmu.ac.th/index.php/th/2-uncategorised/324-ประกาศ.html/)'.
The title is 'หน้าหลัก (https://camt.cmu.ac.th/index.php/th/)'.
The title is 'หน้าหลัก (https://www.camt.cmu.ac.th/)'.
The title is 'หน้าหลัก (https://camt.cmu.ac.th/)'.
The title is 'Home (https://camt.cmu.ac.th/index.php/en/?p=calendar/)'.
The title is 'Home (https://camt.cmu.ac.th/index.php/en/?p=news_detail&key=a323ffb935e16217d23ba8b431ff0574/)'.
The title is 'Home (https://camt.cmu.ac.th/index.php/en/?p=download_documents_list&key=5c5e5b8cb9463a6938cd508e6eef950f/)'.
The title is 'Home (https://camt.cmu.ac.th/index.php/en/?p=service_catalog/)'.
The title is 'Home (https://camt.cmu.ac.th/index.php/en/#admin/)'.
The title is 'Home (https://camt.cmu.ac.th/index.php/en/#philosophy/)'.


examination

In [27]:
s = Indexer()
s.run_indexer()
query = {'bool': {'must': [{'match': {'text': 'examination'}}]}}
results = s.es_client.search(index='simple', body={'query': query})
print("Got %d Hits:" % results['hits']['total']['value'])
for hit in results['hits']['hits']:
    print("The title is '{0} ({1})'.".format(hit["_source"]['title'], hit["_source"]['url']))

Got 0 Hits:


.*vision.*

In [28]:
s = Indexer()
s.run_indexer()
query = {'regexp': { "text": ".*vision.*"}}
results = s.es_client.search(index='simple', body={'query': query})
print("Got %d Hits:" % results['hits']['total']['value'])
for hit in results['hits']['hits']:
    print("The title is '{0} ({1})'.".format(hit["_source"]['title'], hit["_source"]['url']))

Got 101 Hits:
The title is 'Home (https://camt.cmu.ac.th/index.php/en/?p=calendar/)'.
The title is 'Google Terms of Service – Privacy & Terms – Google (https://accounts.google.com/TOS?loc=TH&hl=en-US&privacy=true/)'.
The title is 'Home (https://camt.cmu.ac.th/index.php/en/?p=news_detail&key=a323ffb935e16217d23ba8b431ff0574/)'.
The title is 'All Download (https://camt.cmu.ac.th/index.php/en/all-download/category/27-logo.html/)'.
The title is 'Home (https://camt.cmu.ac.th/index.php/en/?p=download_documents_list&key=5c5e5b8cb9463a6938cd508e6eef950f/)'.
The title is 'All Download (https://camt.cmu.ac.th/index.php/en/all-download/category/34-general.html/)'.
The title is 'CAMT จัดกิจกรรม Workshop Lean Project 2024 และ FIN Café ครั้งที่ 8 (https://camt.cmu.ac.th/index.php/en/all-news-groups/24-ข่าวทั่วไป/1133-camt-จัดกิจกรรม-workshop-lean-project-2024-และ-fin-café-ครั้งที่-8.html/)'.
The title is 'Home (https://camt.cmu.ac.th/index.php/en/?p=service_catalog/)'.
The title is 'ข่าวบริการการศึก

vision

In [29]:
s = Indexer()
s.run_indexer()
query = {'regexp': { "text": "vision"}}
results = s.es_client.search(index='simple', body={'query': query})
print("Got %d Hits:" % results['hits']['total']['value'])
for hit in results['hits']['hits']:
    print("The title is '{0} ({1})'.".format(hit["_source"]['title'], hit["_source"]['url']))

Got 114 Hits:
The title is 'Home (https://camt.cmu.ac.th/index.php/en/?p=calendar/)'.
The title is 'Home (https://camt.cmu.ac.th/index.php/en/?p=news_detail&key=a323ffb935e16217d23ba8b431ff0574/)'.
The title is 'All Download (https://camt.cmu.ac.th/index.php/en/all-download/category/27-logo.html/)'.
The title is 'Home (https://camt.cmu.ac.th/index.php/en/?p=download_documents_list&key=5c5e5b8cb9463a6938cd508e6eef950f/)'.
The title is 'All Download (https://camt.cmu.ac.th/index.php/en/all-download/category/34-general.html/)'.
The title is 'CAMT จัดกิจกรรม Workshop Lean Project 2024 และ FIN Café ครั้งที่ 8 (https://camt.cmu.ac.th/index.php/en/all-news-groups/24-ข่าวทั่วไป/1133-camt-จัดกิจกรรม-workshop-lean-project-2024-และ-fin-café-ครั้งที่-8.html/)'.
The title is 'Home (https://camt.cmu.ac.th/index.php/en/?p=service_catalog/)'.
The title is 'ข่าวบริการการศึกษา (https://camt.cmu.ac.th/index.php/en/all-news-groups/17-ข่าวบริการการศึกษา.html/)'.
The title is 'ข่าวสมัครงาน (https://camt.cmu

In [30]:
from flask import Flask, request
import time
app = Flask(__name__)
app.es_client = Elasticsearch(
            "https://localhost:9200",
            basic_auth=("elastic", "gKzjmdkd=QKeITShRujT"),
            ca_certs="../http_ca.crt"
        )
@app.route('/search_es', methods=['GET'])
def search_es():
    start = time.time()
    response_object = {'status': 'success'}
    argList = request.args.to_dict(flat=False)
    query_term = argList['query'][0]
    results = app.es_client.search(index='simple', source_excludes=['url_lists'], size=100, query={"match": { "text": query_term}})
    end = time.time()
    total_hit = results['hits']['total']['value']
    results_df = pd.DataFrame([[hit["_source"]['title'], hit["_source"]['url'], hit["_source"]['text'][:100], hit["_score"]] for hit in results['hits']['hits']], columns=['title', 'url', 'text', 'score'])
    
    response_object['total_hit'] = total_hit
    response_object['results'] = results_df.to_dict('records')
    response_object['elapse'] = end - start

    return response_object

In [31]:
if __name__ == '__main__':
    app.run(debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m


# Quick exercise (P.59)

In [32]:

class Indexer:
    def __init__(self):
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/'
        self.stored_file = 'src/resource/manual_indexer.pkl'
            
        if os.path.isfile(self.stored_file):
            with open(self.stored_file, 'rb') as f:
                cached_dict = pickle.load(f)
            self.__dict__.update(cached_dict)
        else:
            self.run_indexer()

    def run_indexer(self):
        documents = []
        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                j = json.load(open(os.path.join(self.crawled_folder, file)))
                documents.append(j)
                    
        self.documents = pd.DataFrame.from_dict(documents)
            
        tfidf_vectorizer = TfidfVectorizer(preprocessor=custom_preprocessor,stop_words=stopwords.words('english'))
        self.bm25 = BM25(tfidf_vectorizer)
        self.bm25.fit(self.documents.apply(lambda s: ' '.join(s[['title', 'text']]), axis=1))
            
        with open(self.stored_file, 'wb') as f:
            pickle.dump(self.__dict__, f)
    
    def search(self, q):
        scores = self.bm25.transform(q)
        hit = (scores > 0).sum()
        rank = scores.argsort()[::-1][:hit]
        results = self.documents.iloc[rank].copy().reset_index(drop = True)
        results['score'] = scores[rank]
        return results


In [33]:
from flask import Flask, request
import time
app = Flask(__name__)
app.es_client = Elasticsearch(
            "https://localhost:9200",
            basic_auth=("elastic", "gKzjmdkd=QKeITShRujT"),
            ca_certs="../http_ca.crt"
        )

app.manual_indexer = Indexer()
@app.route('/search_manual', methods=['GET'])

def search_manual():
    start = time.time()
    response_object = {'status': 'success'}
    argList = request.args.to_dict(flat=False)
    query_term = argList['query'][0]
    results = app.manual_indexer.search(query_term)
    end = time.time()
    total_hit = len(results)
    results_df = results.drop('url_lists',axis=1)
    
    response_object['total_hit'] = total_hit
    response_object['results'] = results_df.to_dict('records')
    response_object['elapse'] = end - start

    return response_object







https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [34]:
if __name__ == '__main__':
    app.run(debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m


# Rank page tutorial (P.75-76)

In [35]:
x0 = np.matrix([1/7] * 7)
P = np.matrix([
    [1/7, 1/7, 1/7, 1/7, 1/7, 1/7, 1/7],
    [25/56, 3/140, 25/56, 3/140, 3/140, 3/140, 3/140],
    [3/140, 3/140, 3/140, 3/140, 61/70, 3/140, 3/140],
    [3/140, 3/140, 25/56, 3/140, 3/140, 3/140, 25/56],
    [25/56, 3/140, 3/140, 3/140, 3/140, 25/56, 3/140],
    [3/140, 3/140, 61/70, 3/140, 3/140, 3/140, 3/140],
    [3/140, 3/140, 25/56, 3/140, 3/140, 25/56, 3/140],
])

# After 1 step
result_1 = x0 * P

# After 2 steps
result_2 = x0 * P * P

# After 3 steps
result_3 = x0 * P * P * P

print("After 1 step:")
print(result_1)
print("\nAfter 2 steps:")
print(result_2)
print("\nAfter 3 steps:")
print(result_3)


After 1 step:
[[0.16020408 0.03877551 0.34234694 0.03877551 0.16020408 0.16020408
  0.0994898 ]]

After 2 steps:
[[0.12544825 0.04088192 0.25229774 0.04088192 0.33187682 0.15125182
  0.05736152]]

After 3 steps:
[[0.19508404 0.03666157 0.2243539  0.03666157 0.25111465 0.20208787
  0.05403639]]


In [36]:
prev_Px = x0
Px = x0*P
i=0
while(any(abs(np.asarray(prev_Px).flatten()-np.asarray(Px).flatten()) > 1e-8)):
    i+=1
    prev_Px = Px
    Px = Px * P

print('\nConverged in {0} iterations: {1}'.format(i, np.asarray(Px).flatten()))


Converged in 39 iterations: [0.16911688 0.04196419 0.25324048 0.04196419 0.2572186  0.17669667
 0.05979897]


# Integrating PageRank score with the crawled webpage (pg. 77-81)

In [37]:
class Pr:
    def __init__(self, alpha):
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/'
        self.alpha = alpha

    def url_extractor(self):
        url_maps = {}
        all_urls = set([])

        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                j = json.load(open(os.path.join(self.crawled_folder, file)))
                all_urls.add(j['url'])
                for s in j['url_lists']:
                    all_urls.add(s)
                url_maps[j['url']] = list(set(j['url_lists']))

        all_urls = list(all_urls)
        return url_maps, all_urls
    
    def pr_calc(self):
        url_maps, all_urls = self.url_extractor()
        url_matrix = pd.DataFrame(columns=all_urls, index=all_urls)

        for url in url_maps:
            if len(url_maps[url]) > 0 and len(all_urls) > 0:
                url_matrix.loc[url] = (1 - self.alpha) * (1 / len(all_urls))
                url_matrix.loc[url, url_maps[url]] += self.alpha * (1 / len(url_maps[url]))

        url_matrix.loc[url_matrix.isnull().all(axis=1), :] = (1 / len(all_urls))

        x0 = np.matrix([1 / len(all_urls)] * len(all_urls))
        P = np.asmatrix(url_matrix.values)

        prev_Px = x0
        Px = x0 * P
        i = 0
        while any(abs(np.asarray(prev_Px).flatten() - np.asarray(Px).flatten()) > 1e-8):
            i += 1
            prev_Px = Px
            Px = Px * P

        print('Converged in {0} iterations: {1}'.format(i, np.around(np.asarray(Px).flatten().astype(float), 5)))

        self.pr_result = pd.DataFrame(Px, columns=url_matrix.index, index=['score']).T.loc[list(url_maps.keys())]

In [38]:
s = Pr(alpha=0.85)
s.pr_calc()

Converged in 18 iterations: [0.0007  0.00028 0.00028 ... 0.00028 0.00028 0.00028]


In [39]:
print(s.pr_result.sort_values(by='score', ascending=False))

                                                       score
https://camt.cmu.ac.th/index.php/en/                0.008039
https://camt.cmu.ac.th/                             0.005122
http://www.faboba.com/                              0.003984
https://camt.cmu.ac.th/index.php/th/                0.003711
http://go.camt.cmu.ac.th/                           0.003599
...                                                      ...
https://camt.cmu.ac.th/index.php/en/all-news-gr...  0.000283
https://camt.cmu.ac.th/index.php/en/all-news-gr...  0.000283
https://camt.cmu.ac.th/index.php/en/all-news-gr...  0.000283
https://camt.cmu.ac.th/index.php/en/all-news-gr...  0.000283
https://camt.cmu.ac.th/index.php/en/all-news-gr...  0.000283

[361 rows x 1 columns]


In [43]:
class Indexer:
    def __init__(self):
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/'
        with open(self.crawled_folder / 'url_list.pickle', 'rb') as f:
            self.file_mapper = pickle.load(f)
        self.es_client = Elasticsearch("https://localhost:9200", basic_auth=("elastic", "gKzjmdkd=QKeITShRujT"), ca_certs="../http_ca.crt")

    def run_indexer(self):
        self.pr = Pr(alpha=0.85)
        self.pr.pr_calc()
        self.es_client.indices.create(index='simple', ignore=400)
        self.es_client.indices.delete(index='simple', ignore=[400, 404])

        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                j = json.load(open(os.path.join(self.crawled_folder, file)))
                j['id'] = j['url']
                j['pagerank'] = self.pr.pr_result.loc[j['id']].score
                print(j)
                self.es_client.index(index='simple', body=j)

In [44]:
s = Indexer()
s.run_indexer()

Converged in 18 iterations: [0.0007  0.00028 0.00028 ... 0.00028 0.00028 0.00028]
{'url': 'https://camt.cmu.ac.th/index.php/th/หัวข้อกลุ่มข่าวทั้งหมด/19-ข่าวประกวดราคา/1151-bidding_vr_25.html/', 'title': 'ประกาศประกวดราคาจัดซื้อเครื่องฉายเสมือนจริง Virtual Reality Headset จำนวน 25 เครื่อง', 'text': 'หน้าหลัก        รู้จักเรา           Back      วิสัยทัศน์และพันธกิจ        แผนที่        รายชื่อบุคลากร        โครงสร้างองค์กร              หลักสูตร และ การรับสมัคร        นักศึกษา           Back      ระบบขอหนังสือรับรอง        ดาวน์โหลดเอกสารเบิกค่าการศึกษาบุตรข้าราชการ        ดาวน์โหลดเอกสารสำหรับนักศึกษา        ลงทะเบียนเข้าร่วม WIL        ข้อร้องเรียนจากนักศึกษา              All Download        การชำระเงิน        ติดต่อเรา         Language           Back      THAI        ENG                        Back       เข้าสู่ระบบ         ชื่อสมาชิก             รหัสผ่าน        จำการเข้าระบบ      เข้าสู่ระบบ      ลืมชื่อผู้ใช้?    ลืมรหัสผ่าน?                       ข้อมูลสาธารณะ           Back      

  self.es_client.indices.create(index='simple', ignore=400)
  self.es_client.indices.delete(index='simple', ignore=[400, 404])


{'url': 'https://camt.cmu.ac.th/index.php/th/หัวข้อกลุ่มข่าวทั้งหมด/24-ข่าวทั่วไป/1134-ขอแสดงความยินดี-กับหลักสูตรในระดับบัณฑิตศึกษา-camt-ที่ได้รับรางวัลหลักสูตรดีเด่น-%20ประเภทยอดนิยม-ประจำปีการศึกษา-2565.html/', 'title': 'ขอแสดงความยินดี กับหลักสูตรในระดับบัณฑิตศึกษา CAMT ที่ได้รับรางวัลหลักสูตรดีเด่น  \xa0ประเภทยอดนิยม ประจำปีการศึกษา 2565', 'text': 'หน้าหลัก        รู้จักเรา           Back      วิสัยทัศน์และพันธกิจ        แผนที่        รายชื่อบุคลากร        โครงสร้างองค์กร              หลักสูตร และ การรับสมัคร        นักศึกษา           Back      ระบบขอหนังสือรับรอง        ดาวน์โหลดเอกสารเบิกค่าการศึกษาบุตรข้าราชการ        ดาวน์โหลดเอกสารสำหรับนักศึกษา        ลงทะเบียนเข้าร่วม WIL        ข้อร้องเรียนจากนักศึกษา              All Download        การชำระเงิน        ติดต่อเรา         Language           Back      THAI        ENG                        Back       เข้าสู่ระบบ         ชื่อสมาชิก             รหัสผ่าน        จำการเข้าระบบ      เข้าสู่ระบบ      ลืมชื่อผู้ใช้?    ลืมรหัสผ่าน?  

In [47]:
from flask import Flask, request
import time

app = Flask(__name__)
app.es_client = Elasticsearch(
            "https://localhost:9200",
            basic_auth=("elastic", "gKzjmdkd=QKeITShRujT"),
            ca_certs="../http_ca.crt"
        )
@app.route('/search', methods=['GET'])
def search():
    start = time.time()
    response_object = {'status': 'success'}
    argList = request.args.to_dict(flat=False)
    query_term = argList['query'][0]

    results = app.es_client.search(index='simple', source_excludes=['url_lists'], size=100,
                                    query={"script_score": {"query": {"match": {"text": query_term}},
                                                           "script": {"source": "_score * doc['pagerank'].value"}}})

    end = time.time()
    total_hit = results['hits']['total']['value']
    results_df = pd.DataFrame([[hit["_source"]['title'], hit["_source"]['url'], hit["_source"]['text'][:100], hit["_score"]] for hit in results['hits']['hits']],
                                columns=['title', 'url', 'text', 'score'])

    response_object['total_hit'] = total_hit
    response_object['results'] = results_df.to_dict('records')
    response_object['elapse'] = end - start

    return response_object


In [51]:
app = Flask(__name__)
app.es_client = Elasticsearch("https://localhost:9200", basic_auth=("elastic", "gKzjmdkd=QKeITShRujT"), ca_certs="../http_ca.crt")


@app.route('/search', methods=['GET'])
def search():
    start = time.time()
    response_object = {'status': 'success'}
    argList = request.args.to_dict(flat=False)
    query_term=argList['query'][0]
    results = app.es_client.search(index='simple', source_excludes=['url_lists'], size=100,
    query={"script_score": {"query": { "match": { "text": query_term } }, "script": {"source": "_score * doc['pagerank'].value"}}})
    end = time.time()
    total_hit = results['hits']['total']['value']
    results_df = pd.DataFrame([[hit["_source"]['title'], hit["_source"]['url'], hit["_source"]['text'][:100], hit["_score"]] for hit in results['hits']['hits']], columns=['title', 'url', 'text', 'score'])

    response_object['total_hit'] = total_hit
    response_object['results'] = results_df.to_dict('records')
    response_object['elapse'] = end - start

    return response_object

In [52]:
if __name__ == '__main__':
    app.run(debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
127.0.0.1 - - [19/Feb/2024 16:44:54] "[33mGET / HTTP/1.1[0m" 404 -
127.0.0.1 - - [19/Feb/2024 16:44:54] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
127.0.0.1 - - [19/Feb/2024 16:45:18] "GET /search?query=xทพ HTTP/1.1" 200 -
127.0.0.1 - - [19/Feb/2024 16:45:27] "GET /search?query=camt HTTP/1.1" 200 -
127.0.0.1 - - [19/Feb/2024 16:45:29] "GET /search?query=camt HTTP/1.1" 200 -
127.0.0.1 - - [19/Feb/2024 16:45:43] "GET /search?query=camt HTTP/1.1" 200 -
127.0.0.1 - - [19/Feb/2024 16:46:09] "GET /search?query=camt HTTP/1.1" 200 -
127.0.0.1 - - [19/Feb/2024 16:48:05] "GET /search?query=school HTTP/1.1" 200 -
