In [3]:
import numpy as np
import json
from pathlib import Path
import os
import pandas as pd
from scipy import sparse
import math
import sys


## PageRank foundation (pg. 75-76)

In [10]:
x0 = np.ones((1, 7)) / 7
v = np.ones((1, 7)) / 7
L = np.matrix(
    [
        [0, 0, 0, 0, 0, 0, 0],
        [1 / 2, 0, 1 / 2, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0],
        [0, 0, 1 / 2, 0, 0, 0, 1 / 2],
        [1 / 2, 0, 0, 0, 0, 1 / 2, 0],
        [0, 0, 1, 0, 0, 0, 0],
        [0, 0, 1 / 2, 0, 0, 1 / 2, 0],
    ]
)

a = 0.85


rows, cols = np.where(L.sum(1) == 0)
L[rows] = 1 / 7
# P = ((a * L) + (1 - a) * v / 7)
prev_Px = x0
Px = a * x0 * L + (1 - a) * v
i = 0
while any(abs(np.asarray(prev_Px).flatten() - np.asarray(Px).flatten()) > 1e-8):
    i += 1
    prev_Px = Px
    Px = a * Px * L + (1 - a) * v

print("Converged in {0} iterations: {1}".format(i, np.asarray(Px).flatten()))
Px.sum()


Converged in 39 iterations: [0.16911688 0.04196419 0.25324048 0.04196419 0.2572186  0.17669667
 0.05979897]


1.0

In [11]:
L = np.matrix(
    [
        [0, 0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0, 0, 1],
        [1, 0, 0, 0, 0, 1, 0],
        [0, 0, 1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 1, 0],
    ]
)
sL = sparse.lil_matrix(L, dtype=bool)
rows, cols = np.where(sL.sum(1) == 0)
sL[rows, :] = np.ones(sL.shape[0], bool)

n = sL.multiply(sparse.csr_matrix(1 / sL.sum(axis=1)))

sys.getsizeof(n.tocoo())


48

In [12]:
x0 = np.matrix([1 / 7] * 7)
P = np.matrix(
    [
        [1 / 7, 1 / 7, 1 / 7, 1 / 7, 1 / 7, 1 / 7, 1 / 7],
        [25 / 56, 3 / 140, 25 / 56, 3 / 140, 3 / 140, 3 / 140, 3 / 140],
        [3 / 140, 3 / 140, 3 / 140, 3 / 140, 61 / 70, 3 / 140, 3 / 140],
        [3 / 140, 3 / 140, 25 / 56, 3 / 140, 3 / 140, 3 / 140, 25 / 56],
        [25 / 56, 3 / 140, 3 / 140, 3 / 140, 3 / 140, 25 / 56, 3 / 140],
        [3 / 140, 3 / 140, 61 / 70, 3 / 140, 3 / 140, 3 / 140, 3 / 140],
        [3 / 140, 3 / 140, 25 / 56, 3 / 140, 3 / 140, 25 / 56, 3 / 140],
    ]
)

prev_Px = x0
Px = x0 * P
i = 0
while any(abs(np.asarray(prev_Px).flatten() - np.asarray(Px).flatten()) > 1e-8):
    i += 1
    prev_Px = Px
    Px = Px * P

print("Converged in {0} iterations: {1}".format(i, np.asarray(Px).flatten()))
Px.sum()


Converged in 39 iterations: [0.16911688 0.04196419 0.25324048 0.04196419 0.2572186  0.17669667
 0.05979897]


1.0000000000000007

## Integrating PageRank score with the crawled webpage (pg. 77-81)

In [1]:
class Pr:
    def __init__(self, alpha):
        self.crawled_folder = Path(os.path.abspath("")) / "crawled/"
        self.alpha = alpha
        self.url_extract()

    def url_extract(self):
        url_maps = {}
        all_urls = set([])
        for file in os.listdir(self.crawled_folder):
            if file.endswith(".json"):
                try:
                    j = json.load(open(os.path.join(self.crawled_folder, file)))
                    all_urls.add(j["url"])
                    all_urls.update(set(j["url_lists"]))
                    url_maps[j["url"]] = list(set(j["url_lists"]))
                except json.JSONDecodeError:
                    print(file)
        all_urls = list(all_urls)
        self.url_maps = url_maps
        self.all_urls = all_urls

    def pr_calc(self):
        url_maps, all_urls = self.url_maps, self.all_urls
        print(f"{len(all_urls)=}")
        url_idx = {v: i for (i, v) in enumerate(all_urls)}
        size = len(all_urls)
        url_matrix = sparse.lil_array((size, size), dtype=int)
        for url in url_maps:
            if len(url_maps[url]) > 0 and len(all_urls) > 0:
                url_matrix[
                    url_idx[url], [url_idx[sub_url] for sub_url in url_maps[url]]
                ] = 1
        # return url_matrix
        print(f"bytes@prepad: {url_matrix.data.nbytes}")
        rows = np.where(url_matrix.sum(1) == 0)[0]
        url_matrix[rows, :] = np.ones(size, int)
        print(f"bytes@postpad: {url_matrix.data.nbytes}")
        return url_matrix
        url_matrix = (1 / url_matrix.sum(1)) * url_matrix
        print(f"bytes@multiply: {url_matrix.data.nbytes}")
        # x0 = np.repeat(1 / len(all_urls), len(all_urls)).T
        # v = np.repeat(1 / len(all_urls), len(all_urls)).T

        # prev_Px = x0
        # Px = self.alpha * x0 @ url_matrix + (1 - self.alpha) * v
        # print(v.T)
        # i = 0
        # while any(abs(np.asarray(prev_Px).flatten() - np.asarray(Px).flatten()) > 1e-8):
        #     i += 1
        #     prev_Px = Px
        #     Px = self.alpha * Px @ url_matrix + (1 - self.alpha) * v

        # print(
        #     "Converged in {0} iterations: {1}".format(
        #         i, np.around(np.asarray(Px).flatten().astype(float), 5)
        #     )
        # )

        # self.pr_result = pd.Series(Px, index=all_urls)


In [4]:
pr = Pr(alpha=0.85)
out = pr.pr_calc()


len(all_urls)=7044
bytes@prepad: 56352
bytes@postpad: 56352


In [18]:
out[[0], :].multiply(1 )


array([7044])

In [44]:
from elasticsearch import Elasticsearch
import pickle

es = Elasticsearch("https://localhost:9200", basic_auth=("elastic", "+oEqEIt7p6lC_=rI1HIC"), ca_certs="./http_ca.crt")
es.info()


ObjectApiResponse({'name': '30bfd7881813', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'o_7sMfofS_G5l-SWN2IbUA', 'version': {'number': '8.12.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '6185ba65d27469afabc9bc951cded6c17c21e3f3', 'build_date': '2024-02-01T13:07:13.727175297Z', 'build_snapshot': False, 'lucene_version': '9.9.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [45]:
class ElasticIndexer:
    def __init__(self):
        self.crawled_folder = Path(os.path.abspath("")) / "crawled/"
        with open(self.crawled_folder / "url_list.pickle", "rb") as f:
            self.file_mapper = pickle.load(f)
        self.es_client = Elasticsearch(
            "https://localhost:9200",
            basic_auth=("elastic", "+oEqEIt7p6lC_=rI1HIC"),
            ca_certs="./http_ca.crt",
        )

    def run_indexer(self):
        self.pr = Pr(alpha=0.85)
        self.pr.pr_calc()
        self.es_client.options(ignore_status=400).indices.create(index="simple")
        self.es_client.options(ignore_status=[400, 404]).indices.delete(index="simple")
        for file in os.listdir(self.crawled_folder):
            if file.endswith(".json"):
                try:
                    j = json.load(open(os.path.join(self.crawled_folder, file)))
                    j["id"] = j["url"]
                    j["pagerank"] = self.pr.pr_result[j["id"]]
                    self.es_client.index(index="simple", document=j)
                except json.JSONDecodeError as e:
                    print(file)


In [46]:
ei = ElasticIndexer()
ei.run_indexer()


len(all_urls)=7044
6699
377642016
[0.00014196 0.00014196 0.00014196 ... 0.00014196 0.00014196 0.00014196]
Converged in 17 iterations: [0.00014 0.00013 0.00013 ... 0.00028 0.00014 0.00014]


In [60]:
import requests

results = es.search(
    body={
        "_source": {"excludes": ["url_lists"]},
        "query": {
            "script_score": {
                "query": {"match": {"text": "vision"}},
                "script": {"source": "_score * doc['pagerank'].value"},
            }
        },
    },
)
pd.DataFrame(
    [
        [
            hit["_source"]["title"],
            hit["_source"]["url"],
            hit["_source"]["text"][:100],
            hit["_score"],
        ]
        for hit in results["hits"]["hits"]
    ],
    columns=["title", "url", "text", "score"],
)


Unnamed: 0,title,url,text,score
0,Home,https://www.camt.cmu.ac.th/index.php/en/,Home About us Back Visio...,0.001109
1,MIdS : (M)ultidisciplinary and (I)nter(d)iscip...,https://www.mids.cmu.ac.th/,Ask a Question mids@cmu.ac.th ภาษาไทย ...,0.000908
2,วิทยาลัยศิลปะ สื่อและเทคโนโลยี,https://www.camt.cmu.ac.th/index.php/en/compon...,Home About us Back Visio...,0.000724
3,วิทยาลัยศิลปะ สื่อและเทคโนโลยี,https://www.camt.cmu.ac.th/index.php/en/compon...,Home About us Back Visio...,0.000709
4,Home,https://www.camt.cmu.ac.th/index.php/th/?p=&la...,Home About us Back Visio...,0.000558
5,Browse privately in Safari on Mac - Apple Support,https://support.apple.com/kb/ph21413,Apple Store Mac iPad iPhone Watch Vision AirPo...,0.000539
6,Google Workspace: Secure Online Productivity &...,https://workspace.google.com/intl/en/?utm_sour...,Skip to main content Solutions For Ind...,0.000338
7,Cloud Compliance & Regulations Resources | Goo...,https://cloud.google.com/security/compliance?h...,Compliance resource center Google Cloud’s indu...,0.000263
8,Google Cloud Privacy Notice,https://cloud.google.com/terms/cloud-privacy-n...,Overview close Accelerate your digital ...,0.000151


## Flask application (pg. 82-84)


In [64]:
import requests
json.loads(requests.get("http://127.0.0.1:5000/search_es?query=vision").text)



{'elapse': 0.02182459831237793,
 'results': [{'score': 0.0011093874,
   'text': "Home        About us           Back      Vision and Mission        Map        List of CAMT's staff  ",
   'title': 'Home',
   'url': 'https://www.camt.cmu.ac.th/index.php/en/'},
  {'score': 0.00090815144,
   'text': 'Ask a Question  mids@cmu.ac.th       ภาษาไทย   Login  Staff                                         ',
   'title': 'MIdS : (M)ultidisciplinary and (I)nter(d)isciplinary (S)chool',
   'url': 'https://www.mids.cmu.ac.th/'},
  {'score': 0.00072371966,
   'text': "Home        About us           Back      Vision and Mission        Map        List of CAMT's staff  ",
   'title': 'วิทยาลัยศิลปะ สื่อและเทคโนโลยี',
   'url': 'https://www.camt.cmu.ac.th/index.php/en/component/users/?view=remind&Itemid=101'},
  {'score': 0.0007085531,
   'text': "Home        About us           Back      Vision and Mission        Map        List of CAMT's staff  ",
   'title': 'วิทยาลัยศิลปะ สื่อและเทคโนโลยี',
   'url': '