In [8]:
from elasticsearch7 import Elasticsearch
from elasticsearch7.helpers import scan
from tqdm import tqdm
import math
import random
import pandas as pd

# Silence library warnings so they do not clutter the cell outputs.
import os
import warnings
warnings.filterwarnings("ignore")


INDEX_NAME = "crawler"

# Connection settings are read from the environment so that no secret is stored
# in the notebook (or in its version history):
#   ES_CLOUD_ID  - Elastic Cloud deployment id (required)
#   ES_PASSWORD  - password for the user below (required)
#   ES_USER      - user name (optional, defaults to "elastic")
# NOTE: credentials that were previously hard-coded in this cell should be
# treated as leaked and rotated.
_missing_settings = [name for name in ("ES_CLOUD_ID", "ES_PASSWORD") if not os.environ.get(name)]
if _missing_settings:
    raise RuntimeError(
        "Missing Elasticsearch settings, set these environment variables: "
        + ", ".join(_missing_settings)
    )

es = Elasticsearch(cloud_id=os.environ["ES_CLOUD_ID"],
                   http_auth=(os.environ.get("ES_USER", "elastic"), os.environ["ES_PASSWORD"]))

# Last expression of the cell: displays True when the cluster is reachable.
es.ping()

True

In [9]:
# Link graph of the crawl, keyed by document id:
#   inlinks_dict / outlinks_dict  -> list of linked ids per document
#   inlinks_len  / outlinks_len   -> number of links per document
inlinks_dict = {}
outlinks_dict = {}
inlinks_len = {}
outlinks_len = {}

scroll_size = 1000

# Initialize scroll.
# Only the two link fields are read below, so ask Elasticsearch to return just
# those instead of shipping every document's full _source over the wire.
scroll = scan(
    es,
    index=INDEX_NAME,
    query={"_source": ["inlinks", "outlinks"], "query": {"match_all": {}}},
    scroll='5m',
    size=scroll_size,
)

# Iterate through scroll
for result in tqdm(scroll):
    # Extract document ID (assuming URL is the ID)
    doc_id = result['_id']

    # Extract inlinks and outlinks from the current result.
    # `or []` also covers a field that is present but null, which would
    # otherwise make len() fail.
    source = result.get('_source') or {}
    inlinks = source.get('inlinks') or []
    outlinks = source.get('outlinks') or []

    # Store inlinks and outlinks in separate dictionaries
    inlinks_dict[doc_id] = inlinks
    outlinks_dict[doc_id] = outlinks
    inlinks_len[doc_id] = len(inlinks)
    outlinks_len[doc_id] = len(outlinks)

181761it [02:51, 1056.85it/s]


In [10]:
class Hits:
    """HITS (hub / authority) scoring over a query-focused subgraph of the crawl.

    Construction does all the work:

    1. ``create_root_set`` fetches the ids of the documents matching ``query``.
    2. ``update_set`` is called repeatedly to add out-links and in-links of the
       current base set until it holds at least ``target_size`` pages (or until
       an expansion round adds nothing new).
    3. ``compute_hits`` runs the hub/authority iteration.

    The link graph is read from the module-level ``inlinks_dict`` and
    ``outlinks_dict``; the search goes through the module-level ``es`` client.

    Parameters
    ----------
    query : str
        Text matched against the ``content`` field to build the root set.
    d : int
        Maximum number of links taken from a single page per expansion step;
        longer link lists are randomly sampled down to ``d``.
    target_size : int
        Expansion stops once the base set has at least this many pages.
    iterations : int
        Number of hub/authority update rounds.
    root_size : int
        Maximum number of search hits used for the root set.
    seed : int or None
        When given, link sampling uses a private ``random.Random(seed)`` and
        the base set is traversed in sorted order, so the sampling is
        repeatable for the same link data. When ``None`` the global ``random``
        module state is used.

    Attributes
    ----------
    base_set : set
        Ids of the pages being scored.
    authority_score, hub_score : dict
        Page id -> score.
    """

    def __init__(self, query="swine flu", d=200, target_size=1000,
                 iterations=50, root_size=1000, seed=None):
        self.query = query
        self.d = d
        self.target_size = target_size
        self.iterations = iterations
        self.root_size = root_size
        # Use the shared module-level generator unless a seed was requested.
        self._rng = random if seed is None else random.Random(seed)
        self._sorted_expansion = seed is not None
        self.authority_score = {}
        self.hub_score = {}
        self.base_set = self.create_root_set()

        while len(self.base_set) < self.target_size:
            size_before = len(self.base_set)
            self.update_set()
            # An empty root set or a closed link neighbourhood can never reach
            # target_size; stop instead of looping forever.
            if len(self.base_set) == size_before:
                break
        self.compute_hits()

    def create_root_set(self):
        """Return the set of document ids whose ``content`` matches ``self.query``."""
        response = es.search(index=INDEX_NAME,
                             body={
                                 "size": self.root_size,
                                 "query": {
                                     "match": {"content": self.query}
                                 }
                             })
        return {hit['_id'] for hit in response['hits']['hits']}

    def _neighbours(self, link_map):
        """Collect the links of every base-set page found in ``link_map``.

        At most ``self.d`` randomly chosen links are taken from any one page.
        """
        # Set iteration order of strings changes between interpreter runs, so a
        # seeded run walks the pages in sorted order to keep sampling stable.
        pages = sorted(self.base_set) if self._sorted_expansion else self.base_set
        found = set()
        for doc_id in pages:
            links = link_map.get(doc_id)
            if not links:
                continue
            if len(links) > self.d:
                links = self._rng.sample(links, self.d)
            found.update(links)
        return found

    def update_set(self):
        """Grow ``base_set`` by one round: first out-links, then in-links.

        The in-link step runs on the set already enlarged by the out-link step.
        """
        self.base_set.update(self._neighbours(outlinks_dict))
        self.base_set.update(self._neighbours(inlinks_dict))

    @staticmethod
    def _normalize(scores):
        """Scale ``scores`` in place to unit Euclidean length.

        Left untouched when every score is zero, which would otherwise divide
        by zero (e.g. no links between base-set pages).
        """
        norm = math.sqrt(sum(value ** 2 for value in scores.values()))
        if norm == 0:
            return
        for page in scores:
            scores[page] /= norm

    def compute_hits(self):
        """Run ``self.iterations`` rounds of the hub/authority update.

        Every base-set page starts with both scores at 1. In each round the
        authority of a page becomes the sum of the hub scores of its in-links
        that are in the base set, then the hub score becomes the sum of the
        authority scores of its out-links that are in the base set; each score
        vector is normalised after its update. Pages without an entry in the
        link dictionaries score 0.
        """
        in_base = self.base_set
        # Rebuild the score tables so they hold exactly the base-set pages.
        self.authority_score = dict.fromkeys(in_base, 1)
        self.hub_score = dict.fromkeys(in_base, 1)

        for _ in tqdm(range(self.iterations), desc="Computing Hits"):
            for page in in_base:
                self.authority_score[page] = sum(
                    self.hub_score[source]
                    for source in inlinks_dict.get(page, ())
                    if source in in_base
                )
            self._normalize(self.authority_score)

            for page in in_base:
                self.hub_score[page] = sum(
                    self.authority_score[target]
                    for target in outlinks_dict.get(page, ())
                    if target in in_base
                )
            self._normalize(self.hub_score)

    @staticmethod
    def _write_ranking(path, scores, top_k):
        """Write the ``top_k`` highest-scoring pages to ``path``.

        One line per page: id, score, out-link count, in-link count, separated
        by tabs.
        """
        ranked = sorted(scores, key=scores.get, reverse=True)[:top_k]
        # Pages that were linked to but never crawled have no entry in the
        # length tables; report 0 for them instead of raising KeyError.
        lines = [
            f"{key}\t{scores[key]}\t{outlinks_len.get(key, 0)}\t{inlinks_len.get(key, 0)}"
            for key in ranked
        ]
        # Explicit UTF-8 so the files read back the same way on every platform.
        with open(path, "w", encoding="utf-8") as f:
            f.write("\n".join(lines))

    def write_hits(self, top_k=500, authority_path="authority.txt", hub_path="hub.txt"):
        """Write the ``top_k`` authorities and hubs to their respective files."""
        self._write_ranking(authority_path, self.authority_score, top_k)
        self._write_ranking(hub_path, self.hub_score, top_k)

# Instantiating Hits builds the base set and runs the score iteration (both
# happen inside __init__); write_hits() then writes the top-ranked pages to
# authority.txt and hub.txt.
hits = Hits()
hits.write_hits()

Computing Hits: 100%|██████████| 50/50 [00:00<00:00, 60.23it/s]


In [11]:
# Load the hub ranking (tab-separated, no header row) into a pandas DataFrame
# and label its four columns.
hubs_df = pd.read_csv("hub.txt", sep="\t", header=None).set_axis(
    ["doc_id", "hub_score", "outlinks", "inlinks"],
    axis=1,
)
hubs_df.head(20)

Unnamed: 0,doc_id,hub_score,outlinks,inlinks
0,http://hxnxflu.blogspot.com/2009/07/tamiflu-re...,0.164793,126,35
1,http://hxnxflu.blogspot.com/2009/07/swine-flu-...,0.164793,128,35
2,http://hxnxflu.blogspot.com/2009/07/health-wor...,0.164793,122,35
3,http://hxnxflu.blogspot.com/2009/07/h1n1-vacci...,0.164793,121,35
4,http://hxnxflu.blogspot.com/2009/07/swine-flu-...,0.164793,119,35
5,http://hxnxflu.blogspot.com/2009/07/weekly-sit...,0.164793,121,35
6,http://hxnxflu.blogspot.com/2009/07/bma-warns-...,0.164793,125,35
7,http://hxnxflu.blogspot.com/2009/07/hong-kong-...,0.164793,123,35
8,http://hxnxflu.blogspot.com/2009/07/swine-flu-...,0.164793,126,35
9,http://hxnxflu.blogspot.com/2009/07/overweight...,0.164793,124,35


In [12]:
# Load the authority ranking (tab-separated, no header row) into a pandas
# DataFrame and label its four columns.
authorities_df = pd.read_csv("authority.txt", sep="\t", header=None).set_axis(
    ["doc_id", "authority_score", "outlinks", "inlinks"],
    axis=1,
)
authorities_df.head(20)

Unnamed: 0,doc_id,authority_score,outlinks,inlinks
0,https://en.wikipedia.org/wiki/Flu,0.231653,358,130
1,https://www.nature.com/collections/klkmbfpjdq,0.231381,15,132
2,https://en.wikipedia.org/wiki/2009_swine_flu_o...,0.231381,299,129
3,http://hxnxflu.blogspot.com/2009/04/,0.231381,133,129
4,http://hxnxflu.blogspot.com/2009/07/,0.231381,163,129
5,http://news.bbc.co.uk/2/hi/uk_news/8083179.stm,0.231381,22,129
6,http://hxnxflu.blogspot.com/2009/05/,0.231381,171,129
7,https://en.wikipedia.org/wiki/2009_H1N1_flu_ou...,0.231381,580,129
8,http://hxnxflu.blogspot.com/2009/06/,0.231381,141,129
9,http://hxnxflu.blogspot.com/2009/07/tamiflu-re...,0.126974,126,35
