In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
import numpy as np
import json
import re
from elasticsearch import Elasticsearch
import logging

# Silence chatty third-party loggers so the notebook output stays readable.
# A descriptive loop name is used instead of `_`: binding `_` at the top level
# of a notebook shadows IPython's "last output" variable.
for noisy_logger in ("boto", "elasticsearch", "urllib3"):
    logging.getLogger(noisy_logger).setLevel(logging.CRITICAL)

# Part 1: Crawler

In [2]:
def crawler(starting_URLs, max_document=2000):
    """Crawl paper pages starting from `starting_URLs` and collect their metadata.

    Every parsed page contributes one record (id, title, authors, date,
    abstract, references) and schedules exactly one further request: the next
    not-yet-requested URL from the shared frontier of discovered references.
    The crawl is stopped once the number of parsed pages (minus one) reaches
    ``max_document - len(starting_URLs)``.

    The collected data is also written to ``papers_index.json`` in the current
    working directory.

    Args:
        starting_URLs: list of seed page URLs.
        max_document: budget used in the stop condition described above.

    Returns:
        dict with a single key ``'articles'`` holding the list of records.
    """
    frontier_urls = []   # reference URLs in discovery order
    seen_urls = set()    # same URLs, for O(1) duplicate checks
    doc_count = [-1]     # boxed so the nested callback can mutate it
    data = {'articles': []}

    def last_segment(url):
        """Return the final path component of `url`, used as the paper id."""
        return url.rstrip('/').rsplit('/', 1)[-1]

    class ArticleSpider(scrapy.Spider):
        name = "article"
        start_urls = starting_URLs

        def parse(self, response):
            # The <pre> block is expected to hold a BibTeX entry whose third
            # line carries the authors; tolerate pages where it is missing.
            bibtex = response.css("#paper-header > pre::text").get() or ""
            bibtex_lines = bibtex.splitlines()
            current_url = response.css("head > link:nth-child(3)").attrib['href']
            current_ref = []

            # Only the first ten citation cards are inspected.
            for i in range(1, 11):
                CSS_selector = "#references > div.card-content > div > div.citation-list__citations > div:nth-child("+str(i)+") > div.citation__body > h2 > a"
                link = response.css(CSS_selector)
                if not link:
                    break
                if "href" in link.attrib:
                    next_url = response.urljoin(link.attrib['href'])
                    current_ref.append(last_segment(next_url))
                    if next_url not in seen_urls:
                        seen_urls.add(next_url)
                        frontier_urls.append(next_url)

            correct_date = response.css("#paper-header > div.flex-row.paper-meta > li:nth-child(2) > span > span > span > span::text").get()
            if correct_date is None:
                correct_date = ''
            elif not correct_date.isdigit():
                # Fall back to the alternative position of the year element.
                correct_date = response.css("#paper-header > div.flex-row.paper-meta > li:nth-child(2) > span > span:nth-child(2) > span > span::text").get()

            # Check the match object itself: calling .group() on a failed
            # search would raise AttributeError.
            author_line = bibtex_lines[2] if len(bibtex_lines) > 2 else ""
            author_match = re.search(r"\{.*\}", author_line)
            if author_match is not None:
                correct_authors = re.split(' and ', author_match.group(0)[1:-1])
            else:
                correct_authors = []

            data['articles'].append({
                'id': last_segment(current_url),
                'title': response.css("#paper-header > h1::text").get(),
                'authors': correct_authors,
                'date': correct_date,
                'abstract': response.css("head > meta:nth-child(7)").attrib['content'],
                'references': current_ref
            })

            doc_count[0] += 1

            if doc_count[0] >= max_document - len(starting_URLs):
                process.stop()
                return  # budget exhausted: do not schedule anything else

            # The frontier may be shorter than the counter when pages list no
            # references; in that case this chain simply ends.
            if doc_count[0] < len(frontier_urls):
                yield scrapy.Request(frontier_urls[doc_count[0]], callback=self.parse)

    process = CrawlerProcess()
    process.crawl(ArticleSpider)
    process.start()

    # `with` guarantees the file is flushed and closed even if dumping fails.
    with open("papers_index.json", 'w') as file:
        file.write(json.dumps(data, indent=4))

    return data

# Part 2: Insert & Delete

In [3]:
def insert_elasticSearch(json_data, host, port):
    """Index every article of `json_data` into the `paper_index` index.

    Each article is stored under the key ``"paper"`` with its position in
    `json_data` as the document id.

    Args:
        json_data: sequence of article records.
        host: Elasticsearch host name.
        port: Elasticsearch port (anything accepted by ``int``).

    Returns:
        The Elasticsearch client that was used for indexing.
    """
    client = Elasticsearch([{'host': host, 'port': int(port)}])

    for doc_id, article in enumerate(json_data):
        client.index(
            index='paper_index',
            doc_type='paper',
            id=doc_id,
            body={"paper": article},
        )

    return client

In [4]:
def delete_elasticSearch(es):
    """Delete the documents with ids ``0 .. count-1`` from `paper_index`.

    The number of documents is read once, up front, from ``es.count``.
    """
    stored = es.count(index='paper_index', doc_type='paper')["count"]
    for doc_id in range(stored):
        es.delete(index='paper_index', doc_type='paper', id=doc_id)

# Part 3: Page Rank

In [5]:
def page_rank(es, alpha, tol=1e-6, max_iter=1000):
    """Compute PageRank over the citation graph stored in `paper_index`.

    Documents are read by id ``0 .. N-1``. A link i -> j exists when paper i
    lists the id of paper j among its references; references to papers
    outside the index are ignored. The resulting score of each paper is
    written back to its document as ``paper.pageRank``.

    Args:
        es: Elasticsearch client (needs ``count``, ``get`` and ``update``).
        alpha: teleport probability mixed into the transition matrix.
        tol: stop when the L2 distance between successive rank vectors is at
            most this value.
        max_iter: upper bound on power-iteration steps.

    Returns:
        numpy array of length N with the rank of each paper (sums to 1);
        an empty array when the index holds no documents.
    """
    N = es.count(index='paper_index', doc_type='paper')["count"]
    if N == 0:
        return np.zeros(0, dtype=float)

    # Fetch every document once and reuse it for both the id map and the links.
    papers = [
        es.get(index='paper_index', doc_type='paper', id=i)['_source']['paper']
        for i in range(N)
    ]
    map_id = {paper['id']: i for i, paper in enumerate(papers)}

    P = np.zeros((N, N), dtype=float)
    for i, paper in enumerate(papers):
        for ref in paper['references']:
            j = map_id.get(ref)
            if j is not None:
                P[i, j] += 1

    # Row-normalise by out-degree so each row is a probability distribution;
    # papers without in-index references jump uniformly to any paper.
    out_degree = P.sum(axis=1)
    linked = out_degree > 0
    P[linked] = P[linked] / out_degree[linked, np.newaxis]
    P[~linked] = 1.0 / N
    P = (1 - alpha) * P + alpha / N

    # Power iteration starting from the uniform distribution.
    v = np.full(N, 1.0 / N, dtype=float)
    for _ in range(max_iter):
        nex_v = np.matmul(v, P)
        converged = np.linalg.norm(nex_v - v) <= tol
        v = nex_v
        if converged:
            break

    for i in range(N):
        es.update(index='paper_index', doc_type='paper', id=i, body={
            # plain float: numpy scalars are not guaranteed to be JSON-serialisable
            'doc': {"paper": {'pageRank': float(v[i])}}
        })
    return v

# Part 4: Search

In [6]:
def search(es, title, w_title, date, w_date, abstract, w_abstract, inv_pageRank=True):
    """Run a weighted `function_score` search on `paper_index` and print the hits.

    Each criterion contributes its weight when its filter matches:
    title and abstract are each tested twice (a fuzzy OR ``match`` and an exact
    ``match_phrase``), and the date filter matches papers whose date is
    greater than ``date - 1``. Function scores are summed and multiplied into
    the query score. When `inv_pageRank` is truthy, an additional
    ``log1p(1e9 * paper.pageRank)`` term is added to the sum.

    Args:
        es: Elasticsearch client (needs ``search``).
        title: text to look for in ``paper.title``.
        w_title: weight of each title filter.
        date: integer year; papers dated ``>= date`` match the date filter.
        w_date: weight of the date filter.
        abstract: text to look for in ``paper.abstract``.
        w_abstract: weight of each abstract filter.
        inv_pageRank: whether to involve the stored PageRank in the score.

    Returns:
        None. The ranked hits are printed to stdout.
    """
    def text_filters(field, text, weight):
        """Fuzzy-match and exact-phrase filters for one text field."""
        return [
            {
                "filter": {"match": {field: {
                    "query": text,
                    "operator": "or",
                    "fuzziness": 1}}},
                "weight": weight
            },
            {
                "filter": {"match_phrase": {field: text}},
                "weight": weight
            },
        ]

    functions = []
    if inv_pageRank:
        functions.append({
            "field_value_factor": {
                "field": "paper.pageRank",
                "factor": 1e9,
                "modifier": "log1p"
            }
        })
    functions.extend(text_filters("paper.title", title, w_title))
    functions.extend(text_filters("paper.abstract", abstract, w_abstract))
    functions.append({
        # "gt date-1" is an inclusive lower bound on the requested year
        "filter": {"range": {"paper.date": {"gt": str(date - 1)}}},
        "weight": w_date
    })

    res = es.search(index='paper_index', doc_type='paper', body={
        "query": {
            "function_score": {
                "functions": functions,
                "score_mode": "sum",
                "boost_mode": "multiply",
            }
        }
    })

    for rank, hit in enumerate(res['hits']['hits'], start=1):
        paper = hit['_source']['paper']
        print(str(rank)+". title:", paper['title'])
        print("abstract:", paper['abstract'])
        print("authors:", paper['authors'])
        print("date:", paper['date'])
        print()

    return

# Part 5: HITS

In [7]:
def HITS(es, n, iterations=5):
    """Rank authors by HITS authority over the author citation graph.

    An edge u -> v exists when some indexed paper by author u references an
    indexed paper by author v. Author names are compared case-insensitively
    (lower-cased). Edge multiplicity is ignored: only whether an edge exists
    matters for the scores.

    Args:
        es: Elasticsearch client (needs ``count`` and ``get``).
        n: number of top authors to return.
        iterations: number of hub/authority update rounds.

    Returns:
        List of at most `n` ``(author, authority_score)`` tuples, highest
        score first. Scores are all ``0.0`` when the graph has no edges.
    """
    N = es.count(index='paper_index', doc_type='paper')["count"]

    # Fetch every document once; the authors of cited papers are looked up
    # from this cache instead of querying Elasticsearch inside the loops.
    papers = [
        es.get(index='paper_index', doc_type='paper', id=i)['_source']['paper']
        for i in range(N)
    ]
    map_id = {paper['id']: i for i, paper in enumerate(papers)}
    paper_authors = [[author.lower() for author in paper['authors']] for paper in papers]

    all_authors = sorted({name for names in paper_authors for name in names})
    M = len(all_authors)
    map_author = {name: idx for idx, name in enumerate(all_authors)}

    # Binary author-to-author adjacency matrix.
    A = np.zeros((M, M), dtype=float)
    for i, paper in enumerate(papers):
        for ref_doc in paper['references']:
            target = map_id.get(ref_doc)
            if target is None:
                continue  # reference points outside the index
            for cur_author in paper_authors[i]:
                for des_author in paper_authors[target]:
                    A[map_author[cur_author], map_author[des_author]] = 1.0

    h = np.ones(M, dtype=float)
    a = np.ones(M, dtype=float)

    for _ in range(iterations):
        # Whole-vector updates: every hub score is computed from the same
        # authority vector, so the result does not depend on author order.
        h = A @ a
        a = A.T @ h

        # Normalise to sum 1; skip when there are no edges to avoid 0/0 -> NaN.
        h_total, a_total = h.sum(), a.sum()
        if h_total > 0:
            h = h / h_total
        if a_total > 0:
            a = a / a_total

    ranked = sorted(zip(all_authors, a.tolist()), key=lambda item: item[1], reverse=True)
    return ranked[:n]

# Console: User Interface

In [8]:
def menu():
    """Print the numbered command list, the suggested order, and the input prompt."""
    commands = "1. Crawl semanticScholar.com (Only Once run this command!)\n2. Insert data to ElasticSearch\n3. Delete data from ElasticSearch\n4. Calculate pageRank\n5. Search\n6. HITS\n7. Exit"
    ordering_hint = "order of computing: 1 -> 2 -> 3 or 4 or 6 or 5"
    prompt = "please enter your command by number (e.g. 1):"
    for line in (commands, ordering_hint, prompt):
        print(line)


# Console state shared with the command loop; 0 means "not set yet".
documents = 0   # crawl result
es = 0          # Elasticsearch client
flag = 0        # becomes 1 once the crawl has run

In [10]:
def _read_number(cast=int):
    """Read one line from stdin and convert it with `cast`, re-asking until it parses."""
    while True:
        raw = input()
        try:
            return cast(raw)
        except ValueError:
            print("Invalid number, please try again:")


# Interactive console: read a command number, run it, then show the menu again.
# `documents`, `es` and `flag` hold 0 until the corresponding step has run.
menu()
while True:
    cmd = _read_number()

    if cmd == 7:
        break

    if cmd == 1:
        if flag == 1:
            print("You've already crawled needed data!")
        else:
            print("Please enter number of pages to crawl (e.g. 200):")
            page_num = _read_number()
            urls = [
                "https://www.semanticscholar.org/paper/The-Lottery-Ticket-Hypothesis%3A-Training-Pruned-Frankle-Carbin/f90720ed12e045ac84beb94c27271d6fb8ad48cf",
                "https://www.semanticscholar.org/paper/Attention-is-All-you-Need-Vaswani-Shazeer/204e3073870fae3d05bcbc2f6a8e263d9b72e776",
                "https://www.semanticscholar.org/paper/BERT%3A-Pre-training-of-Deep-Bidirectional-for-Devlin-Chang/df2b0e26d0599ce3e70df8a9da02e51594e0e992"
            ]
            documents = crawler(urls, page_num)
            flag = 1

    elif cmd == 2:
        # Check for crawled data before asking for connection details.
        if documents == 0:
            print("ERROR, do command 1 first!")
        else:
            print("Make sure that the elasticSearch server is running!")
            print("Host (e.g. localhost):")
            host = input()
            print("Port (e.g. 9200):")
            port = _read_number()
            print("takes a few seconds...(ignore warnings!!)")
            es = insert_elasticSearch(documents['articles'], host, port)

    elif cmd in (3, 4, 5, 6) and es == 0:
        # These commands need the client created by command 2.
        print("ERROR, do command 2 first!")

    elif cmd == 3:
        delete_elasticSearch(es)

    elif cmd == 4:
        print("alpha (e.g. 0.1):")
        alpha = _read_number(float)
        print("Calculating...(ignore warnings!!)")
        page_rank(es, alpha)

    elif cmd == 5:
        print("title:")
        title = input()
        print("weight:")
        w_t = _read_number()
        print("date:")
        date = _read_number()
        print("weight:")
        w_d = _read_number()
        print("abstract:")
        abstract = input()
        print("weight:")
        w_a = _read_number()
        print("Involve pageRank (1 or 0):")
        pr = _read_number() == 1
        print("List of top 10 papers by order:")
        search(es, title, w_t, date, w_d, abstract, w_a, inv_pageRank=pr)

    elif cmd == 6:
        print("top N authors, select N:")
        n = _read_number()
        print("Calculating...(ignore warnings!!)")
        auth = HITS(es, n)
        print("List of top "+str(n)+" authors:")
        for position, (author_name, _score) in enumerate(auth, start=1):
            print(position, author_name)

    else:
        print("Unknown command!")

    print()
    menu()

1. Crawl semanticScholar.com (Only Once run this command!)
2. Insert data to ElasticSearch
3. Delete data from ElasticSearch
4. Calculate pageRank
5. Search
6. HITS
7. Exit
order of computing: 1 -> 2 -> 3 or 4 or 6 or 5
please enter your command by number (e.g. 1):
7
