In [1]:
import json
from elasticsearch import Elasticsearch, helpers
from tqdm.notebook import tqdm
import time

# Loading data into Elasticsearch: experiments

In [3]:
# Client for the Elasticsearch node on localhost:9200.
es = Elasticsearch(hosts=[{'host': 'localhost', 'port': 9200}])

In [10]:
# Remove the experimental index; HTTP 400/404 responses are passed in
# `ignore` so the client does not raise on them.
es.indices.delete(
    index='dblp_v12_v2_test',
    ignore=[400, 404],
)

{'acknowledged': True}

In [11]:
# Index settings for the experiment: a single shard and no replicas.
settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
}

# Create the experimental index; 400/404 responses are ignored by the client.
es.indices.create(
    index='dblp_v12_v2_test',
    body=settings,
    ignore=[400, 404],
)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'dblp_v12_v2_test'}

In [12]:
# Read the preprocessed subset into memory and show how many records it holds.
with open('./datasets/preprocessed_dblp_v12_subset.json') as file:
    test_set = json.loads(file.read())

len(test_set)

22726

In [13]:
# Upload the test subset to Elasticsearch in batches.
bulk = []
for paper in tqdm(test_set):
    formatted_paper = {
        'paper_id': paper['id'],
        'title': paper['title'],
        'abstract': paper['abstract'],
        'references': paper['references'],
        'citations': paper['citations'],
        'fos': paper['fos'],
        # Union of outgoing and incoming links, with duplicates removed.
        'linked_papers': list(set(paper['references'] + paper['citations'])),
    }

    bulk.append(formatted_paper)

    # Flush once the pending batch grows past 5000 documents.
    if len(bulk) > 5000:
        helpers.bulk(es, bulk, index='dblp_v12_v2_test', doc_type='paper')
        bulk = []

# Send the remaining documents. The original tested the undefined name
# `papers_bulk` here, which raised NameError and dropped the last batch.
if bulk:
    helpers.bulk(es, bulk, index='dblp_v12_v2_test', doc_type='paper')

HBox(children=(FloatProgress(value=0.0, max=22726.0), HTML(value='')))




# Loading data into Elasticsearch

In [2]:
# Configuration for the full load.
DATASET_PATH = "./datasets/dblp-aminer_v12/dblp.v12.json"  # input dump
N_RECORDS = 4_894_081                                      # number of record lines read from the dump
INDEX_NAME = "dblp_v12_v2"                                 # target Elasticsearch index

In [3]:
def _iter_papers(path, n_records):
    """Yield `n_records` parsed papers from the dump at `path`.

    The first line of the file is skipped. Each following line is parsed
    without its first character; if that is not valid JSON, the whole line
    is parsed instead.
    # NOTE(review): presumably most lines carry a one-character separator
    # prefix (e.g. a comma) — confirm against the dataset format.

    Raises:
        json.JSONDecodeError: if a line is valid JSON in neither form
            (including when the file has fewer than `n_records` lines).
    """
    with open(path) as file:
        file.readline()  # skip first line

        for _ in range(n_records):
            line = file.readline()
            try:
                paper = json.loads(line[1:])
            # Only parse failures trigger the fallback; a bare `except:` would
            # also swallow KeyboardInterrupt during this multi-million-line loop.
            except json.JSONDecodeError:
                paper = json.loads(line)
            yield paper


# Maps a paper id to the list of ids of the papers that reference it.
paper_cit_dict = {}

# adding papers with at least 1 citation
for paper in tqdm(_iter_papers(DATASET_PATH, N_RECORDS), total=N_RECORDS):
    for ref_id in paper.get('references', []):
        paper_cit_dict.setdefault(ref_id, []).append(paper['id'])

# adding papers with no citation
# (second pass, so that cited papers keep their place at the front of the dict)
for paper in tqdm(_iter_papers(DATASET_PATH, N_RECORDS), total=N_RECORDS):
    paper_cit_dict.setdefault(paper['id'], [])

len(paper_cit_dict)

HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))




4894081

In [None]:
# Split the citation map into two halves (by insertion order) and write each
# half to its own JSON file.
# NOTE(review): json.dump serialises dict keys as strings, so non-string ids
# will be read back as str — confirm the loader accounts for this.
cit_items = list(paper_cit_dict.items())
half = len(cit_items) // 2
paper_cit_dict_p1 = dict(cit_items[:half])
paper_cit_dict_p2 = dict(cit_items[half:])
del cit_items  # drop the temporary copy of all (id, citations) pairs

for part_path, part in (
    ("./datasets/dblp-aminer_v12/paper_cit_dict_p1.json", paper_cit_dict_p1),
    ("./datasets/dblp-aminer_v12/paper_cit_dict_p2.json", paper_cit_dict_p2),
):
    with open(part_path, 'w') as file:
        json.dump(part, file)

In [50]:
# Client for the Elasticsearch node on localhost:9200.
es = Elasticsearch(hosts=[{'host': 'localhost', 'port': 9200}])

In [51]:
# Remove the target index before rebuilding it; HTTP 400/404 responses are
# passed in `ignore` so the client does not raise on them.
es.indices.delete(
    index=INDEX_NAME,
    ignore=[400, 404],
)

{'acknowledged': True}

In [52]:
# Index definition: one shard, no replicas, and a scripted similarity named
# "scripted_tfidf" that is applied to the `title` and `abstract` text fields.
settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "similarity": {
            "scripted_tfidf": {
                "type": "scripted",
                "script": {
                    # score = boost * sqrt(tf) * (log((N+1)/(df+1)) + 1) / sqrt(len)
                    # The adjacent literals concatenate into one script string.
                    "source": (
                        "double tf = Math.sqrt(doc.freq); "
                        "double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; "
                        "double norm = 1/Math.sqrt(doc.length); "
                        "return query.boost * tf * idf * norm;"
                    ),
                },
            },
        },
    },
    "mappings": {
        "properties": {
            "title": {"type": "text", "similarity": "scripted_tfidf"},
            "abstract": {"type": "text", "similarity": "scripted_tfidf"},
        },
    },
}

# Create the index; 400/404 responses are ignored by the client.
es.indices.create(
    index=INDEX_NAME,
    body=settings,
    ignore=[400, 404],
)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'dblp_v12_v2'}

In [15]:
def reconstruct_abstract(inverted_index):
    """Rebuild a text from a word -> positions inverted index.

    Args:
        inverted_index: mapping whose keys are words and whose values are
            iterables of the positions at which the word occurs.

    Returns:
        The words joined by single spaces in ascending position order.
        If several words claim the same position, the one iterated last
        wins. An empty mapping yields an empty string.
    """
    word_at = {
        position: token
        for token, positions in inverted_index.items()
        for position in positions
    }
    return " ".join(word_at[position] for position in sorted(word_at))

In [53]:
# Stream the dump, convert every record to the indexed document shape and
# send the documents to Elasticsearch in batches.
with open(DATASET_PATH) as file:
    bulk = []
    file.readline() # skip first line

    for _ in tqdm(range(N_RECORDS)): # n_papers
        line = file.readline()
        # Parse the line without its first character, falling back to the
        # whole line. Only JSON parse errors trigger the fallback, so that
        # KeyboardInterrupt and unrelated failures are not swallowed.
        try:
            paper = json.loads(line[1:])
        except json.JSONDecodeError:
            paper = json.loads(line)

        formatted_paper = {}
        formatted_paper['paper_id'] = paper['id']
        formatted_paper['title'] = paper['title']

        # The abstract is stored as an inverted index; rebuild the plain text.
        if 'indexed_abstract' in paper:
            indexed_abstract = paper.pop('indexed_abstract')
            formatted_paper['abstract'] = reconstruct_abstract(indexed_abstract['InvertedIndex'])
        else:
            formatted_paper['abstract'] = ''

        # Field-of-study names, lower-cased.
        formatted_paper['fos'] = [fos['name'].lower() for fos in paper.get('fos', [])]

        formatted_paper['doi'] = paper.pop('doi', '')

        # Outgoing references followed by incoming citations. A new list is
        # built so the record's own `references` list is not extended in place.
        formatted_paper['linked_papers'] = (
            list(paper.get('references', []))
            + paper_cit_dict.get(paper['id'], [])
        )

        bulk.append(formatted_paper)

        # Flush once the pending batch grows past 5000 documents.
        if len(bulk) > 5000:
            helpers.bulk(es, bulk, index=INDEX_NAME)
            bulk = []

    # Send whatever is left after the last full batch.
    if bulk:
        helpers.bulk(es, bulk, index=INDEX_NAME)

HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))




In [54]:
# Refresh the index, then display its document count in JSON form.
es.indices.refresh(index=INDEX_NAME)
es.cat.count(index=INDEX_NAME, params={"format": "json"})

[{'epoch': '1594570479', 'timestamp': '16:14:39', 'count': '4894081'}]