In [17]:
import requests
import re
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from json import JSONDecodeError
from collections import defaultdict
import json
import numpy as np

from parse_dbpedia import *

In [2]:
# Create the Elasticsearch client; no arguments are given, so the client's
# default connection settings are used.
es = Elasticsearch()
# Show the cluster information returned by the server (name, version, ...).
es.info()

{'name': 'BIEBER',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'nN32_N-8T1OLuu3AJOSr2g',
 'version': {'number': '7.9.1',
  'build_flavor': 'default',
  'build_type': 'zip',
  'build_hash': '083627f112ba94dffc1232e8b42b73492789ef91',
  'build_date': '2020-09-01T21:22:21.964974Z',
  'build_snapshot': False,
  'lucene_version': '8.6.2',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [15]:
def analyze_query(es, index, query, field='body'):
    """Tokenizes a query and drops the terms that match no indexed document.

    Arguments:
        es: Elasticsearch object instance.
        index: Name of the index used for the analysis and for the term lookups.
        query: String of query terms.
        field: The field in which a term has to match at least one document.

    Returns:
        A list of analyzed query terms, in token-position order, each of which
        matches at least one document of the index in the given field.
    """
    # NOTE(review): the analyze request names neither a field nor an analyzer,
    # so presumably the index's default analyzer is applied rather than the
    # one configured for `field` - confirm this is intended.
    analysis = es.indices.analyze(index=index, body={'text': query})
    ordered_tokens = sorted(analysis['tokens'], key=lambda tok: tok['position'])

    def matches_some_document(term):
        # A single-hit match query without the document source is enough to
        # find out whether any document contains the term.
        response = es.search(index=index,
                             body={'query': {'match': {field: term}}},
                             _source=False,
                             size=1)
        found = response.get('hits', {}).get('hits', {})
        return len(found) > 0 and found[0]['_id'] is not None

    return [tok['token'] for tok in ordered_tokens
            if matches_some_document(tok['token'])]

def baseline_retrieval(es, index_name, query, k=100):
    """Performs baseline retrieval on the index and returns the hits' types.

    Arguments:
        es: Elasticsearch object instance.
        index_name: Name of the index to search.
        query: A string of text, space separated terms.
        k: Maximum number of hits to retrieve.

    Returns:
        A list of (score, types) tuples, up to k of them, in the order in which
        Elasticsearch returns the hits. `score` is the hit's `_score` and
        `types` is the value of the hit's stored `types` field.
    """
    # Only the `types` field of each hit is used, so ask Elasticsearch to
    # return just that field instead of the complete document source.
    response = es.search(index=index_name, q=query, _source=['types'], size=k)
    return [(hit['_score'], hit['_source']['types'])
            for hit in response['hits']['hits']]

In [4]:
# Name of the Elasticsearch index that this notebook creates and queries.
INDEX_NAME = 'entity_centric_bm25_long_abstract'
# Index definition that is passed as the body of the create-index request.
INDEX_SETTINGS = {
    'mappings': {
        'properties': {
            # Free-text content of a document, analyzed with the built-in
            # English analyzer; term vectors are stored.
            'body': {
                'type': 'text',
                'term_vector': 'yes',
                'analyzer': 'english'
            },
            # Space-separated type names of a document, configured exactly
            # like `body`.
            'types': {
                'type': 'text',
                'term_vector': 'yes',
                'analyzer': 'english'
            },
        }
    }
}

In [5]:
def reset_index(es, index, settings=None):
    """Deletes the index if it exists and creates it again, empty.

    Arguments:
        es: Elasticsearch object instance.
        index: Name of the index to delete and re-create.
        settings: Optional body for the create-index request. Defaults to the
            notebook-level INDEX_SETTINGS.
    """
    if settings is None:
        settings = INDEX_SETTINGS

    # Operate on the index that was asked for, not on a notebook global.
    if es.indices.exists(index=index):
        es.indices.delete(index=index)

    es.indices.create(index=index, body=settings)
    
def get_data(index, doc):
    """Generates one bulk-indexing action per document and prints progress.

    Arguments:
        index: Name of the index the actions are addressed to.
        doc: Dict that maps a document ID to the source of that document.

    Yields:
        A dict with the keys `_index`, `_id` and `_source` for every entry of
        `doc`, in the iteration order of `doc`.
    """
    total = len(doc)
    next_mark = 0  # Lowest progress decile (in percent) not yet reported.
    for done, (doc_id, body) in enumerate(doc.items(), start=1):
        yield {'_index': index, '_id': doc_id, '_source': body}
        # Round the share of handed-out documents down to a multiple of 10.
        # `total` is non-zero whenever the loop body runs, so the division is
        # safe even for collections with fewer than ten documents.
        decile = (done * 100 // total) // 10 * 10
        if decile >= next_mark:
            print('{}% done'.format(decile))
            next_mark = decile + 10

In [6]:
# Load the type ontology, the per-entity documents and the additional
# per-entity text with the helpers imported from parse_dbpedia.
ontology = get_ontology_tree()
doc_entity = get_entity_docs()
entity_data = get_entity_data()

# Enrich every entity document in place:
#   * `types` becomes the de-duplicated union of what get_ancestors returns
#     for each of the entity's space-separated types;
#   * `body` gets the entity's additional text appended (nothing but a
#     separating space is added when there is none).
for entity, body in doc_entity.items():
    ancestors = []
    for entity_type in body['types'].split():
        ancestors.extend(get_ancestors(ontology, entity_type))
    # Sort the unique types: the iteration order of a set of strings changes
    # between interpreter sessions, which would make the indexed text (and
    # anything that depends on the order of the types) irreproducible.
    body['types'] = ' '.join(sorted(set(ancestors)))
    body['body'] += ' ' + entity_data.get(entity, '')

Num entities with types:  4767652
Num entities with data:  4935279


In [7]:
# Spot-check the enriched document of a single entity.
doc_entity['http://dbpedia.org/resource/Library_of_Alexandria']

{'body': 'Library of Alexandria The Royal Library of Alexandria or Ancient Library of Alexandria in Alexandria, Egypt, was one of the largest and most significant libraries of the ancient world. It was dedicated to the Muses, the nine goddesses of the arts. It flourished under the patronage of the Ptolemaic dynasty and functioned as a major center of scholarship from its construction in the 3rd century BC until the Roman conquest of Egypt in 30 BC, with collections of works, lecture halls, meeting rooms, and gardens. The library was part of a larger research institution called the Musaeum of Alexandria, where many of the most famous thinkers of the ancient world studied. The library was created by Ptolemy I Soter, who was a Macedonian general and the successor of Alexander the Great. Most of the books were kept as papyrus scrolls. It is unknown precisely how many such scrolls were housed at any given time, but estimates range from 40,000 to 400,000 at its height. Arguably, this library

In [9]:
# Start from an empty index, then stream every entity document into it with
# the multi-threaded bulk helper and report the documents that were rejected.
reset_index(es, INDEX_NAME)
for success, info in parallel_bulk(
        es,
        get_data(INDEX_NAME, doc_entity),
        thread_count=12,
        chunk_size=5000,
        queue_size=6):
    if success:
        continue
    print('A document failed:', info)

0% done
1% done
2% done
3% done
4% done
5% done
6% done
7% done
8% done
9% done
10% done


In [45]:
# Number of documents in the index as reported by the cat count API
# (the value comes back as a string).
es.cat.count(INDEX_NAME, params={"format": "json"})[0]['count']

'4767652'

In [50]:
from smart_dataset.evaluation.dbpedia.evaluate import load_type_hierarchy, evaluate, get_type_path

# Read the training questions. The encoding is given explicitly because the
# default of open() depends on the platform, while JSON text is UTF-8.
with open('./data/train_set', 'r', encoding='utf-8') as f:
    train_set = json.load(f)

# Type hierarchy (and its maximum depth) that the evaluation script uses.
type_hierarchy, max_depth = load_type_hierarchy('./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv')

Loading type hierarchy from ./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv... 760 types loaded (max depth: 7)


In [None]:
# Predict one type path per 'resource' question of the training set and
# collect the predictions together with the matching ground truth.
k = 100  # Number of top-ranked documents whose types are aggregated.
system_output = {}
ground_truth = {}
for item in train_set:
    # Questions of any other category are not handled here.
    if item['category'] != 'resource':
        continue
    if not item['question']:
        print('Missing question:\n{}\n'.format(item))
        continue

    query = analyze_query(es, INDEX_NAME, item['question'])

    # Every retrieved document adds its retrieval score to each of its types.
    scores = defaultdict(int)
    for score, types in baseline_retrieval(es, INDEX_NAME, ' '.join(query), k):
        for t in types.split():
            scores['dbo:' + t] += score

    # Only types that are part of the evaluation hierarchy may be predicted.
    # Filtering before taking the maximum guarantees that a type outside the
    # hierarchy is never selected (and never handed to get_type_path) when
    # none of the scored types is known; in that case nothing is predicted.
    candidates = {t: s for t, s in scores.items() if t in type_hierarchy}
    top_res = max(candidates, key=candidates.get) if candidates else None

    ground_truth_type = [t for t in item['type'] if t in type_hierarchy]

    system_output[item['id']] = {
        'category': 'resource',
        'type': list(get_type_path(top_res, type_hierarchy)) if top_res else []
    }
    ground_truth[item['id']] = {
        'category': item['category'],
        'type': ground_truth_type
    }

Missing question:
{'id': 'dbpedia_9619', 'question': None, 'category': 'resource', 'type': ['dbo:Person', 'dbo:Agent']}

Missing question:
{'id': 'dbpedia_7262', 'question': None, 'category': 'resource', 'type': ['dbo:Single', 'dbo:MusicalWork', 'dbo:Work']}

Missing question:
{'id': 'dbpedia_3102', 'question': None, 'category': 'resource', 'type': ['dbo:Award']}

Missing question:
{'id': 'dbpedia_10264', 'question': None, 'category': 'resource', 'type': ['dbo:Media']}

Missing question:
{'id': 'dbpedia_12182', 'question': None, 'category': 'resource', 'type': ['dbo:Person', 'dbo:Agent']}

Missing question:
{'id': 'dbpedia_3072', 'question': None, 'category': 'resource', 'type': ['dbo:Village', 'dbo:Settlement', 'dbo:PopulatedPlace', 'dbo:Place', 'dbo:Location']}

Missing question:
{'id': 'dbpedia_16491', 'question': None, 'category': 'resource', 'type': ['dbo:City', 'dbo:Settlement', 'dbo:PopulatedPlace', 'dbo:Place', 'dbo:Location']}

Missing question:
{'id': 'dbpedia_17378', 'questi

In [None]:
# Score the collected predictions against the ground truth with the
# evaluation function imported from the smart_dataset package.
evaluate(system_output, ground_truth, type_hierarchy, max_depth)