# Indexing
This notebook indexes the training set using Elasticsearch. Each document has the fields: question, category and type. This allows for searching for similar questions and then extract their category and/or type.  

In [1]:
import elasticsearch
import math
import numpy as np
import os
import pytest
import random
import requests
import tarfile
import utils

from collections import Counter
from collections import defaultdict
from elasticsearch import Elasticsearch

In [2]:
es = Elasticsearch()
es.info()

{'name': 'Karl-PC',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'c2RTr28oQWi6_aB1dQbLmA',
 'version': {'number': '7.9.1',
  'build_flavor': 'default',
  'build_type': 'tar',
  'build_hash': '083627f112ba94dffc1232e8b42b73492789ef91',
  'build_date': '2020-09-01T21:22:21.964974Z',
  'build_snapshot': False,
  'lucene_version': '8.6.2',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [3]:
train = utils.load_dataset('datasets/DBpedia/train.json')

In [4]:
INDEX_NAME = 'questions'

INDEX_SETTINGS = {
    'mappings': {
            'properties': {
                'question': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                },
                'category': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                },
                'type': {
                    'type': 'keyword'
                }
            }
        }
    }

In [5]:
def index_doc(es, doc, doc_id, index='questions'):
    """Indexes the body of a fielded document to be indexed.
    
    Argments:
        es: Elasticsearch object instance.
        doc: Dict with fields as keys and strings as values. Assumes {'title': '...', 'body': '...'}.
        doc_id: Document ID to be used in index. 
        index: Name of index under which documents will be organized.
    """
    es.index(index=index, id=doc_id, body=doc)

Index the training set

In [6]:
if es.indices.exists(INDEX_NAME):
    es.indices.delete(INDEX_NAME)
    
es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS)
count = 0
for x in train:
    if x['category'] != 'resource':
        count += 1
        continue
    doc = {
        'question': x['question'],
        'category': x['category'],
        'type': x['type']
    }

    index_doc(es, doc, x['id'], index=INDEX_NAME)
    count += 1
    print('\rIndexed documents: ' + str(count) + '/' + str(len(train)), end='', flush=True)
print('') # Write newline

Indexed documents: 13146/13146


In [7]:
es.get(index=INDEX_NAME, id='dbpedia_14427')

{'_index': 'questions',
 '_type': '_doc',
 '_id': 'dbpedia_14427',
 '_version': 1,
 '_seq_no': 0,
 '_primary_term': 1,
 'found': True,
 '_source': {'question': 'What is the name of the opera based on Twelfth Night ?',
  'category': 'resource',
  'type': ['dbo:Opera', 'dbo:MusicalWork', 'dbo:Work']}}

In [8]:
q = 'Is cola healthy?'

es.search(index=INDEX_NAME, q=q, _source=True, size=5).get('hits', {}).get('hits', {})

[{'_index': 'questions',
  '_type': '_doc',
  '_id': 'dbpedia_16642',
  '_score': 9.011995,
  '_source': {'question': 'Who manufactures Coca-Cola?',
   'category': 'resource',
   'type': ['dbo:Company', 'dbo:Organisation', 'dbo:Agent']}},
 {'_index': 'questions',
  '_type': '_doc',
  '_id': 'dbpedia_3697',
  '_score': 7.914068,
  '_source': {'question': 'What products does Coca-Cola produce?',
   'category': 'resource',
   'type': ['dbo:Company', 'dbo:Organisation', 'dbo:Agent']}},
 {'_index': 'questions',
  '_type': '_doc',
  '_id': 'dbpedia_11504',
  '_score': 7.914068,
  '_source': {'question': 'Which is the public company for the manufacturing of Coca-Cola?',
   'category': 'resource',
   'type': ['dbo:Company', 'dbo:Organisation', 'dbo:Agent']}},
 {'_index': 'questions',
  '_type': '_doc',
  '_id': 'dbpedia_15758',
  '_score': 7.4596634,
  '_source': {'question': 'What is the patent for the products produced by the Coca-Cola Company?',
   'category': 'resource',
   'type': ['dbo:A