In [1]:
# import dependencies
import json
from pprint import pprint

from elasticsearch import Elasticsearch, helpers

In [2]:
# Initialize the Elasticsearch client.
# The node address is spelled out instead of relying on the client's implicit
# default: elasticsearch-py 8.x refuses to build a client without `hosts`
# (or `cloud_id`), while 7.x silently fell back to this same localhost address.
client = Elasticsearch(hosts=["http://localhost:9200"])

In [3]:
# Query the index called "pandemics"; filter_path trims the response down to
# the _id and _type of each hit.
client.search(
    index='pandemics',
    filter_path=['hits.hits._id', 'hits.hits._type'],
)



{'hits': {'hits': [{'_type': '_doc', '_id': '5i5Bun8BLCaO74XWoA3T'},
   {'_type': '_doc', '_id': '5y5Bun8BLCaO74XWoA3T'},
   {'_type': '_doc', '_id': '6C5Bun8BLCaO74XWoA3T'},
   {'_type': '_doc', '_id': '6S5Bun8BLCaO74XWoA3T'},
   {'_type': '_doc', '_id': '6i5Bun8BLCaO74XWoA3T'},
   {'_type': '_doc', '_id': '6y5Bun8BLCaO74XWoA3T'},
   {'_type': '_doc', '_id': '7C5Bun8BLCaO74XWoA3T'},
   {'_type': '_doc', '_id': '7S5Bun8BLCaO74XWoA3T'},
   {'_type': '_doc', '_id': '7i5Bun8BLCaO74XWoA3T'},
   {'_type': '_doc', '_id': '7y5Bun8BLCaO74XWoA3T'}]}}

### Get 10 articles

In [4]:
# match_all selects every document in the index; no `size` is passed here.
match_all_query = {"match_all": {}}
res = client.search(index="pandemics", query=match_all_query)

# 'total' reports how many documents matched overall, 'hits' holds the ones
# that were actually returned.
hits_section = res['hits']
print("Got %d Hits:" % hits_section['total']['value'])
for hit in hits_section['hits']:
    print("%(article_title)s %(section_title)s" % hit["_source"])

Got 401 Hits:
Pandemic Summary
Pandemic Definition
Pandemic Stages
Pandemic Severity
Pandemic Management
Pandemic HIV/AIDS
Pandemic COVID-19
Pandemic Notable outbreaks
Pandemic Cholera
Pandemic Influenza


### Create a simple (match) query and extract the results from the nested JSON response.

In [5]:
# Full-text search: match the question against the "text" field.
question = "spanish flu"
query = {"match": {"text": question}}
res = client.search(index="pandemics", query=query)

# Pull the returned documents out of the nested response before printing.
matched = res['hits']['hits']
print("Got %d Hits:" % res['hits']['total']['value'])
for hit in matched:
    print("%(article_title)s %(section_title)s" % hit["_source"])

Got 45 Hits:
Pandemic Influenza
Disease X In popular culture
Swine influenza Summary
Pandemic H5N1 (Avian flu)
Pandemic Summary
Pandemic severity index Guidelines
1929–1930 psittacosis pandemic Later
Pandemic Typhus
Swine influenza External links
Swine influenza Humans


In [6]:
# You can also count the documents in a specific index.
# Everything is passed by keyword: handing the index over positionally is
# deprecated in recent elasticsearch-py 7.x releases, and the 8.x client accepts
# neither positional API arguments nor a `params=` dict.
client.cat.count(index='pandemics', format='json')

  client.cat.count(['pandemics'], params={"format": "json"})


[{'epoch': '1648097386', 'timestamp': '04:49:46', 'count': '401'}]

### Match query

In [7]:
# Build the search string and the query body (we are looking for information
# about the Spanish flu).
question = "spanish flu"
match_query = {"match": {"text": question}}

# Send the query to Elasticsearch, asking for up to 15 hits.
docs = client.search(
    index="pandemics",
    query=match_query,
    size=15,
)

#### Inspect the response structure

In [8]:
# Print the type of the response, then display the whole object as the cell's
# result. The output shows a dict whose values are further dicts and lists.
print(type(docs))
docs

<class 'dict'>


{'took': 6,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 45, 'relation': 'eq'},
  'max_score': 9.171614,
  'hits': [{'_index': 'pandemics',
    '_type': '_doc',
    '_id': '7y5Bun8BLCaO74XWoA3T',
    '_score': 9.171614,
    '_ignored': ['text.keyword'],
    '_source': {'section_title': 'Influenza',
     'text': 'Pandemic,Notable outbreaks,Major outbreaks in countries,Influenza\nThe Greek physician Hippocrates, the "Father of Medicine", first described influenza in 412 BC.\nThe first influenza pandemic to be pathologically described occurred in 1510. Since the pandemic of 1580, influenza pandemics have occurred every 10 to 30 years.\nThe 1889–1890 flu pandemic, also known as Russian Flu or Asiatic Flu, was first reported in May 1889 in Bukhara, Uzbekistan. By October, it had reached Tomsk and the Caucasus. It rapidly spread west and hit North America in December 1889, South America in February–April 1890, India i

In [9]:
# The search results live under the 'hits' key.
# 'total' is itself a dict ({'value': ..., 'relation': ...}), so read its
# 'value' entry to print the number of matching documents.
print("Total articles found", docs['hits']['total']['value'])
# The number of hits actually returned can be smaller than the total: it is
# capped by the `size` argument of the search call.
print("Results returned", len(docs['hits']['hits']))
# Look at the first hit; pprint lays out the nested structure more readably
# than a plain print.
pprint(docs['hits']['hits'][0])

Total articles found {'value': 45, 'relation': 'eq'}
Results returned 15
{'_id': '7y5Bun8BLCaO74XWoA3T',
 '_ignored': ['text.keyword'],
 '_index': 'pandemics',
 '_score': 9.171614,
 '_source': {'article_title': 'Pandemic',
             'main_section': 'Notable outbreaks',
             'page_id': 24255,
             'section_number': 12,
             'section_title': 'Influenza',
             'source_url': 'https://en.wikipedia.org/wiki/Pandemic',
             'tags': 'Pandemic,Notable outbreaks,Major outbreaks in '
                     'countries,Influenza',
             'text': 'Pandemic,Notable outbreaks,Major outbreaks in '
                     'countries,Influenza\n'
                     'The Greek physician Hippocrates, the "Father of '
                     'Medicine", first described influenza in 412 BC.\n'
                     'The first influenza pandemic to be pathologically '
                     'described occurred in 1510. Since the pandemic of 1580, '
                     

#### Print out top three results

In [10]:
# Keep only the fields of interest from the first three hits.
# Built as a single expression so that no per-hit temporaries (text, score,
# url, ...) are left behind in the kernel namespace for other cells to pick up
# by accident.
responses = [
    {
        'article_title': hit['_source']['article_title'],
        'section_title': hit['_source']['section_title'],
        'text': hit['_source']['text'],
        'url': hit['_source']['source_url'],
        'score': hit['_score'],
    }
    for hit in docs["hits"]["hits"][:3]
]

pprint(responses)

[{'article_title': 'Pandemic',
  'score': 9.171614,
  'section_title': 'Influenza',
  'text': 'Pandemic,Notable outbreaks,Major outbreaks in countries,Influenza\n'
          'The Greek physician Hippocrates, the "Father of Medicine", first '
          'described influenza in 412 BC.\n'
          'The first influenza pandemic to be pathologically described '
          'occurred in 1510. Since the pandemic of 1580, influenza pandemics '
          'have occurred every 10 to 30 years.\n'
          'The 1889–1890 flu pandemic, also known as Russian Flu or Asiatic '
          'Flu, was first reported in May 1889 in Bukhara, Uzbekistan. By '
          'October, it had reached Tomsk and the Caucasus. It rapidly spread '
          'west and hit North America in December 1889, South America in '
          'February–April 1890, India in February–March 1890, and Australia in '
          'March–April 1890. The H3N8 and H2N2 subtypes of the Influenza A '
          'virus have each been identified as

### Term query

In [11]:
# Term query: look for documents whose "article_title" contains the exact term
# "pandemic". A term query does not analyze the search string.
# NOTE(review): the lowercase term matches titles stored as "Pandemic", which
# suggests article_title is an analyzed text field - confirm against the mapping.
term_question = "pandemic"
term_query = {"term": {"article_title": term_question}}

# Note: use the keyword version of the field if the search string contains
# more than one word, e.g. to find the section titled "Zoonotic viruses":
#
#   term_question = "Zoonotic viruses"
#   term_query = {"term": {"section_title.keyword": term_question}}

# Submit the search query to Elasticsearch.
term_docs = client.search(index="pandemics", query=term_query, size=5)

In [12]:
# Reduce every hit of the term query to the fields of interest.
# A comprehension keeps the per-hit values local instead of leaving them in the
# kernel namespace after the cell has run.
term_responses = [
    {
        'article_title': hit['_source']['article_title'],
        'section_title': hit['_source']['section_title'],
        'text': hit['_source']['text'],
        'url': hit['_source']['source_url'],
        'score': hit['_score'],
    }
    for hit in term_docs["hits"]["hits"]
]
pprint(term_responses)

[{'article_title': 'Pandemic',
  'score': 1.5233563,
  'section_title': 'Summary',
  'text': 'Pandemic,Summary\n'
          'A pandemic (from Greek πᾶν, pan, "all" and δῆμος, demos, "people") '
          'is an epidemic of an infectious disease that has spread across a '
          'large region, for instance multiple continents or worldwide, '
          'affecting a substantial number of people. A widespread endemic '
          'disease with a stable number of infected people is not a pandemic. '
          'Widespread endemic diseases with a stable number of infected people '
          'such as recurrences of seasonal influenza are generally excluded as '
          'they occur simultaneously in large regions of the globe rather than '
          'being spread worldwide.\n'
          'Throughout human history, there have been a number of pandemics of '
          'diseases such as smallpox and tuberculosis. The most fatal pandemic '
          'in recorded history was the Black Death (also

### Boolean query

In [18]:
# Boolean query: match the question against "text" while leaving out every
# section whose title is exactly "External links".
# Suggestion: compare with the results obtained without the boolean query.
bool_question = "world health organization"
exclude_sections = "External links"

text_clause = {"match": {"text": bool_question}}
excluded_clause = {"term": {"section_title.keyword": exclude_sections}}
bool_query = {
    "bool": {
        "should": text_clause,
        "must_not": excluded_clause,
    }
}

# Submit the search query to Elasticsearch.
bool_docs = client.search(index="pandemics", query=bool_query, size=5)

In [14]:
# Reduce every hit of the boolean query to the fields of interest.
# A comprehension keeps the per-hit values local instead of leaving them in the
# kernel namespace after the cell has run.
bool_responses = [
    {
        'article_title': hit['_source']['article_title'],
        'section_title': hit['_source']['section_title'],
        'text': hit['_source']['text'],
        'url': hit['_source']['source_url'],
        'score': hit['_score'],
    }
    for hit in bool_docs["hits"]["hits"]
]
pprint(bool_responses)

[{'article_title': 'Johns Hopkins Center for Health Security',
  'score': 7.7753215,
  'section_title': 'Summary',
  'text': 'Johns Hopkins Center for Health Security,Summary\n'
          'The Johns Hopkins Center for Health Security (abbreviated CHS; '
          'previously the UPMC Center for Health Security, the Center for '
          'Biosecurity of UPMC, and the Johns Hopkins Center for Civilian '
          'Biodefense Strategies) is an independent, nonprofit organization of '
          'the Johns Hopkins Bloomberg School of Public Health, and part of '
          'the Environmental Health and Engineering department. It is '
          'concerned with the areas of health consequences from epidemics and '
          'disasters as well as averting biological weapons development, and '
          'implications of biosecurity for the bioeconomy. It is a think tank '
          'that does policy research and gives policy recommendations to the '
          'United States government as well a

### Field boosting

In [19]:
# Field boosting: the "^3" suffix boosts both title fields by a factor of 3.
# Suggestion: compare with the results of the same query without boosting.
boost_question = "covid-19 pandemic"
boosted_fields = ["article_title^3", "section_title^3"]
boost_query = {
    "multi_match": {
        "query": boost_question,
        "fields": boosted_fields,
    }
}

# Submit the search query to Elasticsearch.
boost_docs = client.search(index="pandemics", query=boost_query, size=5)

In [16]:
# Reduce every hit of the boosted query to the fields of interest.
# A comprehension keeps the per-hit values local instead of leaving them in the
# kernel namespace after the cell has run.
boost_responses = [
    {
        'article_title': hit['_source']['article_title'],
        'section_title': hit['_source']['section_title'],
        'text': hit['_source']['text'],
        'url': hit['_source']['source_url'],
        'score': hit['_score'],
    }
    for hit in boost_docs["hits"]["hits"]
]
# Notice how relevance scores for "article_title" and "section_title" fields are three times higher than before
pprint(boost_responses)

[{'article_title': 'Pandemic',
  'score': 27.540213,
  'section_title': 'COVID-19',
  'text': 'Pandemic,Current pandemics,COVID-19\n'
          'A new strain of coronavirus was first identified in the city of '
          'Wuhan, Hubei province, China, in late December 2019. It has caused '
          'a cluster of cases of an acute respiratory disease, which is '
          'referred to as coronavirus disease 2019 (COVID-19). According to '
          'media reports, more than 200 countries and territories have been '
          'affected by COVID-19, with major outbreaks occurring in Brazil, '
          'Russia, India, Mexico, Peru, South Africa, Western Europe and the '
          'United States. On 11 March 2020, the World Health Organization '
          'characterized the spread of COVID-19 as a pandemic. As of 16 '
          'November 2020, the number of people infected with COVID-19 has '
          'reached 54,978,057 worldwide, of whom 38,243,617 have recovered. '
          'The deat

### Highlights

In [22]:
# Build the search string and the query body (this time we are looking for
# information about genetic mutations).
hl_question = "genetic mutations"
hl_query = {"match": {"text": hl_question}}

# Highlight settings for the "text" field: one fragment per hit, with
# fragment_size set to 256.
hl_highlight = {
    "fields": {
        "text": {"number_of_fragments": 1, 'fragment_size': 256},
    }
}

# Submit the search query, together with the highlight settings, to Elasticsearch.
hl_docs = client.search(
    index="pandemics",
    query=hl_query,
    highlight=hl_highlight,
    size=15,
)

In [23]:
# Reduce every hit to the fields of interest plus its highlighted fragment(s).
hl_responses = [
    {
        'article_title': hit['_source']['article_title'],
        'section_title': hit['_source']['section_title'],
        'text': hit['_source']['text'],
        # Fall back to an empty list rather than raising KeyError should a hit
        # come back without a 'highlight' entry for the "text" field.
        'highlight': hit.get('highlight', {}).get('text', []),
        'url': hit['_source']['source_url'],
        'score': hit['_score'],
    }
    for hit in hl_docs["hits"]["hits"]
]
# Notice how each 'highlight' entry wraps the matched terms in <em> tags.
pprint(hl_responses)

[{'article_title': 'Virus',
  'highlight': ['Virus,Microbiology,<em>Genetic</em> mutation\n'
                'Viruses undergo <em>genetic</em> change by several '
                'mechanisms. These include a process called antigenic drift '
                'where individual bases in the DNA or RNA mutate to other '
                'bases.'],
  'score': 8.579811,
  'section_title': 'Genetic mutation',
  'text': 'Virus,Microbiology,Genetic mutation\n'
          'Viruses undergo genetic change by several mechanisms. These include '
          'a process called antigenic drift where individual bases in the DNA '
          'or RNA mutate to other bases. Most of these point mutations are '
          '"silent"—they do not change the protein that the gene encodes—but '
          'others can confer evolutionary advantages such as resistance to '
          'antiviral drugs. Antigenic shift occurs when there is a major '
          'change in the genome of the virus. This can be a result of '
     