# Reranking with BERT

In [1]:
from pprint import pprint

import torch

from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer, util



In [7]:
# Build a search finction
def search(exclude_sections, query, index = ""):  

    client = Elasticsearch("localhost:9200")
    
    """
    We use a boolean query to exclude irrelevant sections, but you choose another query type
    if you feel like it returns better results or easier to use
    """
    
    es_query = {
                "bool": {
                    "should": { 
                        "match": { "text": query }
                    },
                    "must_not": {
                        "terms" : { "section_title.keyword" : exclude_sections }
                  },
                }
            }


    # Full text search within an ElasticSearch index (''=all indexes) for the indicated text
    
    docs = client.search(index=index, query=es_query, size=50)
    
    # Reshape search results to prepare them for sentence embeddings
    
    texts = []
    section_titles = []
    article_titles = []

    for h in docs['hits']['hits']:
        texts.append(h["_source"]["text"])
        section_titles.append(h["_source"]["section_title"])
        article_titles.append(h["_source"]["article_title"])
    
    return texts, article_titles, section_titles

In [8]:
query = "what is disease X?"
exclude_sections = ["See also", 'Further reading', 'Data and graphs', 'Medical journals', "External links"]
es_results = search(exclude_sections = exclude_sections, 
                    index = "pandemics", 
                    query = query)

In [9]:
es_results

(['Disease X,Candidates,COVID-19 (2019–2020)\nFrom the outset of the COVID-19 pandemic, experts have speculated whether COVID-19 met the criteria to be Disease X. In early February 2020, Chinese virologist Shi Zhengli from the Wuhan Institute of Virology, wrote that the first Disease X is from a coronavirus. Later that month, Marion Koopmans, Head of Viroscience at Erasmus University Medical Center in Rotterdam, and a member of the WHO\'s R&D Blueprint Special Advisory Group, wrote in scientific journal Cell, "Whether it will be contained or not, this outbreak is rapidly becoming the first true pandemic challenge that fits the disease X category". At the same time, Peter Daszak, also a member of the WHO\'s R&D Blueprint, wrote in an opinion piece in the New York Times saying: "In a nutshell, Covid-19 is Disease X".',
  'Disease X,Candidates,Synthetic viruses / bioweapons\nAt the 2018 announcement of the updated shortlist of blueprint priority diseases, the media speculated that a futur

### Sentence embeddings with the large ROBERTA model

In [10]:
# Determine if you can use a GPU
torch.cuda.is_available()

False

In [11]:
def compute_embeddings(query, es_results, model, top_k=10):
    
    texts = es_results[0]
    article_titles = es_results[1]
    seaction_titles = es_results[2]
    
    embedder = SentenceTransformer(model)
    corpus_embeddings = embedder.encode(texts, convert_to_tensor=True) 
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    reranked_results = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]
    
    reranked_results_list = []
    
    for item in reranked_results:
        
        idx = item['corpus_id']
        reranked_results_dict = {
            'bert_score': item['score'],
            'article_title': article_titles[idx],
            'section_title': article_titles[idx],
            'text': texts[idx]
        }
        
        reranked_results_list.append(reranked_results_dict)

    return reranked_results_list

In [16]:
%%time
reranked = compute_embeddings(query, es_results, model = 'roberta-large-nli-stsb-mean-tokens')

CPU times: user 58 s, sys: 7.3 s, total: 1min 5s
Wall time: 1min 4s


In [17]:
pprint(reranked)

[{'article_title': 'Disease X',
  'bert_score': 0.4526536464691162,
  'section_title': 'Disease X',
  'text': 'Disease X,Candidates,COVID-19 (2019–2020)\n'
          'From the outset of the COVID-19 pandemic, experts have speculated '
          'whether COVID-19 met the criteria to be Disease X. In early '
          'February 2020, Chinese virologist Shi Zhengli from the Wuhan '
          'Institute of Virology, wrote that the first Disease X is from a '
          'coronavirus. Later that month, Marion Koopmans, Head of Viroscience '
          'at Erasmus University Medical Center in Rotterdam, and a member of '
          "the WHO's R&D Blueprint Special Advisory Group, wrote in scientific "
          'journal Cell, "Whether it will be contained or not, this outbreak '
          'is rapidly becoming the first true pandemic challenge that fits the '
          'disease X category". At the same time, Peter Daszak, also a member '
          "of the WHO's R&D Blueprint, wrote in an opinion 

### Sentence embeddings with a Distilled BERT model

In [20]:
%%time 
reranked = compute_embeddings(query, es_results, model = 'distilbert-base-nli-stsb-mean-tokens')

CPU times: user 8.75 s, sys: 741 ms, total: 9.49 s
Wall time: 12.5 s


In [15]:
pprint(reranked)

[{'article_title': 'Disease X',
  'bert_score': 0.4438258707523346,
  'section_title': 'Disease X',
  'text': 'Disease X,Summary\n'
          'Disease X is a placeholder name that was adopted by the World '
          'Health Organization (WHO) in February 2018 on their shortlist of '
          'blueprint priority diseases to represent a hypothetical, unknown '
          'pathogen that could cause a future epidemic. The WHO adopted the '
          'placeholder name to ensure that their planning was sufficiently '
          'flexible to adapt to an unknown pathogen (e.g. broader vaccines and '
          'manufacturing facilities).  Director of the US National Institute '
          'of Allergy and Infectious Diseases Anthony Fauci stated that the '
          'concept of Disease X would encourage WHO projects to focus their '
          'research efforts on entire classes of viruses (e.g. flaviviruses), '
          'instead of just individual strains (e.g. zika virus), thus '
          'imp