In [1]:
import logging
from elasticsearch import Elasticsearch, helpers
import pandas as pd

es = Elasticsearch(
    hosts=[{"hosts":"99:44.95.175", "port":9200}], 
    request_timeout=12000)
index_name = "covid-kaggle"
doc_type = "pdf_json"

index_name_80000 = "covid-kaggle_limit80000"
doc_type_80000 = "large_json"

## Test query ES

In [2]:
## Query to get all ids
query = { 
    'size' : 10000,
    "query" : { 
        "match_all" : {} 
    },
    "stored_fields": []
}
a=helpers.scan(es,query=query,scroll='1m',index=index_name)#like others so far
all_es_ids=[aa['_id'] for aa in a]
print("There are %i of jsons in index 1"%(len(all_es_ids)))

a=helpers.scan(es,query=query,scroll='1m',index=index_name_80000)#like others so far
all_es_ids=[aa['_id'] for aa in a]
print("There are %i of jsons in index 2"%(len(all_es_ids)))

There are 39706 of jsons in index 1
There are 12917 of jsons in index 2


In [85]:
def es_extract(hits):
    scores = []
    paper_ids = []
    publish_years = []
    titles = []
    abstracts = []
    sources = []
    for hit in hits:
        scores.append(hit["_score"])
        paper_ids.append(hit["_source"]["paper_id"])
        publish_years.append(hit["_source"]["publish_year"])
        titles.append(hit["_source"]["metadata"]["title"])
        if "abstract" in hit["_source"]:
            abstracts.append(" ".join([tem_["text"] for tem_ in hit["_source"]["abstract"]]))
        else:
            abstracts.append([])
        sources.append(hit["_source"])

    return scores, paper_ids, publish_years, titles, abstracts, sources 

def es_equery(es, query, index='covid-kaggle', doc_type="pdf_json"):
    '''
    accept : es connection and query doc
    return : dataframe with paper_id, match_scores, publish_year, titles, abstracts, json_contents
    '''
    res3 = es.search(index=index, doc_type=doc_type, body=query)
    print("There are %i of papers returned"%(res3["hits"]["total"]["value"]))
    
    scores, paper_ids, publish_years, titles, abstracts, sources  = es_extract(res3["hits"]["hits"])
    df_return = pd.DataFrame.from_dict({
        "paper_id": paper_ids, 
        "match_scores": scores, 
        "publish_years": publish_years, 
        "title": titles, 
        "abstract": abstracts, 
        "json_obj": sources
    })
    return df_return

In [139]:
def query_key_words_phrases(keywords, keyphrases, task="risk factor"):
    
    multi_match = []
    
    ## task 
    multi_match.append(
     {"multi_match": {
            "query": task, 
            "type": "phrase", 
            "fields": ["metadata.title", "abstract.text", "body_text.text"]}},  
    )
    
    ## keywords
    
    if len(keywords) > 0:
        for keyword in keywords:
            multi_match.append(
             {"multi_match": {
                "query": keyword, 
                "type": "cross_fields", 
                "fields": ["metadata.title", "abstract.text", "body_text.text"]}},  
            )
    
    if len(keyphrases)> 0:
        for keyphrase in keyphrases:
            multi_match.append(
             {"multi_match": {
                "query": keyphrase, 
                "type": "phrase", 
                "fields": ["body_text.text", "abstract.text"]}},  
            )

        
    covid_topic_match = []
    
    covid_key_words = ["coronavirus", 
                       "covid-19", 
                       "2019-nCoV", 
                       "novel CoV", 
                       "SARS Coronavirus 2", 
                       "SARS-CoV-2"
                      ]
    for covid_key_word in covid_key_words:
        covid_topic_match.append(
        {"match": {"metadata.title":covid_key_word}}
        )
    
    query = {
            'query': {
                "bool":{
                    "must":
                       [
                         multi_match
                        ], 

                    "should":  ## For Convid-19 big topic
                    [  
                      covid_topic_match
                    ], 
                    "minimum_should_match" : 1,
                }
            }
       }
    return query

# doc = query_key_words_phrases(keywords=["smoking~", "smoke~"], keyphrases=["risk factor"])
# smoke_df = es_equery(es, doc, index=",".join([index_name, index_name_80000]), 
#                      doc_type=",".join([doc_type, doc_type_80000]), 
#                     )
# smoke_df

In [140]:
doc = query_key_words_phrases(keywords=["smoking~2"], keyphrases=["pulmonary disease"])
q1_df = es_equery(es, doc, index=",".join([index_name, index_name_80000]), 
                     doc_type=",".join([doc_type, doc_type_80000]), 
                    )
q1_df

There are 47 of papers returned


Unnamed: 0,paper_id,match_scores,publish_years,title,abstract,json_obj
0,e9457327ddb51cf3ceb3698660ff08f526a09d44,25.329872,2020,Analysis of factors associated with disease ou...,"Background: Since early December 2019, the 201...",{'paper_id': 'e9457327ddb51cf3ceb3698660ff08f5...
1,48656efc59191537073975938f25f201524971af,25.14307,2020,Neutrophil-to-Lymphocyte Ratio Predicts Severe...,Background: Severe ill patients with 2019 nove...,{'paper_id': '48656efc59191537073975938f25f201...
2,3d77851b29b6aae8f448825a52262e6792d2dbf8,22.949642,2014,Association between Serum Angiotensin-converti...,"After adjustment for age, sex, body mass index...",{'paper_id': '3d77851b29b6aae8f448825a52262e67...
3,PMC7086939,22.89011,-1,Middle East respiratory syndrome coronavirus (...,[],"{'paper_id': 'PMC7086939', 'metadata': {'title..."
4,PMC6466079,21.20957,-1,Host Determinants of MERS-CoV Transmission and...,[],"{'paper_id': 'PMC6466079', 'metadata': {'title..."
5,PMC4689825,20.4738,-1,Renal Complications and Their Prognosis in Kor...,[],"{'paper_id': 'PMC4689825', 'metadata': {'title..."
6,114ed64f52f503d9d2e2ba1fd1ee62b0a168cd84,18.781258,2020,Acute Myocardial Injury of Patients with Coron...,"China, respiratory manifestations of the disea...",{'paper_id': '114ed64f52f503d9d2e2ba1fd1ee62b0...
7,aafa6cdfe96a5cdaf7b7c2f04b11a5dbdd73b2df,18.61014,2020,"Incidence, clinical characteristics and progno...","Background: Recently, Coronavirus Disease 2019...",{'paper_id': 'aafa6cdfe96a5cdaf7b7c2f04b11a5db...
8,6291f43a3acfa5d46254f23d117a8aa252f78bb9,18.106121,2020,History of Coronary Heart Disease Increases th...,China has experienced an outbreak of a novel h...,{'paper_id': '6291f43a3acfa5d46254f23d117a8aa2...
9,c646e655066d0982c151b516ebdf0334d037ffcf,17.708097,2014,Middle East respiratory syndrome coronavirus (...,"In 2012, a novel human coronavirus emerged and...",{'paper_id': 'c646e655066d0982c151b516ebdf0334...


In [141]:
q1_df.iloc[3]["title"]

'Middle East respiratory syndrome coronavirus (MERS-CoV): evidence and speculations'

In [142]:
q1_df.iloc[3]["json_obj"]

{'paper_id': 'PMC7086939',
 'metadata': {'title': 'Middle East respiratory syndrome coronavirus (MERS-CoV): evidence and speculations',
  'authors': [{'first': 'Ahmed',
    'middle': ['S.'],
    'last': 'Abdel-Moneim',
    'suffix': '',
    'email': 'asa@tu.edu.sa',
    'affiliation': {}}]},
 'body_text': [{'text': 'Coronaviruses (CoV) are able to infect humans, birds and many animal species [68]. In humans, coronavirus infections result mostly in mild respiratory, enteric and neurological diseases [65, 66]. In June 2012, a novel coronavirus was detected in a Saudi patient who had experienced pneumonia and renal failure. Virus genome sequencing demonstrated that the virus belonged to lineage C of the genus Betacoronavirus and was phylogenetically related to the bat coronaviruses HKU4 and HKU5, which had previously been found in lesser bamboo bats and Japanese Pipistrelle bats in Hong Kong.',
   'cite_spans': [{'start': 78,
     'end': 80,
     'mention': '68',
     'ref_id': 'BIBREF64'