In [36]:
import logging
from elasticsearch import Elasticsearch, helpers
import pandas as pd

es = Elasticsearch(
    hosts=[{"hosts":"99:44.95.175", "port":9200}], 
    request_timeout=12000)
index_name = "covid-kaggle"
doc_type = "pdf_json"

## Test query ES

In [15]:
## Query to get all ids
query = { 
    'size' : 10000,
    "query" : { 
        "match_all" : {} 
    },
    "stored_fields": []
}
a=helpers.scan(es,query=query,scroll='1m',index='covid-kaggle')#like others so far
all_es_ids=[aa['_id'] for aa in a]
print("There are %i of ids on the elasticsearch server"%(len(all_es_ids)))

There are 29237 of ids on the elasticsearch server


In [41]:
def es_extract(hits):
    scores = []
    paper_ids = []
    publish_years = []
    titles = []
    abstracts = []
    sources = []
    for hit in hits:
        scores.append(hit["_score"])
        paper_ids.append(hit["_source"]["paper_id"])
        publish_years.append(hit["_source"]["publish_year"])
        titles.append(hit["_source"]["metadata"]["title"])
        abstracts.append(" ".join([tem_["text"] for tem_ in hit["_source"]["abstract"]]))
        sources.append(hit["_source"])

    return scores, paper_ids, publish_years, titles, abstracts, sources 

def es_equery(es, query, index='covid-kaggle', doc_type="pdf_json"):
    '''
    accept : es connection and query doc
    return : dataframe with paper_id, match_scores, publish_year, titles, abstracts, json_contents
    '''
    res3 = es.search(index=index, doc_type=doc_type, body=query)
    print("There are %i of papers returned"%(res3["hits"]["total"]["value"]))
    
    scores, paper_ids, publish_years, titles, abstracts, sources  = es_extract(res3["hits"]["hits"])
    df_return = pd.DataFrame.from_dict({
        "paper_id": paper_ids, 
        "match_scores": scores, 
        "publish_years": publish_years, 
        "title": titles, 
        "abstract": abstracts, 
        "json_obj": sources
    })
    return df_return

In [69]:
## smoking
doc = {
        'query': {
            "bool":{
                "must":
                   [{"match": {"abstract.text": "risk"}},  
                    {"match":{"body_text.text":"covid-19"}},
                    {"match": {"body_text.text": "risk"}},
                    {"match": {"body_text.text":"smoking"}}, 
#                     {"match": {"abstract.text":"smok\w{1,4}"}},
                    ], 
                "should":
                [  {"match": {"abstract.text":"corona\w{1, 3}virus"}}, 
                   {"match": {"abstract.text":"covid-19"}}, 
                   {"match": {"metadata.title":"corona\w{1, 3}virus"}}, 
                   {"match": {"metadata.title":"covid-19"}}, 
                ], 
                "minimum_should_match" : 1,
            }
        }
   }
smoke_df = es_equery(es, doc)
smoke_df

There are 120 of papers returned


Unnamed: 0,paper_id,match_scores,publish_years,title,abstract,json_obj
0,e9457327ddb51cf3ceb3698660ff08f526a09d44,28.709036,2020,Analysis of factors associated with disease ou...,"Background: Since early December 2019, the 201...",{'paper_id': 'e9457327ddb51cf3ceb3698660ff08f5...
1,875b7c463f00772fa0dc18ada678bc1ff16a4274,23.984732,2020,performed data analysis,Objective: To evaluate the spectrum of comorbi...,{'paper_id': '875b7c463f00772fa0dc18ada678bc1f...
2,48106886ec5b19e6cc62abf552ff3529f1d8aca3,21.233038,2017,The presence of fever in adults with influenza...,We compared the rates of fever in adult subjec...,{'paper_id': '48106886ec5b19e6cc62abf552ff3529...
3,86a11741a321a657e22fd7d94af6193d27cc6f5f,18.933857,2018,Clinical characteristics of patients with labo...,A novel pandemic influenza A(H1N1)pdm09 virus ...,{'paper_id': '86a11741a321a657e22fd7d94af6193d...
4,79c4b114ab9f9e8070911e38d090446dc700453f,18.684471,2011,Abbreviations: DALY 5 disability-adjusted life...,"publicly funded repositories, such as the WHO ...",{'paper_id': '79c4b114ab9f9e8070911e38d090446d...
5,7574e8c8c1e0e6df24e6ef15177e2eff23262511,18.66302,2014,,"publicly funded repositories, such as the WHO ...",{'paper_id': '7574e8c8c1e0e6df24e6ef15177e2eff...
6,422ba1058ef4ab555ac1a874029c600f71d80520,18.198818,2013,A Prospective Study of Respiratory Viral Infec...,"publicly funded repositories, such as the WHO ...",{'paper_id': '422ba1058ef4ab555ac1a874029c600f...
7,b31d9e4a450071584fa87e054414be87264def22,17.703465,2014,Vitamin D 3 and gargling for the prevention of...,"Background: We undertook a 2X2 factorial, rand...",{'paper_id': 'b31d9e4a450071584fa87e054414be87...
8,fef233c816bbfae6c4468f858654c5af42237fc0,17.53978,2008,Preterm Birth 1 Epidemiology and causes of pre...,This paper is the fi rst in a three-part serie...,{'paper_id': 'fef233c816bbfae6c4468f858654c5af...
9,0c5c7c0eda4954c87bac8bc496087b963d006796,17.365715,2014,Gut microbiome and the risk factors in central...,Humans are colonized after birth by microbial ...,{'paper_id': '0c5c7c0eda4954c87bac8bc496087b96...


In [72]:
## Drinking
doc = {
        'query': {
            "bool":{
                "must":
                   [{"match": {"abstract.text": "risk"}},  
                    {"match":{"body_text.text":"covid-19"}},
                    {"match": {"body_text.text": "risk"}},
                    {"match": {"body_text.text":"drinking"}}, 
#                     {"match": {"abstract.text":"smok\w{1,4}"}},
                    ], 
                "should":
                [  {"match": {"abstract.text":"corona\w{1, 3}virus"}}, 
                   {"match": {"abstract.text":"covid-19"}}, 
                   {"match": {"metadata.title":"corona\w{1, 3}virus"}}, 
                   {"match": {"metadata.title":"covid-19"}}, 
                ], 
                "minimum_should_match" : 1,
            }
        }
   }
smoke_df = es_equery(es, doc)
smoke_df

There are 48 of papers returned


Unnamed: 0,paper_id,match_scores,publish_years,title,abstract,json_obj
0,54cfc3c68e1e4832fee5b3294e5673e37978ebdf,44.05081,2020,Risk factors related to hepatic injury in pati...,Corona virus disease 2019 has rapidly become t...,{'paper_id': '54cfc3c68e1e4832fee5b3294e5673e3...
1,d729ec4af90b62b3fd52c5e79c868955213a70f2,17.899366,2016,Detection of hepatitis E virus and other lives...,Manure application is a source of pathogens to...,{'paper_id': 'd729ec4af90b62b3fd52c5e79c868955...
2,d1fd1fafa75ee05827d3e7e916a03ddf159317ea,17.254992,2017,Evaluation of biosecurity measures to prevent ...,Background: The effectiveness of biosecurity m...,{'paper_id': 'd1fd1fafa75ee05827d3e7e916a03ddf...
3,90f666e3bf3eb357e25515c3fb72c97a80b5a2d6,17.246162,2011,Surveillance of adenoviruses and noroviruses i...,Noroviruses Bathing water River water Sea wate...,{'paper_id': '90f666e3bf3eb357e25515c3fb72c97a...
4,0c6668002dc61159a3e4456adc22d7ab38c7e8e6,17.193268,2019,Biosafety and data quality considerations for ...,Animal models are crucial for the study of sev...,{'paper_id': '0c6668002dc61159a3e4456adc22d7ab...
5,b01edd6c71207f4ab7f514a7e39267ee2eb8d029,16.507553,2010,,To identify environmental sites commonly conta...,{'paper_id': 'b01edd6c71207f4ab7f514a7e39267ee...
6,5eb89ef1b9b8c83821dc226c368a9cf07e663614,15.904154,2002,Foodborne viruses 1,Foodborne and waterborne viral infections are ...,{'paper_id': '5eb89ef1b9b8c83821dc226c368a9cf0...
7,79c4b114ab9f9e8070911e38d090446dc700453f,15.36631,2011,Abbreviations: DALY 5 disability-adjusted life...,"publicly funded repositories, such as the WHO ...",{'paper_id': '79c4b114ab9f9e8070911e38d090446d...
8,691856452de91727b7b9b7644f82b4bc5876f32f,13.942347,2016,Article 88 1 Original research,Nearly 4 years after the first report of the e...,{'paper_id': '691856452de91727b7b9b7644f82b4bc...
9,ec2e48e04cf44079c26155ee6d4ba37e2c404c52,13.124254,2019,A time-trend ecological study for identifying ...,Background: Flood-related damage can be very s...,{'paper_id': 'ec2e48e04cf44079c26155ee6d4ba37e...


In [73]:
## match_phrase
doc = {
        'query': {
            "bool":{
                "must":
                   [{"match": {"abstract.text": "risk"}},  
                    {"match":{"body_text.text":"covid-19"}},
                    {"match": {"body_text.text": "risk"}},
                    {"match_phrase": {"body_text.text":"seafood market"}}, 
#                     {"match": {"abstract.text":"smok\w{1,4}"}},
                    ], 
                "should":
                [  {"match": {"abstract.text":"corona\w{1, 3}virus"}}, 
                   {"match": {"abstract.text":"covid-19"}}, 
                   {"match": {"metadata.title":"corona\w{1, 3}virus"}}, 
                   {"match": {"metadata.title":"covid-19"}}, 
                ], 
                "minimum_should_match" : 1,
            }
        }
   }
smoke_df = es_equery(es, doc)
smoke_df

There are 14 of papers returned


Unnamed: 0,paper_id,match_scores,publish_years,title,abstract,json_obj
0,606233835c3d6d195b7d230745ccb0fded626aa7,47.504387,2020,Distribution of the COVID-19 Epidemic and Corr...,The ongoing new coronavirus pneumonia (Corona ...,{'paper_id': '606233835c3d6d195b7d230745ccb0fd...
1,8330d31a5ec8b240991ed6d5a0dcbaf967f7dbd0,43.63439,2020,Clinical course and risk factors for mortality...,"Background Since December, 2019, Wuhan, China,...",{'paper_id': '8330d31a5ec8b240991ed6d5a0dcbaf9...
2,49ac69f362c27acbc6de0c5cbb640267e7a1e797,42.260384,2020,Clinical features and outcomes of 221 patients...,Pan and ZY Peng are cocorresponding authors. A...,{'paper_id': '49ac69f362c27acbc6de0c5cbb640267...
3,8ab598acacefc034a59ae01580fdf29bcf853234,35.170048,2020,Journal Pre-proof The SARS-CoV-2 outbreak: wha...,Highlights  the latest summary of the COVID-1...,{'paper_id': '8ab598acacefc034a59ae01580fdf29b...
4,e9457327ddb51cf3ceb3698660ff08f526a09d44,34.925125,2020,Analysis of factors associated with disease ou...,"Background: Since early December 2019, the 201...",{'paper_id': 'e9457327ddb51cf3ceb3698660ff08f5...
5,2ba7dfd6227e7f64d80204770ccbe140afa78117,31.341694,2020,RBD mutations from circulating SARS-CoV-2 stra...,26 A novel zoonotic coronavirus SARS-CoV-2 is ...,{'paper_id': '2ba7dfd6227e7f64d80204770ccbe140...
6,111b9a6e91c938696fcdb4cb128b8ae739dbe11c,30.51995,2020,Clinical features and sexual transmission pote...,"Background: As of March 2, 2020, SARS-CoV-2 ha...",{'paper_id': '111b9a6e91c938696fcdb4cb128b8ae7...
7,1766a9d5945897581c80f83276cc723c77e3798d,21.065514,2020,Articles Clinical course and outcomes of criti...,Background An ongoing outbreak of pneumonia as...,{'paper_id': '1766a9d5945897581c80f83276cc723c...
8,2271485cae8757f2abdb1c2d012bb892c5421ba4,19.996134,-1,,A novel bat-origin coronavirus emerged in Wuha...,{'paper_id': '2271485cae8757f2abdb1c2d012bb892...
9,0efa3d98b741783872bbc16745f163a27b39a59e,19.996134,-1,A strategy to prevent future epidemics similar...,A novel bat-origin coronavirus emerged in Wuha...,{'paper_id': '0efa3d98b741783872bbc16745f163a2...
