# Risk factor for COVID-19

Author: Jingbo Liu, Tina Zhao, Yinsen Miao, Jenny Yang  

Main strategy: we used the "key words" + "elastic search" to find out the most relavant papers from the database. Then, based on the match score from high to low. We provided: (1) important selected papers; (2) manully extract tables and numbers from the top 3 papers for each risk factor described below. 

## Content Summary

### 1. behavioral risk factors
1. Smoking
2. Drinking
3. exposure to seafood market

### 2. exisiting diseases/co-morbidities
1. respiratory disease
2. cardiovascular disease
3. chronic disease, including high blood pressure, diabetes, malignancy
4. Other

### 3. Demographic characterisics
1. Age
2. Gender
3. Occupation
4. pregnancy
5. socio-economic status

### 4. COVID-19 characteristics
1. patient number, deaths number
2. incubation period
3. symptom

In [6]:
###set up elastic search portal
import logging
from elasticsearch import Elasticsearch, helpers
import pandas as pd

# es = Elasticsearch(
#     hosts=[{"hosts":"99:44.95.175", "port":9200}], 
#     request_timeout=12000)
# index_name = "covid-kaggle"
# doc_type = "pdf_json"


from datetime import datetime
from elasticsearch import Elasticsearch, helpers
es = Elasticsearch(
    hosts=[{'host': "99.44.95.175", 'port': 9200}])

In [7]:
## check total paper numbers
query = { 
    'size' : 10000,
    "query" : { 
        "match_all" : {} 
    },
    "stored_fields": []
}
a=helpers.scan(es,query=query,scroll='1m',index='covid-kaggle')#like others so far
all_es_ids=[aa['_id'] for aa in a]
print("There are %i of ids on the elasticsearch server"%(len(all_es_ids)))

There are 39706 of ids on the elasticsearch server


In [8]:
##define functions used for query
def es_extract(hits):
    scores = []
    paper_ids = []
    publish_years = []
    titles = []
    abstracts = []
    sources = []
    for hit in hits:
        scores.append(hit["_score"])
        paper_ids.append(hit["_source"]["paper_id"])
        publish_years.append(hit["_source"]["publish_year"])
        titles.append(hit["_source"]["metadata"]["title"])
        abstracts.append(" ".join([tem_["text"] for tem_ in hit["_source"]["abstract"]]))
        sources.append(hit["_source"])

    return scores, paper_ids, publish_years, titles, abstracts, sources 

def es_equery(es, query, index='covid-kaggle', doc_type="pdf_json"):
    '''
    accept : es connection and query doc
    return : dataframe with paper_id, match_scores, publish_year, titles, abstracts, json_contents
    '''
    res3 = es.search(index=index, doc_type=doc_type, body=query)
    print("There are %i of papers returned"%(res3["hits"]["total"]["value"]))
    
    scores, paper_ids, publish_years, titles, abstracts, sources  = es_extract(res3["hits"]["hits"])
    df_return = pd.DataFrame.from_dict({
        "paper_id": paper_ids, 
        "match_scores": scores, 
        "publish_years": publish_years, 
        "title": titles, 
        "abstract": abstracts, 
        "json_obj": sources
    })
    return df_return

In [23]:
def build_doc(keyword):
    '''
    keywords could be one string, or a list with multiple string
    
    '''
    
    doc = {
        'query': {
            "bool":{
                "must":
                   [{"match": {"abstract.text": "risk"}},  
                    {"match": {"body_text.text":"covid-19"}},
                    {"match": {"body_text.text": "noval coronavirus"}},
                    {"match": {"body_text.text": "risk"}},
                    {"match": {"body_text.text": keyword}},
                    {"match": {"body_text.text": "2019-ncov"}},
                    {"match": {"body_text.text": "sars coronavirus 2"}},
                    {"match": {"body_text.text": "sars-cov-2"}},
                    {"match": {"body_text.text": "coronavirus"}}
                    ], 
                "should":
                [  {"match": {"abstract.text":"corona\w{1, 3}virus"}}, 
                   {"match": {"abstract.text":"covid-19"}}, 
                   {"match": {"metadata.title":"corona\w{1, 3}virus"}}, 
                   {"match": {"metadata.title":"covid-19"}}, 
                ], 
                "minimum_should_match" : 1,
            }
        }
    }
    
    return doc

In [12]:
#load meta.csv for summary of database
meta_csv = pd.read_csv("metadata.csv")
print(meta_csv.shape)

(47298, 18)


In [15]:
meta_csv.head(3)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,8q5ondtn,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535.0,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,False,custom_license,https://doi.org/10.1016/0002-8703(72)90077-4
1,pzfd0e50,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850.0,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,False,custom_license,https://doi.org/10.1016/0002-8703(80)90355-5
2,22bka3gi,,Elsevier,Cigarette smoking and coronary heart disease: ...,10.1016/0002-8703(80)90356-7,,7355701.0,els-covid,,1980-03-31,"Friedman, Gary D",American Heart Journal,,,False,False,custom_license,https://doi.org/10.1016/0002-8703(80)90356-7


## 1. Behavioral Risk Factors



### 1.1 Smoking

In [25]:
#query
smoke_df = es_equery(es, build_doc(keyword="smoking"))

There are 10 of papers returned


In [44]:
#query
smoke_df = es_equery(es, build_doc(keyword="vaping"))

There are 0 of papers returned


In [43]:
smoke_df.head(3)

Unnamed: 0,paper_id,match_scores,publish_years,title,abstract,json_obj
0,aafa6cdfe96a5cdaf7b7c2f04b11a5dbdd73b2df,54.171986,2020,"Incidence, clinical characteristics and progno...","Background: Recently, Coronavirus Disease 2019...",{'paper_id': 'aafa6cdfe96a5cdaf7b7c2f04b11a5db...
1,bf013151c39358fb0c19ad4dee802a9ccb7fb0ac,48.682587,2020,ACE-2 Expression in the Small Airway Epithelia...,"All of these institutions are in Vancouver, Br...",{'paper_id': 'bf013151c39358fb0c19ad4dee802a9c...
2,1a48450da54865731ffe01c0e289a63065b3b1fd,46.779774,2020,Risk Factors Associated with Clinical Outcomes...,With evidence of sustained transmission in mor...,{'paper_id': '1a48450da54865731ffe01c0e289a630...


In [16]:
for paper_id in smoke_df["paper_id"].head(3):
    print(meta_csv[meta_csv["sha"] == paper_id]["url"])

42078    https://doi.org/10.1101/2020.03.17.20037572
Name: url, dtype: object
42112    https://doi.org/10.1101/2020.03.18.20038455
Name: url, dtype: object
42326    https://doi.org/10.1101/2020.03.25.20037721
Name: url, dtype: object


#### Paper: https://doi.org/10.1101/2020.03.17.20037572

The first paper with the highest match_score is a meta-analysis study which combines more than 30 studies with 53000 patients included. This paper has provided a comprehensive discussion about the risk factors, symptoms, and lab test results among COVID-19 patients. Summarized as below:

| COVID-19 | metric | 95%CI | SARS | MERS |
| --- | --- | --- | --- | --- |
| Smoking (%) | 6.4 | 0.0 - 12.9 | 17 | 23 |
| Age (avg. years) | 49.8 | 47.5 - 52.2 | 39.9 | 50 |
| Male (%) | 55.5 | 53.2 - 57.7 | 43 | 64.5 |
| Incubation Time (days) | 7.10 | 6.06 - 8.14 | --- | --- |
| days from symptom onset to hospital admission (days) | 6.18 | 5.23 - 7.12 | --- | --- |

| Comorbidity | percentage | 95%CI | SARS | MERS |
| --- | --- | --- | --- | --- |
| Any comorbidity | 37.1 | 28.1 - 46.1 | 10 - 30* | 76 |
| Hypertension | 19.0 | 13.2 - 24.9 | 19 | 34 |
| diabetes | 8.2 | 6.3 - 10.0 | 24 | 68 |
| cardiovascular disease | 2.7 | 1.4 - 4.1 | 10 | 28 |
| COPD | 0.6 | 0.3 - 0.9 | --- | --- |
| CKD | 0.4 | 0.1 - 0.7 | 2 - 6* | 49 |
| Cancer | 0.8 | 0.1 - 1.5 | 3 | 2 |

Note:  
COPD: chronic obstructive pulmonary disease  
CKD: chronic kidney disease


### 1.2 drinking

No significant paper talk about drinking/alcohol usage information. However, drinking alcohol is proved to show significant relation with many chronic disease, such as diabete and hypertenison, and cardiovascular disease.

In [26]:
#query
drink_df = es_equery(es, build_doc(keyword = "drink"))

There are 1 of papers returned


In [27]:
drink_df

Unnamed: 0,paper_id,match_scores,publish_years,title,abstract,json_obj
0,7fadc31f532d97c2f2317c05237e5d17833a0381,12.762307,2019,Review Challenges in the diagnosis of paediatr...,Pneumonia is a leading killer of children youn...,{'paper_id': '7fadc31f532d97c2f2317c05237e5d17...


### 1.3 Seafood market exposure

Though the early identified few patients show enrichment of seafood market exposure history, but later on, it is confirmed that person-to-person transmission occurs.

In [30]:
exposure_df = es_equery(es, build_doc(keyword="seafood market"))

There are 34 of papers returned


In [34]:
for paper_id in exposure_df["paper_id"].head(3):
    print(meta_csv[meta_csv["sha"] == paper_id]["url"])

45476    https://doi.org/10.1016/s0140-6736(20)30566-3
Name: url, dtype: object
42872    https://doi.org/10.1097/cm9.0000000000000782
Name: url, dtype: object
47079    https://doi.org/10.1016/j.ijid.2020.03.017
Name: url, dtype: object


## 2. Co-morbidities


### 2.1 cardiovascular disease

In [41]:
CD = es_equery(es, build_doc(keyword="cardiovascular disease"))

There are 116 of papers returned


In [40]:
for paper_id in CD["paper_id"].head(3):
    print(meta_csv[meta_csv["sha"] == paper_id]["url"])

45476    https://doi.org/10.1016/s0140-6736(20)30566-3
Name: url, dtype: object
42078    https://doi.org/10.1101/2020.03.17.20037572
Name: url, dtype: object
41754    https://doi.org/10.1101/2020.02.28.20028514
Name: url, dtype: object


## Interface develop

In [5]:
import ipywidgets as widgets
from IPython.display import display

In [15]:
from gensim.summarization import summarize, keywords
def sen2keywords(sentence):
       return  keywords(sentence)

output = sen2keywords("Is vascular disease a risk factor of COVID-19?")
print(output)




In [16]:
text = 'Challenges in natural language processing frequently involve speech recognition, natural language understanding, natural language generation (frequently from formal, machine-readable logical forms), connecting language and machine perception, dialog systems, or some combination thereof.'
keywords(text)

'natural language\nmachine\nfrequently'

### simple version

In [7]:
text = widgets.Combobox(

    placeholder='Type your question here...',
    options=['What are the risk factors of COVID-19?', 
             'Is smoking a risk factor for COVID-19?', 
             'Is cardiovascular disease a risk factor for COVID-19?'],
    description='Question:',
    ensure_option=True,
    disabled=False
)



def process_question(text):
    questions = text.value
    keywords = sen2keywords(questions)
    print("Keywords" + keywords)
    
text.on_submit(process_question)
text

Combobox(value='', description='Question:', ensure_option=True, options=('What are the risk factors of COVID-1…

Keywords


In [106]:
text = widgets.Combobox(

    placeholder='Type your question here...',
    options=['What are the risk factors of COVID-19?', 
             'Is smoking a risk factor for COVID-19?', 
             'Is cardiovascular disease a risk factor for COVID-19?'],
    description='Question:',
    ensure_option=True,
    disabled=False
)


def process_question(text):
    questions = text.value
    print("Answer: " + questions[:3])
    
    
text.on_submit(process_question)

#text
widgets.interactive_output( process_question, text)
out
#display(text)
#print(text.value)

AttributeError: 'Combobox' object has no attribute 'items'

### complex version

In [109]:
from ipywidgets import AppLayout, Button, Layout, GridspecLayout, Output

grid = GridspecLayout(4,2, height = "400px", width = "800px")
# grid[0, 0:] = widgets.Image(
#     value=image,
#     format='png',
#     width=800,
#     height=400)

display("Ask question about COVID-19")

grid[1, 0] = widgets.Combobox(

    placeholder='Type or select your question here...',
    options=['What are the risk factors of COVID-19?', 
             'Is smoking a risk factor for COVID-19?', 
             'Is cardiovascular disease a risk factor for COVID-19?'],
    description='Question:',
    ensure_option=True,
    disabled=False
)
grid[2, 0] =  widgets.RadioButtons(
    options=['1', '3', '5'],
    value='3', # Defaults to 'pineapple'
    layout={'width': 'max-content'}, # If the items' names are long
    description='# of papers',
    disabled=False
)

grid[2, 1] = widgets.RadioButtons(
    options=['3', '5', '7'],
    value='3', # Defaults to 'pineapple'
    layout={'width': 'max-content'}, # If the items' names are long
    description='# of sentences',
    disabled=False
)

#out = widgets.Output(layout={'border': '1px solid black'})
#out.append_display_data(display(text))

#grid[3, 0:] = out
#grid[3, :].append_stdout('Output appended with append_stdout')
#grid[3, :].append_display_data(on_submit(process_question))
#header_button = Button(description = 'Ask your question Here', button_style = 'primary', font_weight = 30)
#left_button = Button(description = 'Left', button_style = 'info')
#center_button = create_expanded_button('Center', 'warning')
grid



'Ask question about COVID-19'

GridspecLayout(children=(Combobox(value='', description='Question:', ensure_option=True, layout=Layout(grid_ar…

In [80]:
b1 = Button(description='Custom color')
b1.style.button_color = 'lightgreen'
b1

Button(description='Custom color', style=ButtonStyle(button_color='lightgreen'))

### tkinter usage

In [82]:
import tkinter as tk

master = tk.Tk()
tk.Label(master, text="Type question here").grid(row=0)
tk.Label(master, text="Last Name").grid(row=1)

e1 = tk.Entry(master)
e2 = tk.Entry(master)

e1.grid(row=0, column=1)
e2.grid(row=1, column=1)

master.mainloop()

### interactive output

In [103]:
a = widgets.IntSlider(description='a')
b = widgets.IntSlider(description='b')
c = widgets.IntSlider(description='c')
def f(a, b, c):
    print('{}*{}*{}={}'.format(a, b, c, a*b*c))

out = widgets.interactive_output(f, {'a': a, 'b': b, 'c': c})
out
#widgets.HBox([widgets.VBox([a, b, c]), out])

Output()