<a href="https://colab.research.google.com/github/jansoe/public-covid-xr-data/blob/main/PRISMAmedRxiv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# medRxiv Scraping

#### Imports

In [1]:
import collections
import time
from pprint import pprint
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
#import arxiv

#### Google Drive/Spreadsheet Authentication

In [4]:
from google.colab import auth
# Run the Colab user-authentication flow. Presumably this provides the
# application-default credentials used for the spreadsheet access — TODO confirm.
auth.authenticate_user()

In [5]:
import gspread
from oauth2client.client import GoogleCredentials

# Spreadsheet client used by the cells that read the 'DatasetScraping' workbook.
# NOTE(review): oauth2client is deprecated upstream and newer gspread releases
# expect google-auth credentials — confirm against the installed versions.
gc = gspread.authorize(GoogleCredentials.get_application_default())

## Identification

In [6]:
base_url = 'https://www.medrxiv.org/'

# Date window handed to the medRxiv search via limit_from / limit_to.
start = '2020-01-01'
end = '2021-04-01'

# Every query is "<disease>" and "x-ray" and "<topic>". Multi-word topics use
# '%252B' (a double-encoded '+') in place of the space.
diseases = ['covid', 'covid-19']
modality = 'x-ray'
topics = ['dataset', 'machine%252Blearning', 'deep%252Blearning', 'data%252Bset']

# Double-encoded form of `"+and+"`: closes one quoted phrase and opens the next.
and_separator = "%2522%252Band%252B%2522"

terms = [
    and_separator.join([disease, modality, topic])
    for disease in diseases
    for topic in topics
]


def parse_term(x):
    """Wrap a joined search term in double-encoded quotation marks (%2522)."""
    return f'%2522{x}%2522'


limit_parsed = f'%20limit_from%3A{start}%20limit_to%3A{end}'
num_results = '%20numresults%3A75'

# One search URL per term, keyed by the (still encoded) term itself.
urls = {
    term: f'{base_url}search/{parse_term(term)}{limit_parsed}{num_results}'
    for term in terms
}
print('Search URLs')
pprint(urls)

Search URLs
{'covid%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522data%252Bset': 'https://www.medrxiv.org/search/%2522covid%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522data%252Bset%2522%20limit_from%3A2020-01-01%20limit_to%3A2021-04-01%20numresults%3A75',
 'covid%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522dataset': 'https://www.medrxiv.org/search/%2522covid%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522dataset%2522%20limit_from%3A2020-01-01%20limit_to%3A2021-04-01%20numresults%3A75',
 'covid%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522deep%252Blearning': 'https://www.medrxiv.org/search/%2522covid%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522deep%252Blearning%2522%20limit_from%3A2020-01-01%20limit_to%3A2021-04-01%20numresults%3A75',
 'covid%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522machine%252Blearning': 'https://www.medrxiv.org/search/%2522covid%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522machine%252Blearning%2522%20limit_from%3A2020-01-01%

In [7]:
# Collected papers keyed by DOI. A DOI is stored only the first time it is seen,
# so 'term' holds the first search term that returned the paper.
data = {}

# Seconds to wait for medRxiv before giving up instead of hanging the notebook.
REQUEST_TIMEOUT = 60

for term, search_url in urls.items():

    found = 0    # hits returned for this term, summed over all result pages
    skipped = 0  # hits whose DOI was already present in `data`
    page_url = search_url
    while page_url:

        url_response = requests.post(page_url, timeout=REQUEST_TIMEOUT)
        # Fail loudly: an error page would otherwise parse as "no results" and
        # silently undercount the hits for this term.
        url_response.raise_for_status()
        html = bs(url_response.text, features='html.parser')

        articles = html.find_all(attrs={'class': 'search-result'})
        for article in articles:
            found += 1
            # The DOI is whatever follows the last 'doi:' in the metadata text.
            doi = article.find(attrs={'class': 'highwire-cite-metadata'}).text.split('doi:')[-1].strip()
            if doi not in data:
                data[doi] = {
                    'title': article.find(attrs={'class': 'highwire-cite-title'}).find(attrs={'class': 'highwire-cite-title'}).text,
                    'author':  article.find(attrs={'class': 'highwire-citation-author first'}).text,
                    'term': term
                }
            else:
                skipped += 1

        # Follow the pager link until the page no longer offers one. urljoin
        # resolves the href against base_url without producing a doubled slash
        # and also copes with absolute hrefs.
        next_link = html.find(attrs={'class': "link-icon link-icon-after"})
        page_url = urljoin(base_url, next_link.attrs['href']) if next_link else None
    print(f'Search for {term} gave {found} results, containing {found-skipped} new results')

Search for covid%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522dataset gave 277 results, containing 277 new results
Search for covid%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522machine%252Blearning gave 207 results, containing 42 new results
Search for covid%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522deep%252Blearning gave 225 results, containing 41 new results
Search for covid%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522data%252Bset gave 624 results, containing 264 new results
Search for covid-19%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522dataset gave 276 results, containing 0 new results
Search for covid-19%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522machine%252Blearning gave 206 results, containing 0 new results
Search for covid-19%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522deep%252Blearning gave 224 results, containing 0 new results
Search for covid-19%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522data%252Bset gave 621 results, contain

In [8]:
# One row per DOI: the scraped dict is keyed by DOI, so transpose it and turn
# the DOI index into a regular 'doi' column.
identified = (
    pd.DataFrame(data)
    .T
    .reset_index()
    .rename(columns={'index': 'doi'})
)
# Number of newly identified papers attributed to each search term.
identified.groupby('term')['term'].count()

term
covid%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522data%252Bset            264
covid%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522dataset                 277
covid%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522deep%252Blearning        41
covid%2522%252Band%252B%2522x-ray%2522%252Band%252B%2522machine%252Blearning     42
Name: term, dtype: int64

## Screening step

#### Load manual annotation

In [11]:
worksheet = gc.open('DatasetScraping').worksheet('MedrxivPaper')
rows = worksheet.get_all_values()
# The first sheet row carries the column names, the remaining rows the records.
header, records = rows[0], rows[1:]
# Flag every row present in the sheet as done, so that papers missing from the
# sheet can be spotted after merging.
manual_annotated = pd.DataFrame.from_records(records, columns=header).assign(done=True)

In [12]:
# Attach the manual annotation to every identified paper. The left join keeps
# papers that have no row in the annotation sheet.
annotation_columns = ['doi', 'relevant', 'extracted datasets', 'published_doi', 'done']
annotated = identified.merge(
    manual_annotated[annotation_columns],
    on=['doi'],
    how='left',
)

Check if all papers are annotated

In [13]:
# Papers without a matching annotation row have a missing 'done' flag; count them as False.
annotated['done'].fillna(False).value_counts()

True    624
Name: done, dtype: int64

### Screening results

CT (computed tomography), US (ultrasound), and MRI are mapped to `nochestxraydata`.

In [None]:
# An empty 'relevant' annotation means the paper passed screening; every
# imaging-modality exclusion reason is collapsed into 'nochestxraydata'.
screening_labels = {
    '': 'PASSED',
    'nochestimagingdata': 'nochestxraydata',
    'CT': 'nochestxraydata',
    'US': 'nochestxraydata',
    'CT/US': 'nochestxraydata',
    'MRI': 'nochestxraydata',
}
annotated['relevant'].replace(screening_labels).value_counts()

nochestxraydata    534
PASSED              81
nocovid              9
self                 1
Name: relevant, dtype: int64

Keep only papers that passed filtering

In [15]:
# An empty 'relevant' annotation marks a paper that was not excluded during screening.
passed_screening = annotated['relevant'].eq('')
chestxray_paper = annotated[passed_screening]

### Dataset extraction

Check if all papers are annotated with datasets

In [16]:
# Rows shown here passed screening but have an empty 'extracted datasets' annotation.
has_no_datasets = chestxray_paper['extracted datasets'].str.len().eq(0)
chestxray_paper[has_no_datasets]

Unnamed: 0,doi,title,author,term,relevant,extracted datasets,published_doi,done


In [17]:
# Flatten the per-paper, ', '-separated dataset annotations into one list of
# raw references (a nested comprehension avoids the quadratic `sum(lists, [])`).
all_datasets = [
    reference
    for references in chestxray_paper['extracted datasets'].str.split(', ')
    for reference in references
]

# Substrings that mark a dataset reference as not publicly available.
exclude = ['privat', 'onpub', 'onapplication']

# Normalise first, then filter: the exclusion markers are lowercase, so matching
# against the raw text would miss differently-cased annotations, and
# whitespace-only entries would be counted under an empty name.
normalized_names = (reference.strip().lower() for reference in all_datasets)
public_names = [
    name for name in normalized_names
    if name and not any(marker in name for marker in exclude)
]

# Public dataset names with their number of references, most frequent first.
# Passing the column names to the constructor also works when nothing is left.
set_unique = pd.DataFrame(
    collections.Counter(public_names).most_common(),
    columns=['name', 'count'],
)

In [18]:
# Summary: all references, those counted as public, and the distinct public names.
n_references = len(all_datasets)
n_public = set_unique["count"].sum()
n_unique = set_unique.shape[0]
print(f'Found {n_references} dataset references, containing {n_public} public with {n_unique} thereof unique')

Found 191 dataset references, containing 173 public with 27 thereof unique


### Manual annotation of dataset eligibility

In [25]:
# Re-create the authorized client before reading the dataset-eligibility sheet.
gc = gspread.authorize(GoogleCredentials.get_application_default())
worksheet = gc.open('DatasetScraping').worksheet('PaperIdentifiedDatasets')

rows = worksheet.get_all_values()
# The first sheet row carries the column names, the remaining rows the records.
header, records = rows[0], rows[1:]
datasets_eligibility = pd.DataFrame.from_records(records, columns=header)

In [26]:
# Attach the manual eligibility annotation to every counted dataset name. The
# left join keeps names that are missing from the eligibility sheet.
is_anaylsed = set_unique.merge(datasets_eligibility, on='name', how='left')
# Rows shown here have no eligibility annotation yet.
not_yet_analysed = is_anaylsed['eligibility'].isna()
is_anaylsed[not_yet_analysed]

Unnamed: 0,name,count,eligibility,paper_name,origin


In [27]:
# An empty eligibility annotation is reported under the label 'ELIGIBLE'.
is_anaylsed['eligibility'].replace({'': 'ELIGIBLE'}).value_counts()

nocovid           10
remix              7
casecollection     6
ELIGIBLE           2
reannotation       2
Name: eligibility, dtype: int64

In [28]:
# List the datasets whose eligibility is empty or 'reannotation', grouped by that value.
kept_labels = ['', 'reannotation']
is_kept = is_anaylsed['eligibility'].isin(kept_labels)
is_anaylsed[is_kept].sort_values(by='eligibility')

Unnamed: 0,name,count,eligibility,paper_name,origin
12,bimcv,3,,BIMCV-COVID19,https://bimcv.cipf.es/bimcv-projects/bimcv-cov...
18,hmhospitales,1,,HM HOSPITALES,https://www.hmhospitales.com/coronavirus/covid...
16,covebase4all,2,reannotation,CARING,https://osf.io/b35xu/
25,tcia_segmentation,1,reannotation,AR-OPACITY,https://github.com/haimingt/opacity_segmentati...
