<a href="https://colab.research.google.com/github/jansoe/public-covid-xr-data/blob/main/PRISMAarxXiv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# arXiv Scraping

### Imports

In [1]:
import requests
from bs4 import BeautifulSoup as bs

import pandas as pd
import numpy as np
from pprint import pprint

import collections
import time

In [2]:
!pip install arxiv
import arxiv

Collecting arxiv
  Downloading https://files.pythonhosted.org/packages/63/cb/01bb646298dd4646277d0b366b6f50001cb2971b0fc2d3879cc5fce01097/arxiv-1.2.0-py3-none-any.whl
Collecting feedparser
[?25l  Downloading https://files.pythonhosted.org/packages/1c/21/faf1bac028662cc8adb2b5ef7a6f3999a765baa2835331df365289b0ca56/feedparser-6.0.2-py3-none-any.whl (80kB)
[K     |████████████████████████████████| 81kB 3.7MB/s 
[?25hCollecting sgmllib3k
  Downloading https://files.pythonhosted.org/packages/9e/bd/3704a8c3e0942d711c1299ebf7b9091930adae6675d7c8f476a7ce48653c/sgmllib3k-1.0.0.tar.gz
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-cp37-none-any.whl size=6067 sha256=20de3bc95a6d64c0c29cbd7efaac7b80d030ee190f93a781dac776c6d32688c7
  Stored in directory: /root/.cache/pip/wheels/f1/80/5a/444ba08a550cdd241bd9baf8bae44be750efe370adb944506a
Successfully built sgmllib3k
Installing

#### Google Drive Authentication 

In [3]:
# Interactive Colab sign-in. Presumably this provides the application-default
# credentials that the gspread client in the next cell picks up — TODO confirm.
from google.colab import auth
auth.authenticate_user()

In [4]:
import gspread
from oauth2client.client import GoogleCredentials

# Authorised gspread client; used further down to open the 'DatasetScraping'
# spreadsheet that holds the manual annotations.
gc = gspread.authorize(GoogleCredentials.get_application_default())

## Identification

In [5]:
# arXiv search queries: each disease spelling combined with each topic phrase,
# always restricted to x-ray papers. Order: all 'covid' queries first, then
# all 'covid-19' queries.
# NOTE(review): multi-word topics ('data set', 'machine learning', ...) are not
# quoted, and the recorded output shows the 'covid-19' variants returning a
# single hit each — confirm how the arXiv API parses these expressions.
terms = [
    f'all:{disease} AND all:x-ray AND all:{topic}'
    for disease in ('covid', 'covid-19')
    for topic in ('dataset', 'data set', 'machine learning', 'deep learning')
]

In [6]:
# Run every query against arXiv and collect the hits in one dict keyed by the
# version-less abstract link, so a paper found by several queries is stored
# only once (attributed to the first query that returned it).
screened_arxiv = {}

for query in terms:

    entries_before = len(screened_arxiv)
    # Reset per query: if the search yields nothing, the loop variable would
    # otherwise be undefined (first query) or carry over the previous count.
    n_results = 0

    search = arxiv.Search(
        query=query,
        max_results=500,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Ascending,
    )
    for n_results, result in enumerate(search.get(), start=1):
        # Cut the entry id at its last 'v' to drop the version suffix
        # (assumes ids end in 'v<version>', as in the links shown in the
        # output below — TODO confirm for all results).
        link = result.entry_id.rsplit('v', 1)[0]
        if link not in screened_arxiv:
            screened_arxiv[link] = {
                'title': result.title,
                'year': result.published.year,
                'month': result.published.month,
                'term': query,
                'doi': result.doi,
            }
    print(f'{len(screened_arxiv) - entries_before} new from {n_results} for {query}')

123 new from 123 for all:covid AND all:x-ray AND all:dataset
36 new from 110 for all:covid AND all:x-ray AND all:data set
33 new from 155 for all:covid AND all:x-ray AND all:machine learning
2 new from 156 for all:covid AND all:x-ray AND all:deep learning
0 new from 1 for all:covid-19 AND all:x-ray AND all:dataset
0 new from 1 for all:covid-19 AND all:x-ray AND all:data set
0 new from 1 for all:covid-19 AND all:x-ray AND all:machine learning
0 new from 1 for all:covid-19 AND all:x-ray AND all:deep learning


In [7]:
# One row per identified paper, ordered chronologically; the dict keys
# (arXiv links) become a regular 'link' column.
identified = (
    pd.DataFrame(screened_arxiv)
    .T
    .sort_values(by=['year', 'month'])
    .reset_index()
    .rename(columns={'index': 'link'})
)

#### temporal filtering

In [8]:
# Keep papers published from January 2020 through March 2021.
# The upper bound is spelled out explicitly: a plain
# `(year != 2021) | (month < 4)` test would also accept every year after 2021,
# so re-running the notebook later would silently widen the window.
published_after_2019 = identified.year > 2019
published_until_march_2021 = (identified.year < 2021) | (
    (identified.year == 2021) & (identified.month < 4)
)
identified = identified[published_after_2019 & published_until_march_2021]

In [10]:
# Show the three earliest and the three latest papers as a sanity check of
# the temporal window.
display(identified.head(3), identified.tail(3))

Unnamed: 0,link,title,year,month,term,doi
0,http://arxiv.org/abs/2003.09871,COVID-Net: A Tailored Deep Convolutional Neura...,2020,3,all:covid AND all:x-ray AND all:dataset,
1,http://arxiv.org/abs/2003.10769,Estimating Uncertainty and Interpretability in...,2020,3,all:covid AND all:x-ray AND all:dataset,
2,http://arxiv.org/abs/2003.10304,Attention U-Net Based Adversarial Architecture...,2020,3,all:covid AND all:x-ray AND all:dataset,


Unnamed: 0,link,title,year,month,term,doi
172,http://arxiv.org/abs/2103.03945,SCRIB: Set-classifier with Class-specific Risk...,2021,3,all:covid AND all:x-ray AND all:data set,
173,http://arxiv.org/abs/2103.05094,CovidGAN: Data Augmentation Using Auxiliary Cl...,2021,3,all:covid AND all:x-ray AND all:data set,10.1109/ACCESS.2020.2994762
174,http://arxiv.org/abs/2103.02961,Probabilistic combination of eigenlungs-based ...,2021,3,all:covid AND all:x-ray AND all:machine learning,


## Screening step

Load the manual annotations

In [18]:
# Read the manually annotated papers from the 'ArxivPaper' sheet; the first
# row holds the column names.
worksheet = gc.open('DatasetScraping').worksheet('ArxivPaper')
rows = worksheet.get_all_values()
header, records = rows[0], rows[1:]

# Every row present in the sheet counts as annotated ('done').
manual_annotated = pd.DataFrame.from_records(records, columns=header).assign(done=True)

display(manual_annotated.head(1))

Unnamed: 0,link,title,year,month,doi,relevant,extracted datasets,doi_pub,done
0,http://arxiv.org/abs/2003.09871,COVID-Net: A Tailored Deep Convolutional Neura...,2020,3,,,covidx,10.1038/s41598-020-76550-z,True


In [19]:
# Attach the manual annotation columns to every identified paper; papers
# without a matching sheet row keep NaN in those columns (left join on link).
annotation_columns = ['link', 'relevant', 'extracted datasets', 'doi_pub', 'done']
annotated = identified.merge(
    manual_annotated[annotation_columns],
    how='left',
    on='link',
)

Check if all data is already annotated

In [20]:
# Papers missing from the annotation sheet have NaN in 'done'; count them as False.
annotated['done'].fillna(False).value_counts()

True    175
Name: done, dtype: int64

### Screening results papers

Map the `CT` and `US` labels to `nochestxraydata`

In [21]:
# Tally the screening outcome: an empty 'relevant' label means the paper
# passed; 'CT' and 'US' are folded into the 'nochestxraydata' category.
screening_labels = {
    '': 'PASSED',
    'CT': 'nochestxraydata',
    'US': 'nochestxraydata',
}
annotated['relevant'].replace(screening_labels).value_counts()

PASSED             155
nochestxraydata     19
nocovid              1
Name: relevant, dtype: int64

Filter for papers that passed screening (and check that all of them are annotated)

In [24]:
# Papers that passed screening carry an empty 'relevant' label.
passed_screening = annotated['relevant'].eq('')
chestxray_paper = annotated.loc[passed_screening]

# Any row listed here passed screening but has no extracted dataset yet.
no_datasets_extracted = chestxray_paper['extracted datasets'].str.len().eq(0)
chestxray_paper.loc[no_datasets_extracted]

Unnamed: 0,link,title,year,month,term,doi,relevant,extracted datasets,doi_pub,done


### Dataset Extraction

* Remove all datasets that are private (i.e. private, available on publication, or available on request) or that could not be identified
* Remove duplicates

In [25]:
# Flatten the per-paper, comma-separated dataset annotations into one list of
# raw references (a nested comprehension avoids the quadratic `sum(lists, [])`).
all_datasets = [
    reference
    for references in chestxray_paper['extracted datasets'].str.split(', ')
    for reference in references
]

# Substrings marking datasets that are not publicly available or not identifiable.
exclude = ['privat', 'onpub', 'not_ident', 'onrequest']

# Normalise BEFORE filtering so the emptiness and exclusion checks see the same
# text that is counted: otherwise differently-cased markers (e.g. 'Privat') or
# whitespace-only entries would slip through.
normalized_references = [reference.strip().lower() for reference in all_datasets]
public_counts = collections.Counter(
    name
    for name in normalized_references
    if name and not any(marker in name for marker in exclude)
)

# Passing the column names to the constructor also works when nothing is left
# after filtering (assigning `.columns` afterwards would raise in that case).
set_unique = pd.DataFrame(public_counts.most_common(), columns=['name', 'count'])

In [26]:
# Summary: raw references, references to public datasets, distinct public datasets.
n_references = len(all_datasets)
n_public = set_unique["count"].sum()
n_unique = len(set_unique)
print(f'Found {n_references} dataset references, containing {n_public} public with {n_unique} thereof unique')

Found 441 dataset references, containing 415 public with 47 thereof unique


## Manual annotation of dataset eligibility
Load annotations

In [29]:
# Re-authorise the gspread client and read the dataset eligibility
# annotations from the 'PaperIdentifiedDatasets' sheet.
gc = gspread.authorize(GoogleCredentials.get_application_default())
worksheet = gc.open('DatasetScraping').worksheet('PaperIdentifiedDatasets')

# The first row holds the column names, the remaining rows the annotations.
rows = worksheet.get_all_values()
header, records = rows[0], rows[1:]
datasets_eligibility = pd.DataFrame.from_records(records, columns=header)

Check if all datasets are annotated 

In [30]:
# Left-join the eligibility annotations onto the extracted datasets; a NaN
# eligibility marks a dataset that has no row in the annotation sheet yet.
is_anaylsed = set_unique.merge(datasets_eligibility, how='left', on='name')
is_anaylsed.loc[is_anaylsed['eligibility'].isna()]

Unnamed: 0,name,count,eligibility,paper_name,origin


In [31]:
# An empty eligibility label means no exclusion reason was recorded, i.e. eligible.
is_anaylsed['eligibility'].replace({'': 'ELIGIBLE'}).value_counts()

remix                 15
nocovid               12
casecollection        10
ELIGIBLE               8
remix_reannotation     1
noinfo                 1
Name: eligibility, dtype: int64

In [32]:
# List the eligible datasets (those with an empty eligibility label).
is_anaylsed.loc[is_anaylsed['eligibility'].eq('')]

Unnamed: 0,name,count,eligibility,paper_name,origin
10,bimcv,10,,BIMCV-COVID19,https://bimcv.cipf.es/bimcv-projects/bimcv-cov...
14,actualmed,7,,ACTUALMED,https://github.com/agchung/Actualmed-COVID-che...
18,mlhannover,5,,ML HANNOVER,https://github.com/ml-workgroup/covid-19-image...
20,brixia,4,,BRIXIA,https://brixia.github.io/
26,covidgr,2,,COVIDGR,https://dasci.es/transferencia/open-data/covid...
31,tcia_rural,2,,COVID-19-AR,https://wiki.cancerimagingarchive.net/pages/vi...
32,hmhospitales,2,,HM HOSPITALES,https://www.hmhospitales.com/coronavirus/covid...
42,aiforcovid,1,,AIforCOVID,https://aiforcovid.radiomica.it/
