In [33]:
import json
import requests

from tqdm.notebook import tqdm

import pandas as pd
import numpy as np

from seq2cite import config, utils

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Exploring the CORD-19 Dataset

In [24]:
# Open the AWS S3 handles via the project helper (presumably a boto3 client and
# resource pair -- TODO confirm in seq2cite.utils). `s3` is reused by later
# cells to generate presigned download URLs.
s3, s3_resource = utils.connect_aws_s3()
# Resolve the CORD-19 bucket object; the listing cell iterates its `.objects.all()`.
cord19_bucket = utils.get_cord19_bucket(s3, s3_resource)

In [29]:
def list_bucket_contents(bucket_resource, match='', size_mb=0, stop_at_json=True):
    """Print the objects in a bucket together with size summaries.

    Walks ``bucket_resource.objects.all()`` and prints one line per listed
    object (``<key> (<size>MB)``), followed by summary lines.

    Args:
        bucket_resource: Object exposing ``objects.all()``, which yields items
            with a ``key`` (str) and a ``size`` (bytes) attribute.
        match: If non-empty, only keys containing this substring are listed
            and an extra "Matched file size" summary line is printed.
        size_mb: If non-zero, only objects of at most this many megabytes are
            listed. ``0`` disables the size filter.
        stop_at_json: When True (the default, matching the original
            behaviour), the walk stops right after the first key ending in
            ``.json`` has been processed. Pass False to scan every object.

    Note:
        The totals cover only the objects visited before the walk stopped, so
        with ``stop_at_json=True`` the "Bucket total" line may under-report
        the real bucket size.
    """
    # Sizes are accumulated in megabytes and converted to GB only when printed.
    total_size_mb = 0
    total_files = 0
    match_size_mb = 0
    match_files = 0

    for obj in bucket_resource.objects.all():
        obj_size_mb = obj.size / 1024 / 1024
        total_size_mb += obj_size_mb
        total_files += 1

        name_matches = not match or match in obj.key
        size_matches = not size_mb or obj_size_mb <= size_mb
        if name_matches and size_matches:
            match_files += 1
            match_size_mb += obj_size_mb
            print(f'{obj.key} ({obj_size_mb:3.0f}MB)')

        # The object that triggers the stop has already been counted (and
        # listed, if it matched) above.
        if stop_at_json and obj.key.endswith('.json'):
            break

    if match:
        print(f'Matched file size is {match_size_mb/1024:3.1f}GB with {match_files} files')

    print(f'Bucket total size is {total_size_mb/1024:3.1f}GB with {total_files} files')
    
    

In [30]:
list_bucket_contents(cord19_bucket)

2020-03-13/COVID.DATA.LIC.AGMT.pdf (  0MB)
2020-03-13/all_sources_metadata_2020-03-13.csv ( 48MB)
2020-03-13/all_sources_metadata_2020-03-13.readme (  0MB)
2020-03-13/biorxiv_medrxiv.tar.gz ( 13MB)
2020-03-13/comm_use_subset.tar.gz (186MB)
2020-03-13/json_schema.txt (  0MB)
2020-03-13/noncomm_use_subset.tar.gz ( 36MB)
2020-03-13/pmc_custom_license.tar.gz ( 19MB)
2020-03-20/biorxiv_medrxiv.tar.gz ( 13MB)
2020-03-20/changelog (  0MB)
2020-03-20/comm_use_subset.tar.gz (183MB)
2020-03-20/custom_license.tar.gz (344MB)
2020-03-20/metadata.csv ( 60MB)
2020-03-20/noncomm_use_subset.tar.gz ( 40MB)
2020-03-27/biorxiv_medrxiv.tar.gz ( 15MB)
2020-03-27/changelog (  0MB)
2020-03-27/comm_use_subset.tar.gz (186MB)
2020-03-27/custom_license.tar.gz (414MB)
2020-03-27/metadata.csv ( 66MB)
2020-03-27/metadata_with_mag_mapping.csv ( 68MB)
2020-03-27/noncomm_use_subset.tar.gz ( 40MB)
2020-04-03/biorxiv_medrxiv.tar.gz ( 18MB)
2020-04-03/changelog (  0MB)
2020-04-03/comm_use_subset.tar.gz (346MB)
2020-04-03/

In [25]:
def preview_csv_dataset(bucket, key, rows=10):
    """Load the leading rows of a CSV object stored in S3.

    A presigned ``get_object`` URL is generated with the notebook-level ``s3``
    handle and handed to ``pd.read_csv``.

    Args:
        bucket: Name of the S3 bucket holding the object.
        key: Object key of the CSV file inside the bucket.
        rows: Number of rows to read (passed to ``read_csv`` as ``nrows``).

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    presigned_url = s3.generate_presigned_url(
        ClientMethod='get_object',
        Params={'Bucket': bucket, 'Key': key},
    )
    return pd.read_csv(presigned_url, nrows=rows)

In [38]:
# Read the first 200 rows (rows -> nrows) of the 2020-04-03 metadata CSV from the CORD-19 bucket.
df_preview = preview_csv_dataset(config.cord19_aws_bucket, '2020-04-03/metadata.csv', rows=200)
# Project display helper; presumably renders the frame without truncation -- TODO confirm in seq2cite.utils.
utils.display_all(df_preview)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,8q5ondtn,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535.0,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,False,custom_license,https://doi.org/10.1016/0002-8703(72)90077-4
1,pzfd0e50,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850.0,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,False,custom_license,https://doi.org/10.1016/0002-8703(80)90355-5
2,22bka3gi,,Elsevier,Cigarette smoking and coronary heart disease: ...,10.1016/0002-8703(80)90356-7,,7355701.0,els-covid,,1980-03-31,"Friedman, Gary D",American Heart Journal,,,False,False,custom_license,https://doi.org/10.1016/0002-8703(80)90356-7
3,zp9k1k3z,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077.0,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",The American Journal of Medicine,,,True,False,custom_license,https://doi.org/10.1016/0002-9343(73)90176-9
4,cjuzul89,,Elsevier,Epidemiology of community-acquired respiratory...,10.1016/0002-9343(85)90361-4,,4014285.0,els-covid,Abstract Upper respiratory tract infections ar...,1985-06-28,"Garibaldi, Richard A.",The American Journal of Medicine,,,False,False,custom_license,https://doi.org/10.1016/0002-9343(85)90361-4
5,wwf90zxt,212e990b378e8d267042753d5f9d4a64ea5e9869,Elsevier,Infectious diarrhea: Pathogenesis and risk fac...,10.1016/0002-9343(85)90367-5,,2861742.0,els-covid,Abstract Our understanding of the pathogenesis...,1985-06-28,"Cantey, J.Robert",The American Journal of Medicine,,,True,False,custom_license,https://doi.org/10.1016/0002-9343(85)90367-5
6,dlh93ax6,bf5d344243153d58be692ceb26f52c08e2bd2d2f,Elsevier,New perspectives on the pathogenesis of rheuma...,10.1016/0002-9343(88)90356-7,,3052052.0,els-covid,Abstract In the pathogenesis of rheumatoid art...,1988-10-14,"Zvaifler, Nathan J.",The American Journal of Medicine,,,True,False,custom_license,https://doi.org/10.1016/0002-9343(88)90356-7
7,i94lyfsh,ddd2ecf42ec86ad66072962081e1ce4594431f9c,Elsevier,Management of acute and chronic respiratory tr...,10.1016/0002-9343(88)90456-1,,3048091.0,els-covid,"Abstract Pharyngitis, bronchitis, and pneumoni...",1988-09-16,"Ellner, Jerrold J.",The American Journal of Medicine,,,True,False,custom_license,https://doi.org/10.1016/0002-9343(88)90456-1
8,vs5yondw,a55cb4e724091ced46b5e55b982a14525eea1c7e,Elsevier,Acute bronchitis: Results of U.S. and European...,10.1016/0002-9343(92)90608-e,,1621745.0,els-covid,"Abstract Acute bronchitis, an illness frequent...",1992-06-22,"Dere, Willard H.",The American Journal of Medicine,,,True,False,custom_license,https://doi.org/10.1016/0002-9343(92)90608-e
9,qwh8ei60,a1fd28115cfa14869f29f30d2121c1058e6e81e9,Elsevier,Clinical and Immunologic Responses in Patients...,10.1016/0002-9394(75)90398-0,,170831.0,els-covid,,1975-10-31,"Knopf, Harry L.S.; Hierholzer, John C.",American Journal of Ophthalmology,,,True,False,custom_license,https://doi.org/10.1016/0002-9394(75)90398-0


In [39]:
def read_json_item(key, timeout=30):
    """Download one JSON object from the CORD-19 bucket and parse it.

    A presigned ``get_object`` URL is generated with the notebook-level ``s3``
    handle for ``config.cord19_aws_bucket`` and fetched over HTTP.

    Args:
        key: Object key of the JSON file inside the CORD-19 bucket.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of hanging the kernel.

    Returns:
        The decoded JSON document.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (e.g. a missing key), instead of failing later with a confusing
            JSON decode error on the error body.
        json.JSONDecodeError: If the downloaded body is not valid JSON.
    """
    data_source = {
        'Bucket': config.cord19_aws_bucket,
        'Key': key
    }
    url = s3.generate_presigned_url(ClientMethod='get_object', Params=data_source)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Parse the raw bytes so json detects the UTF encoding itself rather than
    # relying on the charset requests guesses for `.text`.
    return json.loads(response.content)