In [109]:
import json
import requests
import re

from tqdm.notebook import tqdm

import pandas as pd
import numpy as np

from seq2cite import config, utils, aws, text

import sys
if str(config.src) not in sys.path:
    sys.path.append(str(config.src))

processing_data = __import__('0_processing_data')

%load_ext autoreload
%autoreload 2
%load_ext line_profiler

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


# Exploring CORD-19 Dataset

In [5]:
s3, s3_resource = aws.connect_aws_s3()
cord19_bucket = aws.get_cord19_bucket(s3, s3_resource)

In [6]:
def list_bucket_contents(bucket_resource, match='', size_mb=0):
    total_size_gb = 0
    total_files = 0
    match_size_gb = 0
    match_files = 0
    for key in bucket_resource.objects.all():
        key_size_mb = key.size/1024/1024
        total_size_gb += key_size_mb
        total_files += 1
        list_check = False
        if not match:
            list_check = True
        elif match in key.key:
            list_check = True
        if list_check and not size_mb:
            match_files += 1
            match_size_gb += key_size_mb
            print(f'{key.key} ({key_size_mb:3.0f}MB)')
        elif list_check and key_size_mb <= size_mb:
            match_files += 1
            match_size_gb += key_size_mb
            print(f'{key.key} ({key_sizexmb:3.0f}MB)')

        if key.key.endswith('.json'):
            break
    if match:
        print(f'Matched file size is {match_size_gb/1024:3.1f}GB with {match_files} files')            
    
    print(f'Bucket total size is {total_size_gb/1024:3.1f}GB with {total_files} files')
    
    

In [7]:
list_bucket_contents(cord19_bucket)

2020-03-13/COVID.DATA.LIC.AGMT.pdf (  0MB)
2020-03-13/all_sources_metadata_2020-03-13.csv ( 48MB)
2020-03-13/all_sources_metadata_2020-03-13.readme (  0MB)
2020-03-13/biorxiv_medrxiv.tar.gz ( 13MB)
2020-03-13/comm_use_subset.tar.gz (186MB)
2020-03-13/json_schema.txt (  0MB)
2020-03-13/noncomm_use_subset.tar.gz ( 36MB)
2020-03-13/pmc_custom_license.tar.gz ( 19MB)
2020-03-20/biorxiv_medrxiv.tar.gz ( 13MB)
2020-03-20/changelog (  0MB)
2020-03-20/comm_use_subset.tar.gz (183MB)
2020-03-20/custom_license.tar.gz (344MB)
2020-03-20/metadata.csv ( 60MB)
2020-03-20/noncomm_use_subset.tar.gz ( 40MB)
2020-03-27/biorxiv_medrxiv.tar.gz ( 15MB)
2020-03-27/changelog (  0MB)
2020-03-27/comm_use_subset.tar.gz (186MB)
2020-03-27/custom_license.tar.gz (414MB)
2020-03-27/metadata.csv ( 66MB)
2020-03-27/metadata_with_mag_mapping.csv ( 68MB)
2020-03-27/noncomm_use_subset.tar.gz ( 40MB)
2020-04-03/biorxiv_medrxiv.tar.gz ( 18MB)
2020-04-03/changelog (  0MB)
2020-04-03/comm_use_subset.tar.gz (346MB)
2020-04-03/

In [8]:
def preview_csv_dataset(bucket, key, rows=10):
    data_source = {
            'Bucket': bucket,
            'Key': key
        }
    # Generate the URL to get Key from Bucket
    url = s3.generate_presigned_url(
        ClientMethod = 'get_object',
        Params = data_source
    )

    data = pd.read_csv(url, nrows=rows)
    return data

In [9]:
metadata = pd.read_csv(f's3://{config.cord19_aws_bucket}/2020-04-03/metadata.csv',
                       nrows=100,
                       skiprows=0,
                      names=config.metadata_columns,
                      header=0)
metadata

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,8q5ondtn,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535.0,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,False,custom_license,https://doi.org/10.1016/0002-8703(72)90077-4
1,pzfd0e50,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850.0,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,False,custom_license,https://doi.org/10.1016/0002-8703(80)90355-5
2,22bka3gi,,Elsevier,Cigarette smoking and coronary heart disease: ...,10.1016/0002-8703(80)90356-7,,7355701.0,els-covid,,1980-03-31,"Friedman, Gary D",American Heart Journal,,,False,False,custom_license,https://doi.org/10.1016/0002-8703(80)90356-7
3,zp9k1k3z,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077.0,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",The American Journal of Medicine,,,True,False,custom_license,https://doi.org/10.1016/0002-9343(73)90176-9
4,cjuzul89,,Elsevier,Epidemiology of community-acquired respiratory...,10.1016/0002-9343(85)90361-4,,4014285.0,els-covid,Abstract Upper respiratory tract infections ar...,1985-06-28,"Garibaldi, Richard A.",The American Journal of Medicine,,,False,False,custom_license,https://doi.org/10.1016/0002-9343(85)90361-4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,ro3x3qa3,bcea326da7ffbd034aa531671d14f0b907e561cc,Elsevier,A comparative assessment of four serological m...,10.1016/0020-7519(88)90147-6,,,els-covid,Abstract Antibodies against Crithidia fascicul...,1988-04-30,"Ingram, George A.; Al-Yaman, Fadwa",International Journal for Parasitology,,,True,False,custom_license,https://doi.org/10.1016/0020-7519(88)90147-6
96,ootj52fs,0ec192980f4b5026fb0aeef3de313f14ff58ef3d,Elsevier,Immunolabelling of fish host molecules on the ...,10.1016/0020-7519(94)00076-z,,7622332.0,els-covid,"Abstract Immunoblotting, SDS-PAGE and western ...",1995-02-28,"Williams, M.A.; Hoole, D.",International Journal for Parasitology,,,True,False,custom_license,https://doi.org/10.1016/0020-7519(94)00076-z
97,a4f8u1ze,188a50cddda7d12946f81822b3cee70d77d20cb4,Elsevier,Identification of diagnostic antigens for Sout...,10.1016/0020-7519(94)90034-5,,8026903.0,els-covid,Abstract Identification of diagnostic antigens...,1994-04-30,"Böse, Reinhard; Peymann, Berit; Barbosa, Imke ...",International Journal for Parasitology,,,True,False,custom_license,https://doi.org/10.1016/0020-7519(94)90034-5
98,i8x80knq,f6af90a5e3b6a85f64c42d66bbdc9ec3ace4b2ae,Elsevier,Diagnosis of Babesia caballi infections in hor...,10.1016/0020-7519(94)90081-7,,8070952.0,els-covid,Abstract From Babesia caballi in vitro culture...,1994-05-31,"Böse, Reinhard; Peymann, Berit",International Journal for Parasitology,,,True,False,custom_license,https://doi.org/10.1016/0020-7519(94)90081-7


In [10]:
metadata[~pd.isna(metadata['sha'])]

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
3,zp9k1k3z,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077.0,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",The American Journal of Medicine,,,True,False,custom_license,https://doi.org/10.1016/0002-9343(73)90176-9
5,wwf90zxt,212e990b378e8d267042753d5f9d4a64ea5e9869,Elsevier,Infectious diarrhea: Pathogenesis and risk fac...,10.1016/0002-9343(85)90367-5,,2861742.0,els-covid,Abstract Our understanding of the pathogenesis...,1985-06-28,"Cantey, J.Robert",The American Journal of Medicine,,,True,False,custom_license,https://doi.org/10.1016/0002-9343(85)90367-5
6,dlh93ax6,bf5d344243153d58be692ceb26f52c08e2bd2d2f,Elsevier,New perspectives on the pathogenesis of rheuma...,10.1016/0002-9343(88)90356-7,,3052052.0,els-covid,Abstract In the pathogenesis of rheumatoid art...,1988-10-14,"Zvaifler, Nathan J.",The American Journal of Medicine,,,True,False,custom_license,https://doi.org/10.1016/0002-9343(88)90356-7
7,i94lyfsh,ddd2ecf42ec86ad66072962081e1ce4594431f9c,Elsevier,Management of acute and chronic respiratory tr...,10.1016/0002-9343(88)90456-1,,3048091.0,els-covid,"Abstract Pharyngitis, bronchitis, and pneumoni...",1988-09-16,"Ellner, Jerrold J.",The American Journal of Medicine,,,True,False,custom_license,https://doi.org/10.1016/0002-9343(88)90456-1
8,vs5yondw,a55cb4e724091ced46b5e55b982a14525eea1c7e,Elsevier,Acute bronchitis: Results of U.S. and European...,10.1016/0002-9343(92)90608-e,,1621745.0,els-covid,"Abstract Acute bronchitis, an illness frequent...",1992-06-22,"Dere, Willard H.",The American Journal of Medicine,,,True,False,custom_license,https://doi.org/10.1016/0002-9343(92)90608-e
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,ro3x3qa3,bcea326da7ffbd034aa531671d14f0b907e561cc,Elsevier,A comparative assessment of four serological m...,10.1016/0020-7519(88)90147-6,,,els-covid,Abstract Antibodies against Crithidia fascicul...,1988-04-30,"Ingram, George A.; Al-Yaman, Fadwa",International Journal for Parasitology,,,True,False,custom_license,https://doi.org/10.1016/0020-7519(88)90147-6
96,ootj52fs,0ec192980f4b5026fb0aeef3de313f14ff58ef3d,Elsevier,Immunolabelling of fish host molecules on the ...,10.1016/0020-7519(94)00076-z,,7622332.0,els-covid,"Abstract Immunoblotting, SDS-PAGE and western ...",1995-02-28,"Williams, M.A.; Hoole, D.",International Journal for Parasitology,,,True,False,custom_license,https://doi.org/10.1016/0020-7519(94)00076-z
97,a4f8u1ze,188a50cddda7d12946f81822b3cee70d77d20cb4,Elsevier,Identification of diagnostic antigens for Sout...,10.1016/0020-7519(94)90034-5,,8026903.0,els-covid,Abstract Identification of diagnostic antigens...,1994-04-30,"Böse, Reinhard; Peymann, Berit; Barbosa, Imke ...",International Journal for Parasitology,,,True,False,custom_license,https://doi.org/10.1016/0020-7519(94)90034-5
98,i8x80knq,f6af90a5e3b6a85f64c42d66bbdc9ec3ace4b2ae,Elsevier,Diagnosis of Babesia caballi infections in hor...,10.1016/0020-7519(94)90081-7,,8070952.0,els-covid,Abstract From Babesia caballi in vitro culture...,1994-05-31,"Böse, Reinhard; Peymann, Berit",International Journal for Parasitology,,,True,False,custom_license,https://doi.org/10.1016/0020-7519(94)90081-7


In [140]:
metadata.head(10).to_dict('records')[0].sha

AttributeError: 'dict' object has no attribute 'sha'

In [141]:
tup1 = (1, 3, 6)
tup2 = (5, 6, 7)
tup1 + tup2

(1, 3, 6, 5, 6, 7)

In [122]:
id_ = 'f6af90a5e3b6a85f64c42d66bbdc9ec3ace4b2ae'
subset = 'custom_license'
article = aws.read_item(subset, id_, s3=s3)
article

{'paper_id': 'f6af90a5e3b6a85f64c42d66bbdc9ec3ace4b2ae',
 'metadata': {'title': 'DIAGNOSIS OF BABESIA CABALLI INFECTIONS IN HORSES BY ENZYME-LINKED IMMUNOSORBENT ASSAY (ELISA) AND WESTERN BLOT',
  'authors': [{'first': 'Reinhard',
    'middle': [],
    'last': 'Bose',
    'suffix': '',
    'affiliation': {},
    'email': ''},
   {'first': 'Berit',
    'middle': [],
    'last': 'Peymann',
    'suffix': '',
    'affiliation': {},
    'email': ''}]},
 'abstract': [{'text': 'R. and PEYMANN 3. 1994. Diagnosis of &&sia caba& infections in horses by enzymelinked immunosorbent assay (ELBA) and Western blot. inrernntionalJaurnulfnr Parasitology 24: 341-346. From Bubesia cabal/i in vitro cultures a preparation of 100% infected erythrocytes was obtained. From this, B. cabutli antigens were extracted with the detergent 3-[(3-CholamidopropyI)-dimethylammonio]-lpropane-sulfonate (CHAPS) and used as ELISA antigens. A control antigen of normal erythrocytes from the same donor horse was prepared in an 

In [3]:
import spacy
import scispacy

In [4]:
nlp = spacy.load("en_core_sci_sm", disable=["ner"])

In [14]:
section = article['body_text'][1]
doc = nlp(section['text'])

In [17]:
list(doc.sents)

[In a recent prevalence study of nosocomial infection in the United Kingdom (Meers et al., 1981) , there were surprisingly few reports of virus-associated disease.,
 There is, however, clear evidence that enteric viruses may be transmitted to patients in hospitals and other confined communities.,
 The spread of enteroviruses such as Echo (Nagington et al., 1978) and Coxsackie (Pether, 1982) is known to occur in these environments and in the case of neonates has occasionally resulted in serious illness with significant mortality.,
 Transmission of hepatitis A virus has also been documented as an infection hazard, especially in institutions caring for mentally retarded children (Krugman and Giles, 1970) .,
 During the last decade viruses have been recognized as an important major cause of acute diarrhoeal disease.,
 At present rotavirus infection is thought to account for the majority of these cases although there is increasing evidence of episodes associated with other agents, e.g. aden

In [27]:
sentences = list(doc.sents)

In [42]:
print(f'START: {sentences[0].start_char} -- END: {sentences[0].end_char}')
print(sentences[0])

START: 0 -- END: 162
In a recent prevalence study of nosocomial infection in the United Kingdom (Meers et al., 1981) , there were surprisingly few reports of virus-associated disease.


In [43]:
sentences[0].text.index(')')

94

In [44]:
print(f'START: {sentences[1].start_char} -- END: {sentences[1].end_char}')
print(sentences[1])

START: 163 -- END: 293
There is, however, clear evidence that enteric viruses may be transmitted to patients in hospitals and other confined communities.


In [45]:
section['cite_spans']

[{'start': 75, 'end': 95, 'text': '(Meers et al., 1981)', 'ref_id': 'BIBREF3'},
 {'start': 335,
  'end': 359,
  'text': '(Nagington et al., 1978)',
  'ref_id': None},
 {'start': 364,
  'end': 388,
  'text': 'Coxsackie (Pether, 1982)',
  'ref_id': None},
 {'start': 678,
  'end': 703,
  'text': '(Krugman and Giles, 1970)',
  'ref_id': None},
 {'start': 996, 'end': 1018, 'text': '(Flewett et al., 1975)', 'ref_id': None},
 {'start': 1040,
  'end': 1064,
  'text': 'Lee and Pickering, 1977)',
  'ref_id': 'BIBREF1'},
 {'start': 1079,
  'end': 1116,
  'text': '(Cubitt, McSwiggan and Arstall, 1980)',
  'ref_id': None},
 {'start': 1133,
  'end': 1163,
  'text': '(Caul, Paver and Clarke, 1975)',
  'ref_id': None}]

In [46]:
sent_ends = [sent.end_char for sent in sentences]
sent_ends

[162, 293, 529, 705, 814, 1165]

In [47]:
cite_ex = section['cite_spans'][2]
cite_ex

{'start': 364, 'end': 388, 'text': 'Coxsackie (Pether, 1982)', 'ref_id': None}

In [51]:
np.searchsorted(sent_ends, cite_ex['end'])

2

In [54]:
np.searchsorted(sent_ends, 529, side='left')

2

In [56]:
sentences[0:1]

[In a recent prevalence study of nosocomial infection in the United Kingdom (Meers et al., 1981) , there were surprisingly few reports of virus-associated disease.]

In [65]:
%time sentences[0].text.replace('(Meers et al., 1981)', '<CITE>')

CPU times: user 125 µs, sys: 2 µs, total: 127 µs
Wall time: 136 µs


'In a recent prevalence study of nosocomial infection in the United Kingdom <CITE> , there were surprisingly few reports of virus-associated disease.'

In [67]:
%time re.sub(re.escape('(Meers et al., 1981)'), '<CITE>', sentences[0].text)

CPU times: user 338 µs, sys: 7 µs, total: 345 µs
Wall time: 354 µs


'In a recent prevalence study of nosocomial infection in the United Kingdom <CITE> , there were surprisingly few reports of virus-associated disease.'

In [71]:
' '.join([s.text for s in sentences[0:2]])

'In a recent prevalence study of nosocomial infection in the United Kingdom (Meers et al., 1981) , there were surprisingly few reports of virus-associated disease. There is, however, clear evidence that enteric viruses may be transmitted to patients in hospitals and other confined communities.'

In [79]:
import csv

In [81]:
with (config.raw / 'cord19_articles.csv').open('w') as fp:
    csvwriter = csv.writer(fp)
    
    csvwriter.writerow(['test1', 'test2', 'test3'])
    csvwriter.writerow(['r1', 'r2', 'r3'])
    csvwriter.writerow(['text with, a comma', 'text with, a semicolon;', 'r3'])

In [75]:
fp = (config.raw / 'cord19_articles.csv').open('w')

In [78]:
fp.write(','.join(['test1', 'test2', 'test3']))
fp.write(','.join(['11', '12', '13']))
fp.write(','.join(['21', '22', '23']))

8

In [82]:
'Hello'.split(';')

['Hello']

In [83]:
'Hello; hello'.split(';')

['Hello', ' hello']

In [84]:
id_

'f94beaebca9e855e0f2f632aefb0c3e44d8fcf54'

In [85]:
%time aws.read_item(subset, id_)

CPU times: user 69.6 ms, sys: 154 ms, total: 223 ms
Wall time: 1.49 s


{'paper_id': 'f94beaebca9e855e0f2f632aefb0c3e44d8fcf54',
 'metadata': {'title': '', 'authors': []},
 'abstract': [],
 'body_text': [{'text': ' Infection (1982) ',
   'cite_spans': [{'start': 1,
     'end': 17,
     'text': 'Infection (1982)',
     'ref_id': None}],
   'ref_spans': [],
   'section': 'Journal of Hospital'},
  {'text': 'In a recent prevalence study of nosocomial infection in the United Kingdom (Meers et al., 1981) , there were surprisingly few reports of virus-associated disease. There is, however, clear evidence that enteric viruses may be transmitted to patients in hospitals and other confined communities. The spread of enteroviruses such as Echo (Nagington et al., 1978) and Coxsackie (Pether, 1982) is known to occur in these environments and in the case of neonates has occasionally resulted in serious illness with significant mortality. Transmission of hepatitis A virus has also been documented as an infection hazard, especially in institutions caring for mentally reta

In [99]:
cord_uid_test = 'test'
sha_test = 'f6af90a5e3b6a85f64c42d66bbdc9ec3ace4b2ae'
article = aws.read_item('custom_license', sha_test)

In [102]:
def get_and_process_article(sha, subset):
    uid = 'test'
    article = aws.read_item(subset, sha)
    return processing_data.get_citation_data(uid, article)

In [144]:
context_citation = get_and_process_article(sha_test, 'custom_license')
context_citation

[('test',
  'equiare obligate intraerythrocytic parasites of equines. They are the causative agents of equine babesioses which are endemic in most tropical and subtropical areas of the world <CITE> Friedhoff, Tenter & Miller, 1990) .',
  'Die Piroplasmen der Equiden-Bedeutung fiir den internationalen Pferdeverkehr',
  [{'first': 'K', 'middle': ['T'], 'last': 'Friedhoff', 'suffix': ''}],
  1982,
  'Berliner und Miinchener Tieriirztliche Wochenschrif'),
 ('test',
  'They are the causative agents of equine babesioses which are endemic in most tropical and subtropical areas of the world (Friedhoff, 1982; Friedhoff, Tenter & Miller, 1990) . Whereas 8. cabalii only invades erythrocytes B. equi is also capable of infecting lymphocytes <CITE> .',
  'Babesia equi (Laveran 1901) 1. Development in horses and in lymphocyte culture',
  [{'first': '', 'middle': [], 'last': 'Schein E', 'suffix': ''},
   {'first': '', 'middle': [], 'last': 'Rehbein G', 'suffix': ''},
   {'first': '', 'middle': ['P'], 

In [147]:
context_citation[2]

('test',
 'PBS with 0.05% Tween 20 OF PBS with 1% BSA and 0.05% Tween 20. After a further set of washes substrate (1 mg ml I of 5-aminosali~ylic acid <CITE> in 0.1 M-sodiumphosphate buffer, pH 6.0 containing 0.1 m&i-EDTA and 6 mM-H&f was dispensed and plates were read.',
 'A simple method for the purification of 5-aminosalicylic acid. Application of the product as substrate in enzyme-linked immunosorbent assay (ELISA)',
 [{'first': '', 'middle': ['J'], 'last': 'Ellens D', 'suffix': ''},
  {'first': '', 'middle': ['L J'], 'last': 'Gielkens A', 'suffix': ''}],
 1980,
 'Journal oflmmunological Methods')

In [120]:
%time get_and_process_article(sha_test, 'custom_license')

CPU times: user 397 ms, sys: 344 ms, total: 741 ms
Wall time: 1.88 s


[('test',
  'equiare obligate intraerythrocytic parasites of equines. They are the causative agents of equine babesioses which are endemic in most tropical and subtropical areas of the world <CITE> Friedhoff, Tenter & Miller, 1990) .',
  ('Die Piroplasmen der Equiden-Bedeutung fiir den internationalen Pferdeverkehr',
   [{'first': 'K', 'middle': ['T'], 'last': 'Friedhoff', 'suffix': ''}],
   1982,
   'Berliner und Miinchener Tieriirztliche Wochenschrif')),
 ('test',
  'They are the causative agents of equine babesioses which are endemic in most tropical and subtropical areas of the world (Friedhoff, 1982; Friedhoff, Tenter & Miller, 1990) . Whereas 8. cabalii only invades erythrocytes B. equi is also capable of infecting lymphocytes <CITE> .',
  ('Babesia equi (Laveran 1901) 1. Development in horses and in lymphocyte culture',
   [{'first': '', 'middle': [], 'last': 'Schein E', 'suffix': ''},
    {'first': '', 'middle': [], 'last': 'Rehbein G', 'suffix': ''},
    {'first': '', 'middle'

In [121]:
%lprun -f get_and_process_article get_and_process_article(sha_test, 'custom_license')
