In [1]:
import json
import requests
import re
import pickle
import gzip
import zlib
from io import BytesIO
import tarfile

from tqdm.notebook import tqdm

import pandas as pd
import numpy as np
import boto3

from seq2cite import config, utils, text, aws

import sys
if str(config.src) not in sys.path:
    sys.path.append(str(config.src))

processing_data = __import__('0_processing_data')

%load_ext autoreload
%autoreload 2
%load_ext line_profiler

Loaded spaCy model en_core_sci_sm


# Exploring CORD-19 Dataset

In [2]:
s3, s3_resource = aws.connect_aws_s3()
cord19_bucket = aws.get_cord19_bucket(s3, s3_resource)

In [3]:
def list_bucket_contents(bucket_resource, match='', size_mb=0):
    total_size_gb = 0
    total_files = 0
    match_size_gb = 0
    match_files = 0
    for key in bucket_resource.objects.all():
        key_size_mb = key.size/1024/1024
        total_size_gb += key_size_mb
        total_files += 1
        list_check = False
        if not match:
            list_check = True
        elif match in key.key:
            list_check = True
        if list_check and not size_mb:
            match_files += 1
            match_size_gb += key_size_mb
            print(f'{key.key} ({key_size_mb:3.0f}MB)')
        elif list_check and key_size_mb <= size_mb:
            match_files += 1
            match_size_gb += key_size_mb
            print(f'{key.key} ({key_sizexmb:3.0f}MB)')

        if key.key.endswith('.json'):
            break
    if match:
        print(f'Matched file size is {match_size_gb/1024:3.1f}GB with {match_files} files')            
    
    print(f'Bucket total size is {total_size_gb/1024:3.1f}GB with {total_files} files')
    
    

In [4]:
list_bucket_contents(cord19_bucket)

2020-03-13/COVID.DATA.LIC.AGMT.pdf (  0MB)
2020-03-13/all_sources_metadata_2020-03-13.csv ( 48MB)
2020-03-13/all_sources_metadata_2020-03-13.readme (  0MB)
2020-03-13/biorxiv_medrxiv.tar.gz ( 13MB)
2020-03-13/comm_use_subset.tar.gz (186MB)
2020-03-13/json_schema.txt (  0MB)
2020-03-13/noncomm_use_subset.tar.gz ( 36MB)
2020-03-13/pmc_custom_license.tar.gz ( 19MB)
2020-03-20/biorxiv_medrxiv.tar.gz ( 13MB)
2020-03-20/changelog (  0MB)
2020-03-20/comm_use_subset.tar.gz (183MB)
2020-03-20/custom_license.tar.gz (344MB)
2020-03-20/metadata.csv ( 60MB)
2020-03-20/noncomm_use_subset.tar.gz ( 40MB)
2020-03-27/biorxiv_medrxiv.tar.gz ( 15MB)
2020-03-27/changelog (  0MB)
2020-03-27/comm_use_subset.tar.gz (186MB)
2020-03-27/custom_license.tar.gz (414MB)
2020-03-27/metadata.csv ( 66MB)
2020-03-27/metadata_with_mag_mapping.csv ( 68MB)
2020-03-27/noncomm_use_subset.tar.gz ( 40MB)
2020-04-03/biorxiv_medrxiv.tar.gz ( 18MB)
2020-04-03/changelog (  0MB)
2020-04-03/comm_use_subset.tar.gz (346MB)
2020-04-03/

In [11]:
key = '2020-04-10/noncomm_use_subset.tar.gz'
result = s3.get_object(Bucket=config.cord19_aws_bucket, Key=key)

In [12]:
content = tarfile.open(fileobj=BytesIO(result['Body'].read()), mode="r:gz")

In [13]:
members = content.getmembers()

In [20]:
members[0].name

'noncomm_use_subset/pdf_json/c1ad13d83e926979dbf2bbe52e4944082f28dfea.json'

In [18]:
json.load(content.extractfile(members[0]))

{'paper_id': 'c1ad13d83e926979dbf2bbe52e4944082f28dfea',
 'metadata': {'title': 'Antisense-induced ribosomal frameshifting',
  'authors': [{'first': 'Clark',
    'middle': ['M'],
    'last': 'Henderson',
    'suffix': '',
    'affiliation': {'laboratory': '',
     'institution': 'University of Utah',
     'location': {'addrLine': '15 N 2030 E',
      'postCode': '7410, 84112-5330',
      'settlement': 'Room, Salt Lake City',
      'region': 'UT',
      'country': 'USA'}},
    'email': ''},
   {'first': 'Christine',
    'middle': ['B'],
    'last': 'Anderson',
    'suffix': '',
    'affiliation': {'laboratory': '',
     'institution': 'University of Utah',
     'location': {'addrLine': '15 N 2030 E',
      'postCode': '7410, 84112-5330',
      'settlement': 'Room, Salt Lake City',
      'region': 'UT',
      'country': 'USA'}},
    'email': ''},
   {'first': 'Michael',
    'middle': ['T'],
    'last': 'Howard',
    'suffix': '',
    'affiliation': {'laboratory': '',
     'institution': 

In [None]:
content.extractall()
content.close()

In [183]:
content = gzip.GzipFile(fileobj=BytesIO(result['Body'].read())).read().decode()

In [185]:
content[:500]

'noncomm_use_subset/pdf_json/c1ad13d83e926979dbf2bbe52e4944082f28dfea.json\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x000000644\x000004167\x000000144\x0000000346044\x0013644150710\x00025641\x00 0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00ustar  \x00lucyw\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00users\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x000000000\x000000000\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00

In [None]:
decompressed_data[:100]

In [9]:
def preview_csv_dataset(bucket, key, rows=10):
    data_source = {
            'Bucket': bucket,
            'Key': key
        }
    # Generate the URL to get Key from Bucket
    url = s3.generate_presigned_url(
        ClientMethod = 'get_object',
        Params = data_source
    )

    data = pd.read_csv(url, nrows=rows)
    return data

In [127]:
metadata = pd.read_csv(f's3://{config.cord19_aws_bucket}/2020-04-03/metadata.csv',
                       nrows=1000,
                       skiprows=5000,
                      names=config.metadata_columns,
                      header=0)
metadata

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,pxho2kaw,6267da468794f6a4857adb22464ee39527f0cbfc,Elsevier,The three domains of public health: An interna...,10.1016/j.puhe.2007.05.013,,17889089.0,els-covid,Summary By focusing on the Masters of Public H...,2008-02-29,"Thorpe, Allison; Griffiths, Siân; Jewell, Tony...",Public Health,,,True,False,custom_license,https://doi.org/10.1016/j.puhe.2007.05.013
1,cewpqddk,eef3169f465c7567fd11047efc01653d119e4475,Elsevier,Human rights and other provisions in the revis...,10.1016/j.puhe.2007.08.001,,17900637.0,els-covid,"Summary In May 2005, the World Health Assembly...",2007-11-30,"Plotkin, Bruce",Public Health,,,True,False,custom_license,https://doi.org/10.1016/j.puhe.2007.08.001
2,qn8yifcd,6102dd48ba28756830876fe88d80c8a81bcc802e,Elsevier,Evaluating the effectiveness of an emergency p...,10.1016/j.puhe.2007.08.006,,,els-covid,Summary Background The severe acute respirator...,2008-05-31,"Wang, Chongjian; Wei, Sheng; Xiang, Hao; Xu, Y...",Public Health,,,True,False,custom_license,https://doi.org/10.1016/j.puhe.2007.08.006
3,lf90j7mm,2cac9cc5d4bad991c742162a72ee2b3e354e3ab6,Elsevier,Changing pattern of premature mortality burden...,10.1016/j.puhe.2007.08.017,,18387641.0,els-covid,Summary Background This study was conducted in...,2008-05-31,"Cai, Le; Chongsuvivatwong, Virasakdi; Geater, ...",Public Health,,,True,False,custom_license,https://doi.org/10.1016/j.puhe.2007.08.017
4,elay5gi1,f3f80cb590bac6b1ad0fe73b69a890786a70f4c0,Elsevier,"One country, two systems: Public health in China",10.1016/j.puhe.2008.04.015,,18556031.0,els-covid,"Summary This paper, presented in Lisbon in May...",2008-08-31,"Griffiths, S.",Public Health,,,True,False,custom_license,https://doi.org/10.1016/j.puhe.2008.04.015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,s6hi1lx6,,Elsevier,Allergy is a negative prognostic factor for th...,10.1016/s0091-6749(03)81274-5,,,els-covid,,2003-02-28,"Wagenmann, M.; Rietschel, A.; Ganzer, U.",Journal of Allergy and Clinical Immunology,,,False,False,custom_license,https://doi.org/10.1016/s0091-6749(03)81274-5
996,vejgjs3o,2f739ce158fe0605ca31cd2b83e00abfba4c6823,Elsevier,"Viral infections in relation to age, atopy, an...",10.1016/j.jaci.2004.04.006,,15316497.0,els-covid,Abstract Background Viral respiratory tract in...,2004-08-31,"Heymann, Peter W; Carper, Holliday T; Murphy, ...",Journal of Allergy and Clinical Immunology,,,True,False,custom_license,https://doi.org/10.1016/j.jaci.2004.04.006
997,hj1s3ipp,ee786c268711dc91475fd86bccb7662bb8a0994a,Elsevier,Hereditary and acquired angioedema: Problems a...,10.1016/j.jaci.2004.06.047,,15356535.0,els-covid,"Hereditary angioedema (HAE), a rare but life-t...",2004-09-30,"Agostoni, Angelo; Aygören-Pürsün, Emel; Binkle...",Journal of Allergy and Clinical Immunology,,,True,False,custom_license,https://doi.org/10.1016/j.jaci.2004.06.047
998,w1u79yzd,cdd392633a8420a6a4dcd533949b4de54d0238aa,Elsevier,Rhinosinusitis: Establishing definitions for c...,10.1016/j.jaci.2004.09.029,,,els-covid,Background There is a need for more research o...,2004-12-31,"Meltzer, Eli O.; Hamilos, Daniel L.; Hadley, J...",Journal of Allergy and Clinical Immunology,,,True,False,custom_license,https://doi.org/10.1016/j.jaci.2004.09.029


In [123]:
metadata[~pd.isna(metadata['sha'])]

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
2,tb081mou,499c2b2827c6be9930b2de36e276ac4ffbb68830,Elsevier,Topical pharmacotherapy for allergic rhinitis:...,10.1016/0196-0709(93)90110-s,,8285306.0,els-covid,Abstract Nedocromil sodium is a mast-cell stab...,1993-12-31,"Mabry, Richard L.",American Journal of Otolaryngology,,,True,False,custom_license,https://doi.org/10.1016/0196-0709(93)90110-s
3,9s77hwnt,ca105be65dffffd6e0e5b402d45d1e6394161a8d,Elsevier,Diarrheal disease and DRGs,10.1016/0196-4399(86)90096-6,,,els-covid,Abstract This article has presented a rational...,1986-01-01,"Gilligan, Peter H.",Clinical Microbiology Newsletter,,,True,False,custom_license,https://doi.org/10.1016/0196-4399(86)90096-6
4,eg7r71kp,ae27d6930b6a1d2360d8dea0e38bc74e0903a001,Elsevier,The role of the laboratory in the etiologic di...,10.1016/0196-4399(89)90046-9,,,els-covid,,1989-07-15,"Doern, Gary V.",Clinical Microbiology Newsletter,,,True,False,custom_license,https://doi.org/10.1016/0196-4399(89)90046-9
5,hxxuxlek,58df62e6df26193cbd507ae0d37a03ed1181fe57,Elsevier,CAP workload recording,10.1016/0196-4399(89)90064-0,,,els-covid,,1989-05-01,"Bartlett, Raymond C.",Clinical Microbiology Newsletter,,,True,False,custom_license,https://doi.org/10.1016/0196-4399(89)90064-0
7,vetrb61q,e8519f0bc04b01f237d4093cbb7e82ebceb0f8eb,Elsevier,Novel viruses associated with gastroenteritis,10.1016/0196-4399(91)90041-s,,,els-covid,,1991-09-15,"Gray, Larry D.",Clinical Microbiology Newsletter,,,True,False,custom_license,https://doi.org/10.1016/0196-4399(91)90041-s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,64lztahe,f05b4d191041779559a88438cd08021459033f61,Elsevier,Viral Pathogens of Domestic Animals and Their ...,10.1016/b978-012373944-5.00368-0,,,els-covid,Infectious diseases of farm animals are one of...,2009-12-31,"Murcia, P.; Donachie, W.; Palmarini, M.",Encyclopedia of Microbiology,,,True,False,custom_license,https://doi.org/10.1016/b978-012373944-5.00368-0
996,8axyyj0y,96ce56dd19be7aaf826cd858c824f087f4b99614,Elsevier,Anthropological Perspectives on the Health Tra...,10.1016/b978-012373960-5.00150-7,,,els-covid,The health transition is used to refer to the ...,2008-12-31,"Manderson, L.",International Encyclopedia of Public Health,,,True,False,custom_license,https://doi.org/10.1016/b978-012373960-5.00150-7
997,f4x85a3g,4815158fd87ae667da39dcab37edd1fc6c3b7d43,Elsevier,The Demand for Health Care,10.1016/b978-012373960-5.00164-7,,,els-covid,Health-care demand analysis provides informati...,2008-12-31,"Mwabu, G.",International Encyclopedia of Public Health,,,True,False,custom_license,https://doi.org/10.1016/b978-012373960-5.00164-7
998,nhoyomp2,cb699fab9917aacdcd501f0b4d60f1517ed0c692,Elsevier,Epidemic Investigation,10.1016/b978-012373960-5.00183-0,,,els-covid,One of the key roles of public health is to ma...,2008-12-31,"Mathieu, E.; Sodahlon, Y.",International Encyclopedia of Public Health,,,True,False,custom_license,https://doi.org/10.1016/b978-012373960-5.00183-0


In [15]:
id_ = 'f6af90a5e3b6a85f64c42d66bbdc9ec3ace4b2ae'
subset = 'custom_license'
article = aws.read_item(subset, id_, s3=s3)
article

{'paper_id': 'f6af90a5e3b6a85f64c42d66bbdc9ec3ace4b2ae',
 'metadata': {'title': 'DIAGNOSIS OF BABESIA CABALLI INFECTIONS IN HORSES BY ENZYME-LINKED IMMUNOSORBENT ASSAY (ELISA) AND WESTERN BLOT',
  'authors': [{'first': 'Reinhard',
    'middle': [],
    'last': 'Bose',
    'suffix': '',
    'affiliation': {},
    'email': ''},
   {'first': 'Berit',
    'middle': [],
    'last': 'Peymann',
    'suffix': '',
    'affiliation': {},
    'email': ''}]},
 'abstract': [{'text': 'R. and PEYMANN 3. 1994. Diagnosis of &&sia caba& infections in horses by enzymelinked immunosorbent assay (ELBA) and Western blot. inrernntionalJaurnulfnr Parasitology 24: 341-346. From Bubesia cabal/i in vitro cultures a preparation of 100% infected erythrocytes was obtained. From this, B. cabutli antigens were extracted with the detergent 3-[(3-CholamidopropyI)-dimethylammonio]-lpropane-sulfonate (CHAPS) and used as ELISA antigens. A control antigen of normal erythrocytes from the same donor horse was prepared in an 

In [87]:
article['metadata']['authors'][0]['first'][:1]

'R'

In [16]:
import spacy
import scispacy

In [17]:
nlp = spacy.load("en_core_sci_sm", disable=["ner"])

In [73]:
nlp.vocab.strings["Western"]

14425861432571778490

In [40]:
section = article['body_text'][1]
doc = nlp(section['text'])

In [23]:
tokens = list(doc)

In [49]:
print(tokens)

[Bubesiu, caballi, and, 3, ., equiare, obligate, intraerythrocytic, parasites, of, equines, ., They, are, the, causative, agents, of, equine, babesioses, which, are, endemic, in, most, tropical, and, subtropical, areas, of, the, world, (, Friedhoff, ,, 1982, ;, Friedhoff, ,, Tenter, &, Miller, ,, 1990, ), ., Whereas, 8, ., cabalii, only, invades, erythrocytes, B., equi, is, also, capable, of, infecting, lymphocytes, (, Schein, ,, Rehbein, ,, Voigt, &, Zweygarth, ,, 1981, ), ., Both, parasites, are, transmitted, by, tick, vectors, with, almost, worldwide, distribution, (, Fricdhoff, ,, 1982, ), ., Consequently, ,, it, is, important, to, prevent, the, introduction, of, carrier, animals, into, nonendemic, areas, ,, particuiariy, where, the, diseases, could, be, spread, by, vector, ticks, ., Horses, to, be, exported, into, the, U.S.A., ,, Japan, ,, Australia, or, other, countries, have, to, be, tested, negative, for, babesioses, by, the, camplement, fixation, test, (, CFT, ), or, the, immu

In [24]:
section['cite_spans']

[{'start': 201, 'end': 218, 'text': '(Friedhoff, 1982;', 'ref_id': 'BIBREF5'},
 {'start': 219,
  'end': 252,
  'text': 'Friedhoff, Tenter & Miller, 1990)',
  'ref_id': None},
 {'start': 349,
  'end': 391,
  'text': '(Schein, Rehbein, Voigt & Zweygarth, 1981)',
  'ref_id': 'BIBREF10'},
 {'start': 476, 'end': 493, 'text': '(Fricdhoff, 1982)', 'ref_id': None},
 {'start': 855, 'end': 861, 'text': '(IFAT)', 'ref_id': None},
 {'start': 927, 'end': 942, 'text': '(Weiland, 1985;', 'ref_id': None},
 {'start': 943, 'end': 943, 'text': '', 'ref_id': None},
 {'start': 951, 'end': 968, 'text': 'L Friedhoff, 1986', 'ref_id': None},
 {'start': 1325, 'end': 1346, 'text': '(Base & Daemen, 1992)', 'ref_id': None}]

In [27]:
citation = doc.char_span(349, 391)
print(citation)

(Schein, Rehbein, Voigt & Zweygarth, 1981)


In [30]:
sent_len = len(section['text'])
doc_len = len(' '.join(token.orth_ for token in doc))
print(sent_len, doc_len)

1699 1745


In [34]:
token = tokens[0]

In [44]:
token.idx

0

In [74]:
token.lemma

11248986204349196901

In [75]:
token.lemma_

'bubesiu'

In [85]:
nlp.vocab.strings['<CITE>']

13125041344321955426

In [115]:
idx_start_cite, idx_end_cite = 349, 391
# Get index of first and last token
idx_start_t, idx_end_t = 0, len(tokens)
for t in tokens:
    if t.idx == idx_start_cite:
        idx_start_t = t.i
    if t.idx + len(t) == idx_end_cite:
        idx_end_t = t.i

In [116]:
idx_start_t

61

In [117]:
idx_end_t

71

In [118]:
t.idx

1698

In [119]:
len(t)

1

In [120]:
len(tokens)

312

In [62]:
idx_start_token

61

In [63]:
idx_end_token

72

In [64]:
tokens[61]

(

In [65]:
tokens[71]

)

In [69]:
tokens[idx_start_token-15:idx_start_token]

[Whereas,
 8,
 .,
 cabalii,
 only,
 invades,
 erythrocytes,
 B.,
 equi,
 is,
 also,
 capable,
 of,
 infecting,
 lymphocytes]

In [70]:
len(doc)

312

In [67]:
tokens[idx_end_token:idx_end_token+15]

[.,
 Both,
 parasites,
 are,
 transmitted,
 by,
 tick,
 vectors,
 with,
 almost,
 worldwide,
 distribution,
 (,
 Fricdhoff,
 ,]

In [97]:
list(doc[101:121])[0].idx

565

In [42]:
print(f'START: {sentences[0].start_char} -- END: {sentences[0].end_char}')
print(sentences[0])

START: 0 -- END: 162
In a recent prevalence study of nosocomial infection in the United Kingdom (Meers et al., 1981) , there were surprisingly few reports of virus-associated disease.


In [43]:
sentences[0].text.index(')')

94

In [44]:
print(f'START: {sentences[1].start_char} -- END: {sentences[1].end_char}')
print(sentences[1])

START: 163 -- END: 293
There is, however, clear evidence that enteric viruses may be transmitted to patients in hospitals and other confined communities.


In [45]:
section['cite_spans']

[{'start': 75, 'end': 95, 'text': '(Meers et al., 1981)', 'ref_id': 'BIBREF3'},
 {'start': 335,
  'end': 359,
  'text': '(Nagington et al., 1978)',
  'ref_id': None},
 {'start': 364,
  'end': 388,
  'text': 'Coxsackie (Pether, 1982)',
  'ref_id': None},
 {'start': 678,
  'end': 703,
  'text': '(Krugman and Giles, 1970)',
  'ref_id': None},
 {'start': 996, 'end': 1018, 'text': '(Flewett et al., 1975)', 'ref_id': None},
 {'start': 1040,
  'end': 1064,
  'text': 'Lee and Pickering, 1977)',
  'ref_id': 'BIBREF1'},
 {'start': 1079,
  'end': 1116,
  'text': '(Cubitt, McSwiggan and Arstall, 1980)',
  'ref_id': None},
 {'start': 1133,
  'end': 1163,
  'text': '(Caul, Paver and Clarke, 1975)',
  'ref_id': None}]

In [46]:
sent_ends = [sent.end_char for sent in sentences]
sent_ends

[162, 293, 529, 705, 814, 1165]

In [47]:
cite_ex = section['cite_spans'][2]
cite_ex

{'start': 364, 'end': 388, 'text': 'Coxsackie (Pether, 1982)', 'ref_id': None}

In [51]:
np.searchsorted(sent_ends, cite_ex['end'])

2

In [54]:
np.searchsorted(sent_ends, 529, side='left')

2

In [56]:
sentences[0:1]

[In a recent prevalence study of nosocomial infection in the United Kingdom (Meers et al., 1981) , there were surprisingly few reports of virus-associated disease.]

In [65]:
%time sentences[0].text.replace('(Meers et al., 1981)', '<CITE>')

CPU times: user 125 µs, sys: 2 µs, total: 127 µs
Wall time: 136 µs


'In a recent prevalence study of nosocomial infection in the United Kingdom <CITE> , there were surprisingly few reports of virus-associated disease.'

In [67]:
%time re.sub(re.escape('(Meers et al., 1981)'), '<CITE>', sentences[0].text)

CPU times: user 338 µs, sys: 7 µs, total: 345 µs
Wall time: 354 µs


'In a recent prevalence study of nosocomial infection in the United Kingdom <CITE> , there were surprisingly few reports of virus-associated disease.'

In [71]:
' '.join([s.text for s in sentences[0:2]])

'In a recent prevalence study of nosocomial infection in the United Kingdom (Meers et al., 1981) , there were surprisingly few reports of virus-associated disease. There is, however, clear evidence that enteric viruses may be transmitted to patients in hospitals and other confined communities.'

In [79]:
import csv

In [81]:
with (config.raw / 'cord19_articles.csv').open('w') as fp:
    csvwriter = csv.writer(fp)
    
    csvwriter.writerow(['test1', 'test2', 'test3'])
    csvwriter.writerow(['r1', 'r2', 'r3'])
    csvwriter.writerow(['text with, a comma', 'text with, a semicolon;', 'r3'])

In [75]:
fp = (config.raw / 'cord19_articles.csv').open('w')

In [78]:
fp.write(','.join(['test1', 'test2', 'test3']))
fp.write(','.join(['11', '12', '13']))
fp.write(','.join(['21', '22', '23']))

8

In [82]:
'Hello'.split(';')

['Hello']

In [83]:
'Hello; hello'.split(';')

['Hello', ' hello']

In [84]:
id_

'f94beaebca9e855e0f2f632aefb0c3e44d8fcf54'

In [85]:
%time aws.read_item(subset, id_)

CPU times: user 69.6 ms, sys: 154 ms, total: 223 ms
Wall time: 1.49 s


{'paper_id': 'f94beaebca9e855e0f2f632aefb0c3e44d8fcf54',
 'metadata': {'title': '', 'authors': []},
 'abstract': [],
 'body_text': [{'text': ' Infection (1982) ',
   'cite_spans': [{'start': 1,
     'end': 17,
     'text': 'Infection (1982)',
     'ref_id': None}],
   'ref_spans': [],
   'section': 'Journal of Hospital'},
  {'text': 'In a recent prevalence study of nosocomial infection in the United Kingdom (Meers et al., 1981) , there were surprisingly few reports of virus-associated disease. There is, however, clear evidence that enteric viruses may be transmitted to patients in hospitals and other confined communities. The spread of enteroviruses such as Echo (Nagington et al., 1978) and Coxsackie (Pether, 1982) is known to occur in these environments and in the case of neonates has occasionally resulted in serious illness with significant mortality. Transmission of hepatitis A virus has also been documented as an infection hazard, especially in institutions caring for mentally reta

In [103]:
cord_uid_test = 'test'
sha_test = 'f6af90a5e3b6a85f64c42d66bbdc9ec3ace4b2ae'
article = aws.read_item('custom_license', sha_test)
auth_idxs = processing_data.get_author_idxs(article['metadata']['authors'])

In [105]:
def get_and_process_article(sha, subset):
    uid = 'test'
    article = aws.read_item(subset, sha)
    return processing_data.get_citation_data(uid, article, auth_idxs)

In [106]:
context_citation = get_and_process_article(sha_test, 'custom_license')
context_citation

[('test__BIBREF5',
  [110,
   113,
   120,
   131,
   137,
   141,
   149,
   152,
   157,
   166,
   170,
   182,
   188,
   191,
   195,
   13125041344321955426,
   219,
   228,
   230,
   237,
   239,
   245,
   247,
   251,
   253,
   255,
   263,
   264,
   266,
   274,
   279],
  [2],
  [0, 1],
  [16807736786308693906,
   8804436317604978073,
   9250722957692387333,
   13086200862373010502,
   13667677303918263717,
   16957618830852774490,
   3948419016771548908,
   13354882772425452173]),
 ('test__BIBREF10',
  [255,
   263,
   264,
   266,
   274,
   279,
   287,
   300,
   303,
   308,
   311,
   316,
   324,
   327,
   337,
   13125041344321955426,
   392,
   394,
   399,
   409,
   413,
   425,
   428,
   433,
   441,
   446,
   453,
   463,
   476,
   477,
   486],
  [3, 4, 5, 6],
  [0, 1],
  [5372888352104307880,
   14704457266878097861,
   12638816674900267446,
   1091070107117497771,
   2158564215700098950,
   3842344029291005339,
   5533571732986600803,
   12646065887601

In [107]:
context_citation[2]

('test__BIBREF4',
 [1403,
  1411,
  1415,
  1418,
  1425,
  1435,
  1436,
  1438,
  1441,
  1444,
  1446,
  1449,
  1460,
  1461,
  1466,
  13125041344321955426,
  1497,
  1500,
  1504,
  1522,
  1528,
  1530,
  1533,
  1537,
  1548,
  1552,
  1561,
  1565,
  1567,
  1574,
  1578],
 [7, 8],
 [0, 1],
 [14862748245026736845,
  14749329488166600143,
  3449386869681536401,
  16037325823156266367,
  7425985699627899538,
  15249873481306576041,
  886050111519832510,
  10971851268798804441,
  7562692243244894168,
  12646065887601541794,
  10647088555044889902,
  886050111519832510,
  7425985699627899538,
  2104994216896503478,
  11901859001352538922,
  5455184181422288602,
  3002984154512732771,
  9026182056785675098,
  13612717912386585570,
  716093402860442228,
  12638816674900267446,
  1786564265694385440,
  3842344029291005339])

In [121]:
%time get_and_process_article(sha_test, 'custom_license')

CPU times: user 96 ms, sys: 125 ms, total: 221 ms
Wall time: 1.28 s


[('test__BIBREF5',
  [110,
   113,
   120,
   131,
   137,
   141,
   149,
   152,
   157,
   166,
   170,
   182,
   188,
   191,
   195,
   13125041344321955426,
   217,
   219,
   228,
   230,
   237,
   239,
   245,
   247,
   251,
   253,
   255,
   263,
   264,
   266,
   274],
  [0],
  [0, 1],
  [16807736786308693906,
   8804436317604978073,
   9250722957692387333,
   13086200862373010502,
   13667677303918263717,
   16957618830852774490,
   3948419016771548908,
   13354882772425452173]),
 ('test__BIBREF10',
  [255,
   263,
   264,
   266,
   274,
   279,
   287,
   300,
   303,
   308,
   311,
   316,
   324,
   327,
   337,
   13125041344321955426,
   390,
   392,
   394,
   399,
   409,
   413,
   425,
   428,
   433,
   441,
   446,
   453,
   463,
   476,
   477],
  [1, 2, 3, 4],
  [0, 1],
  [5372888352104307880,
   14704457266878097861,
   12638816674900267446,
   1091070107117497771,
   2158564215700098950,
   3842344029291005339,
   5533571732986600803,
   12646065887601

In [114]:
%lprun -f processing_data.get_citation_data get_and_process_article(sha_test, 'custom_license')


In [131]:
config.raw

PosixPath('/Users/benjaminlevy/Desktop/Benjamin/School/19_20/Classes/spring/mit6.864/project/project/seq2cite/data/raw')

In [157]:
with (config.raw / 'cord19_author_vocab.pickle').open('rb') as f:
    bts = pickle.load(f)

EOFError: Ran out of input

In [156]:
bts

b''

In [144]:
import multiprocessing as mp

In [145]:
dct = mp.Manager().dict()

In [146]:
dct

<DictProxy object, typeid 'dict' at 0x105f5aed0>

In [148]:
dct['a'] = 1
dct['b'] = 43

In [149]:
dct

<DictProxy object, typeid 'dict' at 0x105f5aed0>

In [153]:
dct.items()

[('a', 1), ('b', 43)]