In [1]:
import pandas as pd

In [3]:
# df = pd.read_csv('metadata.csv')

In [4]:
import os
import json
from pprint import pprint
from copy import deepcopy

import numpy as np
import pandas as pd
from tqdm import tqdm

In [5]:
def format_name(author):
    """Build an author's display name from a parsed author record.

    Args:
        author: Mapping with the keys ``'first'`` and ``'last'`` (strings)
            and ``'middle'`` (a list of strings, possibly empty).

    Returns:
        The first, middle and last name parts joined by single spaces.
        Empty parts are skipped, so a record whose first or last name is an
        empty string does not yield leading, trailing or doubled spaces.
    """
    parts = [author['first'], *author['middle'], author['last']]
    # Joining empty components would leave stray spaces, e.g. "Alian " for a
    # record whose last name is "".
    return " ".join(part for part in parts if part)


def format_affiliation(affiliation):
    """Render an affiliation record as one comma-separated string.

    Args:
        affiliation: Mapping that may contain ``'institution'`` (a string)
            and ``'location'`` (a mapping whose values are strings).

    Returns:
        The institution, when present and non-empty, followed by every value
        of the location mapping in its iteration order, joined by ``", "``.
        An empty string when neither piece of information is available.
    """
    location = affiliation.get('location')
    parts = list(location.values()) if location else []

    institution = affiliation.get('institution')
    if institution:
        # The institution always leads the rendered affiliation.
        parts.insert(0, institution)

    return ", ".join(parts)

def format_authors(authors, with_affiliation=False):
    """Render a list of author records as one comma-separated string.

    Args:
        authors: Iterable of author records accepted by ``format_name``.
            When ``with_affiliation`` is true, each record must also have an
            ``'affiliation'`` entry accepted by ``format_affiliation``.
        with_affiliation: If true, append each author's affiliation in
            parentheses whenever it renders to a non-empty string.

    Returns:
        The rendered authors joined by ``", "``.
    """
    def render(author):
        name = format_name(author)
        if not with_affiliation:
            return name
        affiliation = format_affiliation(author['affiliation'])
        # Authors without usable affiliation data are shown by name only.
        return f"{name} ({affiliation})" if affiliation else name

    return ", ".join(render(author) for author in authors)

def format_body(body_text):
    """Flatten a list of text entries into one string grouped by section.

    Args:
        body_text: Iterable of mappings, each with a ``'section'`` heading
            and a ``'text'`` string.

    Returns:
        For every distinct section, in order of first appearance: the
        heading, a blank line, the section's texts, and another blank line.
        Texts that share a heading are concatenated directly, without any
        separator between them. An empty input yields an empty string.
    """
    # Collect the pieces in lists and join once; repeated ``+=`` on strings
    # stored in a dict copies the accumulated text on every append.
    sections = {}
    for entry in body_text:
        sections.setdefault(entry['section'], []).append(entry['text'])

    pieces = []
    for section, chunks in sections.items():
        pieces.extend((section, "\n\n", "".join(chunks), "\n\n"))

    return "".join(pieces)

def format_bib(bibs):
    """Render bibliography entries as one semicolon-separated string.

    Args:
        bibs: Either a list of bibliography entries or a dict whose values
            are such entries. Each entry needs the keys ``'title'``,
            ``'authors'``, ``'venue'`` and ``'year'``.

    Returns:
        One ``"title, authors, venue, year"`` item per entry, joined by
        ``"; "``. Authors are rendered by ``format_authors`` without
        affiliations; every field is converted with ``str``.
    """
    if isinstance(bibs, dict):
        bibs = bibs.values()

    entries = []
    for bib in bibs:
        # Keep the rendered authors in a local variable instead of writing
        # them back into the entry: the input stays untouched without the
        # cost of deep-copying the whole bibliography first.
        authors = format_authors(bib['authors'], with_affiliation=False)
        fields = (bib['title'], authors, bib['venue'], bib['year'])
        entries.append(", ".join(str(field) for field in fields))

    return "; ".join(entries)

In [6]:
def load_files(dirname):
    """Parse every file found directly inside ``dirname`` as JSON.

    Args:
        dirname: Path of the directory to read. A trailing path separator is
            optional because paths are built with ``os.path.join``.

    Returns:
        A list with the decoded content of each file, in the order returned
        by ``os.listdir``.
    """
    raw_files = []

    for filename in tqdm(os.listdir(dirname)):
        # The context manager closes each handle as soon as it is parsed
        # rather than leaving thousands of files open until garbage
        # collection.
        with open(os.path.join(dirname, filename), 'rb') as fp:
            raw_files.append(json.load(fp))

    return raw_files

def generate_clean_df(all_files):
    """Build a table of formatted and raw fields from parsed article files.

    Args:
        all_files: Iterable of article dicts providing ``'paper_id'``,
            ``'metadata'`` (with ``'title'`` and ``'authors'``),
            ``'abstract'``, ``'body_text'`` and ``'bib_entries'``.

    Returns:
        A ``pandas.DataFrame`` with one row per article and the columns
        ``paper_id``, ``title``, ``authors``, ``affiliations``, ``abstract``,
        ``text``, ``bibliography``, ``raw_authors`` and ``raw_bibliography``.
    """
    col_names = ['paper_id', 'title', 'authors',
                 'affiliations', 'abstract', 'text',
                 'bibliography', 'raw_authors', 'raw_bibliography']

    rows = []
    for paper in tqdm(all_files):
        metadata = paper['metadata']
        authors = metadata['authors']
        bib_entries = paper['bib_entries']

        # The order of these values must match ``col_names`` above.
        rows.append([
            paper['paper_id'],
            metadata['title'],
            format_authors(authors),
            format_authors(authors, with_affiliation=True),
            format_body(paper['abstract']),
            format_body(paper['body_text']),
            format_bib(bib_entries),
            authors,
            bib_entries,
        ])

    return pd.DataFrame(rows, columns=col_names)

In [7]:
# Directory with the bioRxiv/medRxiv article files (relative path, ending in a
# slash).
biorxiv_dir = 'biorxiv_medrxiv/biorxiv_medrxiv/'
# os.listdir returns bare entry names, not full paths.
filenames = os.listdir(biorxiv_dir)
print("Number of articles retrieved from biorxiv:", len(filenames))

Number of articles retrieved from biorxiv: 885


In [8]:
# Parse every listed bioRxiv/medRxiv file into a Python object.
all_files = []

for filename in filenames:
    # Build the path with os.path.join and let the context manager close each
    # handle right after parsing instead of leaving it open.
    with open(os.path.join(biorxiv_dir, filename), 'rb') as fp:
        all_files.append(json.load(fp))

In [9]:
# Use the first loaded article as the worked example for the following cells.
file = all_files[0]
print("Dictionary keys:", file.keys())

Dictionary keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


In [10]:
# Inspect the raw 'abstract' field of the example article.
pprint(file['abstract'])

[{'cite_spans': [],
  'ref_spans': [],
  'section': 'Abstract',
  'text': 'New anti-AIDS treatments must be continually developed in order to '
          'overcome resistance mutations including those emerging in the '
          'newest therapeutic target, the viral integrase (IN). '
          'Multimerization of IN is functionally imperative and provides a '
          'forthcoming therapeutic target. Allosteric inhibitors of IN bind to '
          'non-catalytic sites and prevent correct multimerization not only '
          'restricting viral integration but also the assembly and maturation '
          'of viral particles. Here, we report an allosteric inhibitor peptide '
          'targeting an unexploited SH3-docking platform of retroviral IN. The '
          'crystal structure of the peptide in complex with the HIV-1 IN core '
          'domain reveals a steric interference that would inhibit conserved '
          'docking of SH3-containing domain with the core domain vital for IN 

In [11]:
# Check the container type, the number of entries and the keys of the first
# entry of the example article's body text.
print("body_text type:", type(file['body_text']))
print("body_text length:", len(file['body_text']))
print("body_text keys:", file['body_text'][0].keys())

body_text type: <class 'list'>
body_text length: 22
body_text keys: dict_keys(['text', 'cite_spans', 'ref_spans', 'section'])


In [12]:
# Show the first two body entries; depth=3 makes pprint abbreviate anything
# nested more deeply as {...}.
print("body_text content:")
pprint(file['body_text'][:2], depth=3)

body_text content:
[{'cite_spans': [{...},
                 {...},
                 {...},
                 {...},
                 {...},
                 {...},
                 {...},
                 {...},
                 {...},
                 {...},
                 {...},
                 {...},
                 {...},
                 {...},
                 {...},
                 {...}],
  'ref_spans': [],
  'section': '',
  'text': 'In the absence of a curative treatment, the highly active '
          'antiretroviral therapy (HAART) keeps the HIV-1 virus of AIDS '
          'patients under control. HAART combines drugs targeting different '
          'stages of viral replication including the integration step '
          'catalyzed by the integrase protein (IN) (1) . Integration of viral '
          'DNA into host genome involves two steps catalyzed by IN: (i) '
          "cleavage of a dinucleotide from each 3'-end of the viral DNA "
          "(3'processing), and (ii) i

In [13]:
# Pair each body entry's section heading with its text.
texts = [(di['section'], di['text']) for di in file['body_text']]
# One empty accumulator string per distinct section heading.
texts_di = {di['section']: "" for di in file['body_text']}
# Append every entry's text to its section (no separator is inserted).
for section, text in texts:
    texts_di[section] += text

# List the distinct section headings that were found.
pprint(list(texts_di.keys()))

['',
 'Fab derived peptide inhibits IN multimerization and strand-transfer activity',
 'The peptide targets two SH3-docking sites at the CCD/CTD interfaces',
 'SH3-docking platform-2 offers an ideal spot for allosteric targeting',
 'Implications for the CTD interdomain interactions',
 'DISCUSSION',
 'EXPERIMENTAL PROCEDURES Preparation of HIV-1 IN Constructs:',
 'Protein Expression and Purification:',
 'In Vitro Fluorescence IN Activity Assays',
 'Structural Analysis:',
 'Accession numbers']


In [14]:
# Assemble the article body: for each section (in the dictionary's iteration
# order) the heading, a blank line, the section text and another blank line.
body = ""

for section, text in texts_di.items():
    body += section
    body += "\n\n"
    body += text
    body += "\n\n"

# Preview only the first 3000 characters.
print(body[:3000])



In the absence of a curative treatment, the highly active antiretroviral therapy (HAART) keeps the HIV-1 virus of AIDS patients under control. HAART combines drugs targeting different stages of viral replication including the integration step catalyzed by the integrase protein (IN) (1) . Integration of viral DNA into host genome involves two steps catalyzed by IN: (i) cleavage of a dinucleotide from each 3'-end of the viral DNA (3'processing), and (ii) insertion of this processed viral DNA into the host DNA (strand-transfer) (2) . Clinical IN strand transfer inhibitors (INSTIs) target the catalytic site of the enzyme to specifically inhibit the DNA joining reaction, however, as with all anti-AIDS treatments, the continued success of these drugs is persistently disrupted by resistance mutations (1, 2) . Although 3'-processing can be carried out by monomeric IN (3) , the assembly of IN functional multimers is imperative for the strand-transfer activity (4) (5) (6) (7) (8) , and for vir

In [15]:
# Preview the first 3000 characters of format_body's output for the example
# article.
print(format_body(file['body_text'])[:3000])



In the absence of a curative treatment, the highly active antiretroviral therapy (HAART) keeps the HIV-1 virus of AIDS patients under control. HAART combines drugs targeting different stages of viral replication including the integration step catalyzed by the integrase protein (IN) (1) . Integration of viral DNA into host genome involves two steps catalyzed by IN: (i) cleavage of a dinucleotide from each 3'-end of the viral DNA (3'processing), and (ii) insertion of this processed viral DNA into the host DNA (strand-transfer) (2) . Clinical IN strand transfer inhibitors (INSTIs) target the catalytic site of the enzyme to specifically inhibit the DNA joining reaction, however, as with all anti-AIDS treatments, the continued success of these drugs is persistently disrupted by resistance mutations (1, 2) . Although 3'-processing can be carried out by monomeric IN (3) , the assembly of IN functional multimers is imperative for the strand-transfer activity (4) (5) (6) (7) (8) , and for vir

In [16]:
# List the metadata fields available for the first article.
print(all_files[0]['metadata'].keys())

dict_keys(['title', 'authors'])


In [17]:
# Show the title of the first article.
print(all_files[0]['metadata']['title'])

Multimerization of HIV-1 integrase hinges on conserved SH3-docking platforms


In [18]:
# Inspect the raw author records of the first article (at most three shown).
authors = all_files[0]['metadata']['authors']
pprint(authors[:3])

[{'affiliation': {'institution': 'Technion -Israel Institute of Technology',
                  'laboratory': '',
                  'location': {'country': 'Israel., Israel',
                               'postCode': '320003, 320003',
                               'settlement': 'Haifa, Haifa'}},
  'email': '',
  'first': 'Meytal',
  'last': 'Galilee',
  'middle': [],
  'suffix': ''},
 {'affiliation': {'institution': 'Technion -Israel Institute of Technology',
                  'laboratory': '',
                  'location': {'country': 'Israel., Israel',
                               'postCode': '320003, 320003',
                               'settlement': 'Haifa, Haifa'}},
  'email': 'alian@technion.ac.il',
  'first': 'Akram',
  'last': 'Alian',
  'middle': [],
  'suffix': ''}]


In [19]:
# Render every author's name and affiliation with the formatting helpers,
# separating authors with an empty line.
for author in authors:
    print("Name:", format_name(author))
    print("Affiliation:", format_affiliation(author['affiliation']))
    print()

Name: Meytal Galilee
Affiliation: Technion -Israel Institute of Technology, 320003, 320003, Haifa, Haifa, Israel., Israel

Name: Akram Alian
Affiliation: Technion -Israel Institute of Technology, 320003, 320003, Haifa, Haifa, Israel., Israel



In [20]:
# Look at the metadata of the article at index 4; depth=4 abbreviates deeper
# nesting as {...}.
pprint(all_files[4]['metadata'], depth=4)

{'authors': [{'affiliation': {'institution': 'University of Basel',
                              'laboratory': '',
                              'location': {...}},
              'email': '',
              'first': 'Richard',
              'last': 'Neher',
              'middle': ['A'],
              'suffix': ''},
             {'affiliation': {'institution': 'Karolinska University Hospital',
                              'laboratory': '',
                              'location': {...}},
              'email': '',
              'first': 'Robert',
              'last': 'Dyrdak',
              'middle': [],
              'suffix': ''},
             {'affiliation': {'institution': 'University of Basel',
                              'laboratory': '',
                              'location': {...}},
              'email': '',
              'first': 'Valentin',
              'last': 'Druelle',
              'middle': [],
              'suffix': ''},
             {'affiliation': {'institu

In [21]:
# Compare both author renderings on the article at index 4.
authors = all_files[4]['metadata']['authors']
print("Formatting without affiliation:")
print(format_authors(authors, with_affiliation=False))
print("\nFormatting with affiliation:")
print(format_authors(authors, with_affiliation=True))

Formatting without affiliation:
Richard A Neher, Robert Dyrdak, Valentin Druelle, Emma B Hodcroft, Jan Albert

Formatting with affiliation:
Richard A Neher (University of Basel, Basel, Switzerland), Robert Dyrdak (Karolinska University Hospital, Stockholm, Sweden), Valentin Druelle (University of Basel, Basel, Switzerland), Emma B Hodcroft (University of Basel, Basel, Switzerland), Jan Albert (Karolinska University Hospital, Stockholm, Sweden)


In [22]:
# Collect the values of `file`'s bib_entries into a list and preview the first
# two; depth=4 abbreviates deeper nesting as [...].
bibs = list(file['bib_entries'].values())
pprint(bibs[:2], depth=4)

[{'authors': [{'first': 'K', 'last': 'Anstett', 'middle': [], 'suffix': ''},
              {'first': 'B', 'last': 'Brenner', 'middle': [], 'suffix': ''},
              {'first': 'T', 'last': 'Mesplede', 'middle': [], 'suffix': ''},
              {'first': 'M',
               'last': 'Wainberg',
               'middle': [...],
               'suffix': ''}],
  'issn': '',
  'other_ids': {},
  'pages': '',
  'ref_id': 'b0',
  'title': 'HIV drug resistance against strand transfer integrase inhibitors',
  'venue': 'Retrovirology',
  'volume': '14',
  'year': 2017},
 {'authors': [{'first': 'J', 'last': 'Grawenhoff', 'middle': [], 'suffix': ''},
              {'first': 'A',
               'last': 'Engelman',
               'middle': [...],
               'suffix': ''}],
  'issn': '',
  'other_ids': {},
  'pages': '32--44',
  'ref_id': 'b1',
  'title': 'Retroviral integrase protein and intasome nucleoprotein complex '
           'structures',
  'venue': 'World J Biol Chem',
  'volume': '8',
  

In [23]:
# Render the authors of the first bibliography entry (names only).
format_authors(bibs[0]['authors'], with_affiliation=False)

'K Anstett, B Brenner, T Mesplede, M A Wainberg'

In [24]:
# Render the first five bibliography entries as a single string.
bib_formatted = format_bib(bibs[:5])
print(bib_formatted)

HIV drug resistance against strand transfer integrase inhibitors, K Anstett, B Brenner, T Mesplede, M A Wainberg, Retrovirology, 2017; Retroviral integrase protein and intasome nucleoprotein complex structures, J Grawenhoff, A N Engelman, World J Biol Chem, 2017; Identification of Phe187 as a crucial dimerization determinant facilitates crystallization of a monomeric retroviral integrase core domain, M Galilee, Alian , A , Structure, 2014; A supramolecular assembly mediates lentiviral DNA integration, A Ballandras-Colas, D P Maskell, E Serrao, J Locke, P Swuec, S R Jonsson, A Kotecha, N J Cook, V E Pye, I A Taylor, V Andresdottir, A N Engelman, A Costa, P Cherepanov, Science, 2017; Cryo-EM structures and atomic model of the HIV-1 strand transfer complex intasome, D O Passos, M Li, R Yang, S V Rebensburg, R Ghirlando, Y Jeon, N Shkriabai, M Kvaratskhelia, R Craigie, D Lyumkis, Science, 2017


In [25]:
# Build one row of formatted and raw fields per loaded article.
# Note: the loop variable rebinds the module-level name `file`.
cleaned_files = []

for file in tqdm(all_files):
    features = [
        file['paper_id'],
        file['metadata']['title'],
        # Authors rendered twice: names only, then names with affiliations.
        format_authors(file['metadata']['authors']),
        format_authors(file['metadata']['authors'], 
                       with_affiliation=True),
        format_body(file['abstract']),
        format_body(file['body_text']),
        format_bib(file['bib_entries']),
        # Unformatted author and bibliography data are kept alongside.
        file['metadata']['authors'],
        file['bib_entries']
    ]
    
    cleaned_files.append(features)

100%|██████████| 885/885 [00:18<00:00, 48.09it/s] 


In [26]:
# Column names, one per value in each row of `cleaned_files`.
col_names = [
    'paper_id', 
    'title', 
    'authors',
    'affiliations', 
    'abstract', 
    'text', 
    'bibliography',
    'raw_authors',
    'raw_bibliography'
]

clean_df = pd.DataFrame(cleaned_files, columns=col_names)
# Display the first rows as the cell output.
clean_df.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,f905f78b32f63c6d14a79984dfb33f1b358b8ab4,Multimerization of HIV-1 integrase hinges on c...,"Meytal Galilee, Akram Alian",Meytal Galilee (Technion -Israel Institute of ...,Abstract\n\nNew anti-AIDS treatments must be c...,"\n\nIn the absence of a curative treatment, th...",HIV drug resistance against strand transfer in...,"[{'first': 'Meytal', 'middle': [], 'last': 'Ga...","{'BIBREF0': {'ref_id': 'b0', 'title': 'HIV dru..."
1,abcfffafab399149d4adadd6bb458c4994e2025d,Time-varying transmission dynamics of Novel Co...,"Tao Liu, Jianxiong Hu, Jianpeng Xiao, Guanhao ...","Tao Liu, Jianxiong Hu, Jianpeng Xiao, Guanhao ...",Abstract\n\nRationale: Several studies have es...,"Introduction\n\nEighteen years ago, severe acu...",World Health Organization. Summary of probable...,"[{'first': 'Tao', 'middle': [], 'last': 'Liu',...","{'BIBREF0': {'ref_id': 'b0', 'title': 'World H..."
2,0cb9c296684ca5e71462d825cab2827854a01544,p53 is not necessary for DUX4 pathology,"Darko Bosnakovski, Erik A Toso, Olivia O Recht...","Darko Bosnakovski (Lillehei Heart Institute), ...",Abstract\n\nSummary Statement: DUX4 is thought...,Introduction\n\nFacioscapulohumeral muscular d...,"Trim24 targets endogenous p53 for degradation,...","[{'first': 'Darko', 'middle': [], 'last': 'Bos...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Trim24 ..."
3,9bbfd3d34ee18ea1b9f4669331a6cee9c5992893,Virological assessment of hospitalized cases o...,"Roman Wölfel, Victor M Corman, Wolfgang Guggem...","Roman Wölfel, Victor M Corman, Wolfgang Guggem...","Abstract\n\nemerged in late 2019 1,2 . Initial...","cases, providing proof of active virus replica...",References (01KI1723A) and the European Union ...,"[{'first': 'Roman', 'middle': [], 'last': 'Wöl...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Referen..."
4,1218f278a4f8d83dac14b23c8f698062812ef9d5,Potential impact of seasonal forcing on a SARS...,"Richard A Neher, Robert Dyrdak, Valentin Druel...","Richard A Neher (University of Basel, Basel, S...",Abstract\n\nA novel coronavirus (SARS-CoV-2) f...,\n\n(2.2 with 90% high density interval 1.4-3....,Novel Coronavirus(2019-nCoV) Situation Report ...,"[{'first': 'Richard', 'middle': ['A'], 'last':...","{'BIBREF6': {'ref_id': 'b6', 'title': 'Novel C..."


In [27]:
# Write the cleaned table to CSV; index=False leaves out the row index.
clean_df.to_csv('biorxiv_clean.csv', index=False)

In [28]:
# Load, clean and export the files under custom_license/, then preview the
# first rows of the resulting table.
pmc_dir = 'custom_license/custom_license/'
pmc_files = load_files(pmc_dir)
pmc_df = generate_clean_df(pmc_files)
pmc_df.to_csv('clean_pmc.csv', index=False)
pmc_df.head()

100%|██████████| 16959/16959 [02:45<00:00, 102.74it/s]
100%|██████████| 16959/16959 [01:19<00:00, 213.50it/s]


Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,ab680d5dbc4f51252da3473109a7885dd6b5eb6f,Evolutionary Medicine IV. Evolution and Emerge...,S V Scarpino,"S V Scarpino (Santa Fe Institute, Santa Fe, NM...",,Introduction\n\nThe evolutionary history of hu...,Rapid evolutionary dynamics and disease threat...,"[{'first': 'S', 'middle': ['V'], 'last': 'Scar...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Rapid e..."
1,6599ebbef3d868afac9daa4f80fa075675cf03bc,International aviation emissions to 2025: Can ...,"Andrew Macintosh, Lailey Wallace",Andrew Macintosh (The Australian National Univ...,Abstract\n\nInternational aviation is growing ...,"Introduction\n\nSixty years ago, civil aviatio...",CONSAVE 2050-constrained scenarios on aviation...,"[{'first': 'Andrew', 'middle': [], 'last': 'Ma...","{'BIBREF0': {'ref_id': 'b0', 'title': 'CONSAVE..."
2,eb5c7f3ff921ad6469b79cc8a3c122648204ece4,2 Mechanisms of diarrhoea,"I W Booth, A S Mcneish Baillikre&apos;","I W Booth, A S Mcneish Baillikre&apos;",,INTRODUCTION\n\nAcute infections of the gastro...,A typing scheme for Aeromonas hydrophila based...,"[{'first': 'I', 'middle': ['W'], 'last': 'Boot...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A typin..."
3,b87b790c96c75faa22a085cb560f7b3d8e018b24,Features of Host Cells: Cellular and Molecular...,,,,THE BASIC ORGANIZATION OF THE CELL\n\nThere ar...,Epidemics to eradication: the modern history o...,[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'Epidemi..."
4,68c0bb1989b6ca2b38da32a0d992027db39f80bc,Spring 2020 | 1 Beijing's Hard and Soft Repres...,Victoria Tin-Bor Hui,Victoria Tin-Bor Hui,Abstract\n\nHong Kong's new Police Commissione...,"\n\nIt is also noteworthy that Tang, who was o...",Listed companies appoint mainlanders to boards...,"[{'first': 'Victoria', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Listed ..."


In [29]:
# Load, clean and export the files under comm_use_subset/, then preview the
# first rows of the resulting table.
comm_dir = 'comm_use_subset/comm_use_subset/'
comm_files = load_files(comm_dir)
comm_df = generate_clean_df(comm_files)
comm_df.to_csv('clean_comm_use.csv', index=False)
comm_df.head()

100%|██████████| 9118/9118 [07:06<00:00, 21.37it/s]  
100%|██████████| 9118/9118 [1:57:26<00:00,  1.29it/s]     


Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,5e0c586f047ff909c8ed3fe171c8975a90608d08,Neutralizing antibodies against porcine epidem...,"Lang Gong, Ying Lin, Jianru Qin, Qianniu Li, C...","Lang Gong, Ying Lin, Jianru Qin, Qianniu Li, C...",Abstract\n\nBackground: Porcine epidemic diarr...,Background\n\nPorcine epidemic diarrhea virus ...,Porcine epidemic diarrhoea virus: a comprehens...,"[{'first': 'Lang', 'middle': [], 'last': 'Gong...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Porcine..."
1,1579fbff7af9b156c6f49fee0526e48f852ea460,A Recombinant Newcastle Disease Virus (NDV) Ex...,"Edris Shirvani, Anandan Paldurai, Vinoth K Man...","Edris Shirvani (University of Maryland, Colleg...","Abstract\n\nCurrently, live-attenuated IBV vac...","Result\n\nGeneration of rNDVs expressing S1, S...",Avian Infectious Bronchitis. Version adopted b...,"[{'first': 'Edris', 'middle': [], 'last': 'Shi...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Avian I..."
2,f670f4bb6f79e54d7603f8520853b1f0d105b588,,,,,"\n\nShekouh AR, Thompson CC, Prime W, Campbell...",kinase-mediated signaling in genetic stroke su...,[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'kinase-..."
3,e0668c4b793d0cad26639b070819334a94648123,GP-confirmed complete Achilles tendon rupture ...,"S J Davis, A Lott, E Besada","S J Davis (University of Tromsø, Tromsø, Norwa...",,Introduction\n\nThe incidence of complete Achi...,"Health and travel advice for Hajj pilgrims, Mu...","[{'first': 'S', 'middle': ['J'], 'last': 'Davi...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Health ..."
4,38aa050ad79d8a1d7022c33535255ce9d47914e5,Potent Inhibition of Junín Virus Infection by ...,"C Huang, A G Walker, A M Grant, O A Kolokoltso...","C Huang, A G Walker, A M Grant, O A Kolokoltso...",Abstract\n\nThe new world arenavirus Junín vir...,Introduction\n\nArenaviruses are enveloped RNA...,Arenaviridae: The Viruses and Their Replicatio...,"[{'first': 'C', 'middle': [], 'last': 'Huang',...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Arenavi..."


In [30]:
# Load, clean and export the files under noncomm_use_subset/, then preview the
# first rows of the resulting table.
noncomm_dir = 'noncomm_use_subset/noncomm_use_subset/'
noncomm_files = load_files(noncomm_dir)
noncomm_df = generate_clean_df(noncomm_files)
noncomm_df.to_csv('clean_noncomm_use.csv', index=False)
noncomm_df.head()

100%|██████████| 2353/2353 [00:11<00:00, 199.74it/s]
100%|██████████| 2353/2353 [00:26<00:00, 94.56it/s] 


Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,b2f67d533f2749807f2537f3775b39da3b186051,Caring for persons in detention suffering with...,"Michael Liebrenz, Dinesh Bhugra, Anna Buadze, ...","Michael Liebrenz (University of Bern, Bern, Sw...",,\n\nThere is a disproportionate number of indi...,"Influenza outbreak in a correctional facility,...","[{'first': 'Michael', 'middle': [], 'last': 'L...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Influen..."
1,ad98979eada6e333a276d39efdce21779d538625,Xanthine-based acyclic nucleoside phosphonates...,"Ond Rej Baszczy Nski, Martin Maxmilian Kaiser,...","Ond Rej Baszczy Nski, Martin Maxmilian Kaiser,...",Abstract\n\nWhile noncanonic xanthine nucleoti...,Introduction\n\nThe concentration and ratio of...,Absolute metabolite concentrations and implied...,"[{'first': 'Ond', 'middle': [], 'last': 'Rej B...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Absolut..."
2,464f7d3a460eb51dbc25bd12639b22079a73f85a,Viral lncRNA: A regulatory molecule for contro...,"Ziqiang Wang, Yiwan Zhao, Yaou Zhang","Ziqiang Wang (Tsinghua University, 100084, Bei...",Abstract\n\nLong non-coding RNAs (lncRNAs) are...,Introduction\n\nViruses are important infectio...,"The challenge of viral snRNPs, Cold Spring Har...","[{'first': 'Ziqiang', 'middle': [], 'last': 'W...","{'BIBREF0': {'ref_id': 'b0', 'title': 'The cha..."
3,c436139975d97ef929b5d8452595de40bda0c11c,"Phone: (1) 301-451-9881, jbeigel@niaid.nih.gov","John H Beigel, Pablo Tebas, Marie-Carmelle Eli...","John H Beigel, Pablo Tebas, Marie-Carmelle Eli...",Abstract\n\non behalf of the IRC002 Study Team...,INTRODUCTION\n\nPandemic influenza remains a g...,Meta-analysis: convalescent blood products for...,"[{'first': 'John', 'middle': ['H'], 'last': 'B...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Meta-an..."
4,634128ea7d7736750e1c3cd0a48bb37843d06dac,A Strategy To Estimate Unknown Viral Diversity...,"Simon J Anthony, Jonathan H Epstein, Kris A Mu...","Simon J Anthony, Jonathan H Epstein, Kris A Mu...",Abstract\n\nThe majority of emerging zoonoses ...,"Viral discovery.\n\nA total of 12,793 consensu...",Factors in the emergence of infectious disease...,"[{'first': 'Simon', 'middle': ['J'], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Factors..."


In [31]:
# Show the formatted body text stored under index label 0 of noncomm_df.
noncomm_df.text[0]

'\n\nThere is a disproportionate number of individuals with mental and somatic illnesses among persons in detention (Bhugra, 2020; Ginn, 2012) . It is also known that infections which are transmitted human to human via droplet or close contact spread particularly well in confined spaces. Since transfer options for further treatment are more difficult (especially in detention facilities) preventive measures are strongly emphasized, particularly in the case of viral droplet infections. For example, in the context of influenza, vaccination of detainees and staff is recommended (NHS 2019) . If such options are not available, prisons and other closed facilities, like asylum centers, shelters, and closed psychiatric hospitals, pose a risk for the rapid spread of such diseases. In the past, Australia for example has described the rapid spread of influenza among prison inmates (Awofeso et al., 2001) . The Spanish flu is also reported to have affected about a quarter of all inmates; a prevalenc