In [71]:
#Libraries
import os
import json
from pprint import pprint
from copy import deepcopy

import numpy as np
import pandas as pd
from tqdm import tqdm

__Funções que tratam o Json__

In [6]:
def load_files(dirname):
    filenames = os.listdir(dirname)
    raw_files = []

    for filename in tqdm(filenames):
        filename = dirname + filename
        file = json.load(open(filename, 'rb'))
        raw_files.append(file)
    
    return raw_files

def generate_clean_df(all_files):
    cleaned_files = []
    
    for file in tqdm(all_files):
        features = [
            file['paper_id'],
            file['metadata']['title'],
            format_authors(file['metadata']['authors']),
            format_authors(file['metadata']['authors'], 
                           with_affiliation=True),
            format_body(file['abstract']),
            format_body(file['body_text']),
            format_bib(file['bib_entries']),
            file['metadata']['authors'],
            file['bib_entries']
        ]

        cleaned_files.append(features)

    col_names = ['paper_id', 'title', 'authors',
                 'affiliations', 'abstract', 'text', 
                 'bibliography','raw_authors','raw_bibliography']

    clean_df = pd.DataFrame(cleaned_files, columns=col_names)
    clean_df.head()
    
    return clean_df

__Funções para extrair informações dos Jsons__

In [7]:
def format_name(author):
    middle_name = " ".join(author['middle'])
    
    if author['middle']:
        return " ".join([author['first'], middle_name, author['last']])
    else:
        return " ".join([author['first'], author['last']])


def format_affiliation(affiliation):
    text = []
    location = affiliation.get('location')
    if location:
        text.extend(list(affiliation['location'].values()))
    
    institution = affiliation.get('institution')
    if institution:
        text = [institution] + text
    return ", ".join(text)

def format_authors(authors, with_affiliation=False):
    name_ls = []
    
    for author in authors:
        name = format_name(author)
        if with_affiliation:
            affiliation = format_affiliation(author['affiliation'])
            if affiliation:
                name_ls.append(f"{name} ({affiliation})")
            else:
                name_ls.append(name)
        else:
            name_ls.append(name)
    
    return ", ".join(name_ls)

def format_body(body_text):
    texts = [(di['section'], di['text']) for di in body_text]
    texts_di = {di['section']: "" for di in body_text}
    
    for section, text in texts:
        texts_di[section] += text

    body = ""

    for section, text in texts_di.items():
        body += section
        body += "\n\n"
        body += text
        body += "\n\n"
    
    return body

def format_bib(bibs):
    if type(bibs) == dict:
        bibs = list(bibs.values())
    bibs = deepcopy(bibs)
    formatted = []
    
    for bib in bibs:
        bib['authors'] = format_authors(
            bib['authors'], 
            with_affiliation=False
        )
        formatted_ls = [str(bib[k]) for k in ['title', 'authors', 'venue', 'year']]
        formatted.append(", ".join(formatted_ls))

    return "; ".join(formatted)

# Biorxiv: Exploration

In [61]:
biorxiv_dir = '/home/leandro/Documents/Data Science/3 - Kaggle_competitions/COVID-19/2020-03-13/biorxiv_medrxiv/biorxiv_medrxiv/'
filenames = os.listdir(biorxiv_dir)
print("Número de artigos: ", len(filenames))

Número de artigos:  803


In [68]:
all_files = []

#Carregando o json
#Diretório em que o json esta + nome do arquivo json
for filename in filenames:
    filename = biorxiv_dir + filename
    file = json.load(open(filename, 'rb'))
    all_files.append(file)

## Testando extrações para UMA amostra

In [70]:
#visualizando algumas keys
file = all_files[0]
print("Dictionary keys:", file.keys())

Dictionary keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


In [73]:
#visualizando o abstract
pprint(file['abstract'])

[{'cite_spans': [],
  'ref_spans': [],
  'section': 'Abstract',
  'text': 'The geographic spread of 2019 novel coronavirus infections from the '
          'epicenter of Wuhan, China, has provided an opportunity to study the '
          'natural history of the recently emerged virus. Using publicly '
          'available event-date data from the ongoing epidemic, the present '
          'study investigated the incubation period and other time intervals '
          'that govern the epidemiological dynamics of COVID-19 infections. '
          'Our results show that the incubation period falls within the range '
          'of 2-14 days with 95% confidence and has a mean of around 5 days '
          'when approximated using the best-fit lognormal distribution. The '
          'mean time from illness onset to hospital admission (for treatment '
          'and/or isolation) was estimated at 3-4 days without truncation and '
          'at 5-9 days when right truncated. Based on the 95th percen

In [74]:
#Body
print("body_text type:", type(file['body_text']))
print("body_text length:", len(file['body_text']))
print("body_text keys:", file['body_text'][0].keys())

body_text type: <class 'list'>
body_text length: 28
body_text keys: dict_keys(['text', 'cite_spans', 'ref_spans', 'section'])


In [75]:
print("body_text content:")
pprint(file['body_text'][:2], depth=3)

body_text content:
[{'cite_spans': [{...}, {...}, {...}, {...}, {...}],
  'ref_spans': [],
  'section': 'Introduction',
  'text': 'As of 31 January 2020, mainland China reported 11,791 confirmed '
          'cases of novel coronavirus (COVID- 19) infections, causing 259 '
          'deaths [1] . Initially, these infections were thought to result '
          'from zoonotic (animal-to-human) transmission; however, recently '
          'published evidence [2] and the exponential growth of case '
          'incidences show compelling evidence of human-to-human secondary '
          'transmission fueled by travel, with many cases detected in other '
          'parts of the world [3] . This geographic expansion beyond the '
          'initial epicenter of Wuhan provides an opportunity to study the '
          'natural history of COVID-19 infection, as these migration events '
          'limit the risk of infection to the time during which an individual '
          'traveled to an area where 

In [76]:
texts = [(di['section'], di['text']) for di in file['body_text']]
texts_di = {di['section']: "" for di in file['body_text']}
for section, text in texts:
    texts_di[section] += text

pprint(list(texts_di.keys()))

['Introduction',
 'Epidemiological data',
 'Statistical model',
 'Results',
 'Discussion',
 'Figure 2. Estimated cumulative distribution for the incubation period of '
 'COVID-19',
 'Conflicts of Interest:']


Exemplo do resultado final, após a formatação completa da seção title

In [78]:
body = ""

for section, text in texts_di.items():
    body += section
    body += "\n\n"
    body += text
    body += "\n\n"

print(body[:3000])

Introduction

As of 31 January 2020, mainland China reported 11,791 confirmed cases of novel coronavirus (COVID- 19) infections, causing 259 deaths [1] . Initially, these infections were thought to result from zoonotic (animal-to-human) transmission; however, recently published evidence [2] and the exponential growth of case incidences show compelling evidence of human-to-human secondary transmission fueled by travel, with many cases detected in other parts of the world [3] . This geographic expansion beyond the initial epicenter of Wuhan provides an opportunity to study the natural history of COVID-19 infection, as these migration events limit the risk of infection to the time during which an individual traveled to an area where exposure could occur [4] .The incubation period is defined as the time from infection to illness onset. Knowledge of the incubation period of a directly transmitted infectious disease is critical to determine the time period required for monitoring and restric

In [82]:
#Utilizando a função criada no começo
print(format_body(file['body_text'])[:3000])

Introduction

As of 31 January 2020, mainland China reported 11,791 confirmed cases of novel coronavirus (COVID- 19) infections, causing 259 deaths [1] . Initially, these infections were thought to result from zoonotic (animal-to-human) transmission; however, recently published evidence [2] and the exponential growth of case incidences show compelling evidence of human-to-human secondary transmission fueled by travel, with many cases detected in other parts of the world [3] . This geographic expansion beyond the initial epicenter of Wuhan provides an opportunity to study the natural history of COVID-19 infection, as these migration events limit the risk of infection to the time during which an individual traveled to an area where exposure could occur [4] .The incubation period is defined as the time from infection to illness onset. Knowledge of the incubation period of a directly transmitted infectious disease is critical to determine the time period required for monitoring and restric

## Metadata

In [83]:
print(all_files[0]['metadata'].keys())

dict_keys(['title', 'authors'])


In [84]:
print(all_files[0]['metadata']['title'])

Incubation Period and Other Epidemiological Characteristics of 2019 Novel Coronavirus Infections with Right Truncation: A Statistical Analysis of Publicly Available Case Data


In [85]:
authors = all_files[0]['metadata']['authors']
pprint(authors[:3])

[{'affiliation': {'institution': 'Hokkaido University',
                  'laboratory': '',
                  'location': {'country': 'Japan',
                               'settlement': 'Sapporo, Hokkaido'}},
  'email': '',
  'first': 'Natalie',
  'last': 'Linton',
  'middle': ['M'],
  'suffix': ''},
 {'affiliation': {'institution': 'Hokkaido University',
                  'laboratory': '',
                  'location': {'country': 'Japan',
                               'settlement': 'Sapporo, Hokkaido'}},
  'email': '',
  'first': 'Tetsuro',
  'last': 'Kobayashi',
  'middle': [],
  'suffix': ''},
 {'affiliation': {'institution': 'Hokkaido University',
                  'laboratory': '',
                  'location': {'country': 'Japan',
                               'settlement': 'Sapporo, Hokkaido'}},
  'email': '',
  'first': 'Yichi',
  'last': 'Yang',
  'middle': [],
  'suffix': ''}]


In [86]:
#Usando as funções criadas para extrair as informações
for author in authors:
    print("Name:", format_name(author))
    print("Affiliation:", format_affiliation(author['affiliation']))
    print()

Name: Natalie M Linton
Affiliation: Hokkaido University, Sapporo, Hokkaido, Japan

Name: Tetsuro Kobayashi
Affiliation: Hokkaido University, Sapporo, Hokkaido, Japan

Name: Yichi Yang
Affiliation: Hokkaido University, Sapporo, Hokkaido, Japan

Name: Katsuma Hayashi
Affiliation: Hokkaido University, Sapporo, Hokkaido, Japan

Name: Andrei R Akhmetzhanov
Affiliation: Hokkaido University, Sapporo, Hokkaido, Japan

Name: Sung-Mok Jung
Affiliation: Hokkaido University, Sapporo, Hokkaido, Japan

Name: Baoyin Yuan
Affiliation: Hokkaido University, Sapporo, Hokkaido, Japan

Name: Ryo Kinoshita
Affiliation: Hokkaido University, Sapporo, Hokkaido, Japan

Name: Hiroshi Nishiura
Affiliation: Hokkaido University, Sapporo, Hokkaido, Japan



In [87]:
pprint(all_files[4]['metadata'], depth=4)

{'authors': [{'affiliation': {'institution': 'University of Lausanne',
                              'laboratory': '',
                              'location': {...}},
              'email': '',
              'first': 'Lucie',
              'last': 'Kešnerová',
              'middle': [],
              'suffix': ''},
             {'affiliation': {'institution': 'University of Lausanne',
                              'laboratory': '',
                              'location': {...}},
              'email': '',
              'first': 'Olivier',
              'last': 'Emery',
              'middle': [],
              'suffix': ''},
             {'affiliation': {'institution': 'University of Lausanne',
                              'laboratory': '',
                              'location': {...}},
              'email': '',
              'first': 'Michaël',
              'last': 'Troilo',
              'middle': [],
              'suffix': ''},
             {'affiliation': {'institution'

In [88]:
authors = all_files[4]['metadata']['authors']
print("Formatting without affiliation:")
print(format_authors(authors, with_affiliation=False))
print("\nFormatting with affiliation:")
print(format_authors(authors, with_affiliation=True))

Formatting without affiliation:
Lucie Kešnerová, Olivier Emery, Michaël Troilo, Joanito Liberti, Berra Erkosar, Philipp Engel,  Philipp Engel

Formatting with affiliation:
Lucie Kešnerová (University of Lausanne, CH-1015, Lausanne, 7 Switzerland), Olivier Emery (University of Lausanne, CH-1015, Lausanne, 7 Switzerland), Michaël Troilo (University of Lausanne, CH-1015, Lausanne, 7 Switzerland), Joanito Liberti (University of Lausanne, CH-1015, Lausanne, 7 Switzerland), Berra Erkosar (University of Lausanne, CH-1015, Lausanne), Philipp Engel (University of Lausanne, CH-1015, Lausanne, 7 Switzerland),  Philipp Engel


## Bibliography

In [89]:
bibs = list(file['bib_entries'].values())
pprint(bibs[:2], depth=4)

[{'authors': [],
  'issn': '',
  'other_ids': {},
  'pages': '',
  'ref_id': 'b0',
  'title': 'Update on Pneumonia of New Coronavirus Infection as of 21:00 on',
  'venue': '',
  'volume': '',
  'year': 2020},
 {'authors': [{'first': 'Q', 'last': 'Li', 'middle': [], 'suffix': ''},
              {'first': 'X', 'last': 'Guan', 'middle': [], 'suffix': ''},
              {'first': 'P', 'last': 'Wu', 'middle': [], 'suffix': ''},
              {'first': 'X', 'last': 'Wang', 'middle': [], 'suffix': ''},
              {'first': 'L', 'last': 'Zhou', 'middle': [], 'suffix': ''},
              {'first': 'Y', 'last': 'Tong', 'middle': [], 'suffix': ''},
              {'first': 'R', 'last': 'Ren', 'middle': [], 'suffix': ''},
              {'first': 'K', 'last': 'Leung', 'middle': [...], 'suffix': ''},
              {'first': 'E', 'last': 'Lau', 'middle': [...], 'suffix': ''},
              {'first': 'J', 'last': 'Wong', 'middle': [...], 'suffix': ''}],
  'issn': '',
  'other_ids': {'DOI': ['10.1056

In [90]:
format_authors(bibs[1]['authors'], with_affiliation=False)

'Q Li, X Guan, P Wu, X Wang, L Zhou, Y Tong, R Ren, K S M Leung, E H Y Lau, J Y Wong'

In [91]:
bib_formatted = format_bib(bibs[:5])
print(bib_formatted)

Update on Pneumonia of New Coronavirus Infection as of 21:00 on, , , 2020; Early transmission dynamics in Wuhan, China, of novel coronavirus-infected pneumonia, Q Li, X Guan, P Wu, X Wang, L Zhou, Y Tong, R Ren, K S M Leung, E H Y Lau, J Y Wong, N. Engl. J. Med. 2020, None; European Centre for Disease Prevention and Control data, , , 2020; Estimates of short-and long-term incubation periods of Plasmodium vivax malaria in the Republic of Korea, H Nishiura, H W Lee, S H Cho, W G Lee, T S In, S U Moon, G T Chung, T S Kim, Trans R Soc Trop Med Hyg, 2007; New York City Department of Health and Mental Hygiene Swine Influenza Investigation Team, J Lessler, N G Reich, D A Cummings, H P Nair, H T Jordan, N Thompson, N Engl J Med, 2009


## Generate CSV

In [92]:
cleaned_files = []

for file in tqdm(all_files):
    features = [
        file['paper_id'],
        file['metadata']['title'],
        format_authors(file['metadata']['authors']),
        format_authors(file['metadata']['authors'], 
                       with_affiliation=True),
        format_body(file['abstract']),
        format_body(file['body_text']),
        format_bib(file['bib_entries']),
        file['metadata']['authors'],
        file['bib_entries']
    ]
    
    cleaned_files.append(features)

100%|██████████| 803/803 [00:01<00:00, 432.77it/s]


In [95]:
col_names = [
    'paper_id', 
    'title', 
    'authors',
    'affiliations', 
    'abstract', 
    'text', 
    'bibliography',
    'raw_authors',
    'raw_bibliography'
]

clean_df = pd.DataFrame(cleaned_files, columns=col_names)
clean_df.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,cbc05d14c57b91081970a232ab83bc993f998fe2,Incubation Period and Other Epidemiological Ch...,"Natalie M Linton, Tetsuro Kobayashi, Yichi Yan...","Natalie M Linton (Hokkaido University, Sapporo...",Abstract\n\nThe geographic spread of 2019 nove...,"Introduction\n\nAs of 31 January 2020, mainlan...",Update on Pneumonia of New Coronavirus Infecti...,"[{'first': 'Natalie', 'middle': ['M'], 'last':...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Update ..."
1,403d885a33958731c4607ea632453e25a89e2b57,Novel ionophores active against La Crosse viru...,"Zachary J Sandler, Michelle N Vu, Vineet D Men...","Zachary J Sandler (Loyola University), Michell...",Abstract\n\nBunyaviruses are significant human...,"\n\ndaptomycin, among others. More closely rel...",Zika virus: history of a newly emerging arbovi...,"[{'first': 'Zachary', 'middle': ['J'], 'last':...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Zika vi..."
2,c6039f8933305c9f44a44c81a15b321b6c2848dc,Far-UVC light: A new tool to control the sprea...,"David Welch, Manuela Buonanno, Veljko Grilj, I...",David Welch (Columbia University Medical Cente...,Abstract\n\nAirborne-mediated microbial diseas...,3\n\nAirborne-mediated microbial diseases repr...,"Global, regional, and national life expectancy...","[{'first': 'David', 'middle': [], 'last': 'Wel...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Global,..."
3,96ef1767754a53f792951ba1752440ae94e90c60,Tuning antiviral CD8 T-cell response via proli...,"Adil Doganay Duru, Renhua Sun, Eva B Allerbrin...",Adil Doganay Duru (Karolinska University Hospi...,Abstract\n\nViral escape from CD8 + cytotoxic ...,\n\nY4F is restored following vaccination with...,Immunodominance in major histocompatibility co...,"[{'first': 'Adil', 'middle': [], 'last': 'Doga...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Immunod..."
4,d010cc0209bc5a0eee6259f72c42590c448da1cf,Gut microbiota structure differs between honey,"Lucie Kešnerová, Olivier Emery, Michaël Troilo...","Lucie Kešnerová (University of Lausanne, CH-10...",Abstract\n\nAdult honey bees harbor a speciali...,Introduction\n\nan important characteristic of...,Importance of pollinators in changing landscap...,"[{'first': 'Lucie', 'middle': [], 'last': 'Keš...","{'BIBREF1': {'ref_id': 'b1', 'title': 'Importa..."


In [None]:
#clean_df.to_csv('biorxiv_clean.csv', index=False)