### This notebook extracts authors information from the CORD19 kaggle data set and creates a dataframe as a .csv file. The Schema for the csv file is as follows:


#### format in which each row is entered  --- PaperId, Title, Authors, Institutes, PostalCodes, Countries

Since a paper can have multiple authors, their names have been registered as strings with names separated with semi-colon (;).
Then each authors affiliation info is sequentially stored in the .csv file


for example if a paper P1 has:

authors --- a1,a2,a3 

affiliations --- I1,I2,I3, 

postal codes ---- postcode1,postcode2,postcode3, 

countries ---- c1,c2,c3 

respectively then the row entry would look like

P1_id , Title_of_P1 , a1;a2;a3 , I1;I2;I3 , postcode1;postcode2;postcode3 , c1;c2;c3

Note - Each authors name is as follows = first_middle_last



In [67]:
import os
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt 

%matplotlib inline

In [68]:
#### functions important for extraction ######

def extract_authors_info(d):
    
    auth_name = ''
    auth_inst = ''
    auth_country = ''
    auth_post = ''
    for i in d['metadata']['authors']:
        f = i['first']
        if not i['middle']:
            m=''
        else:
            m=i['middle'][0]
        l = i['last']
        
        if not m:
            name = '%s_%s'%(f,l)
        else:
            name = '%s_%s_%s'%(f,m,l)
        auth_name = auth_name + name + ';'
        try:
            
            auth_inst = auth_inst + i['affiliation']['institution'] + ';'
            auth_post = auth_post + i['affiliation']['location']['postCode'] + ';'
            auth_country = auth_country + i['affiliation']['location']['country'] + ';'
        except:
            auth_inst = ''
            auth_country = ''
            auth_post = ''
    
    
    auth_name = auth_name[:-1]
    auth_inst = auth_inst[:-1]
    auth_country = auth_country[:-1]
    auth_post = auth_post[:-1]
    
    return auth_name,auth_inst,auth_post,auth_country

In [81]:
paths = ['C:/Users/Admin/Documents/Bio_networks_VC/CORD19_data/biorxiv_medrxiv/biorxiv_medrxiv/',
        'C:/Users/Admin/Documents/Bio_networks_VC/CORD19_data/custom_license/custom_license/',
        'C:/Users/Admin/Documents/Bio_networks_VC/CORD19_data/noncomm_use_subset/noncomm_use_subset/',
        'C:/Users/Admin/Documents/Bio_networks_VC/CORD19_data/comm_use_subset/comm_use_subset/']

In [82]:
data_list = []

for p in paths:
    files = os.listdir(p)
    files.sort()

    for f in files:

        p_id,title,a_names,a_inst,a_count,a_pc = '','','','','',''
        with open(p+'%s'%(f), 'r') as json_file:
            data = json.load(json_file)    
            p_id,title = data['paper_id'],data['metadata']['title']
            a_names,a_inst,a_pc,a_count = extract_authors_info(data)
        data_list.append([p_id,title,a_names,a_inst,a_pc,a_count])

In [83]:
DF = pd.DataFrame(data_list,columns=['paperID','Title','Authors','Institutes','PostCode','Country'])

In [84]:
DF.head()

Unnamed: 0,paperID,Title,Authors,Institutes,PostCode,Country
0,0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,Joseph_C_Ward;Lidia_Lasecka-Dykes;Chris_Neil;O...,,,
1,00340eea543336d54adda18236424de6a5e91c9d,Analysis Title: Regaining perspective on SARS-...,Carla_Mavian;Simone_Marini;Costanza_Manes;Ilar...,,,
2,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,Hanchu_Zhou;Jiannan_Yang;Kaicheng_Tang;†_;Qing...,,,
3,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...",Salman_L_Butt;Eric_C_Erwood;Jian_Zhang;Holly_S...,University of Georgia;University of Georgia;Un...,30602;30602;30602;30602;30602;24061;30602,USA;USA;USA;USA;USA;USA;USA
4,0139ea4ca580af99b602c6435368e7fdbefacb03,A Combined Evidence Approach to Prioritize Nip...,Nishi_Kumari;Ayush_Upadhyay;Kishan_Kalia;Rakes...,,,


In [85]:
DF.to_csv('CORD19_Geo_info.csv',index=True,header=True)

In [90]:
df = pd.read_csv('CORD19_Geo_info.csv',index_col=0,header=0)

In [91]:
df.head()

Unnamed: 0,paperID,Title,Authors,Institutes,PostCode,Country
0,0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,Joseph_C_Ward;Lidia_Lasecka-Dykes;Chris_Neil;O...,,,
1,00340eea543336d54adda18236424de6a5e91c9d,Analysis Title: Regaining perspective on SARS-...,Carla_Mavian;Simone_Marini;Costanza_Manes;Ilar...,,,
2,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,Hanchu_Zhou;Jiannan_Yang;Kaicheng_Tang;†_;Qing...,,,
3,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...",Salman_L_Butt;Eric_C_Erwood;Jian_Zhang;Holly_S...,University of Georgia;University of Georgia;Un...,30602;30602;30602;30602;30602;24061;30602,USA;USA;USA;USA;USA;USA;USA
4,0139ea4ca580af99b602c6435368e7fdbefacb03,A Combined Evidence Approach to Prioritize Nip...,Nishi_Kumari;Ayush_Upadhyay;Kishan_Kalia;Rakes...,,,


In [98]:
df.loc[df['paperID']=='00d16927588fb04d4be0e6b269fc02f0d3c2aa7b']

Unnamed: 0,paperID,Title,Authors,Institutes,PostCode,Country
3,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...",Salman_L_Butt;Eric_C_Erwood;Jian_Zhang;Holly_S...,University of Georgia;University of Georgia;Un...,30602;30602;30602;30602;30602;24061;30602,USA;USA;USA;USA;USA;USA;USA
