In [None]:
!pip install biopython
!pip install tqdm

This notebook draws on the following references:
- [A references-based atlas of COVID-19 research](https://www.kaggle.com/mmoeller/a-references-based-atlas-of-covid-19-research)
- [Gephi tutorial. Publishing interactive graphs online](http://blog.miz.space/tutorial/2020/01/05/gephi-tutorial-sigma-js-plugin-publishing-interactive-graph-online/)

You can find a visualization of the [reference map](https://yinsenm.github.io/COVID19/network/) for the 20K papers in the database that have a PubMed ID and more than 20 citations.

In [None]:
import os

import numpy as np
import pandas as pd
from Bio import Entrez
from tqdm.notebook import tqdm
# NCBI E-utilities credentials.
# The API key is a secret, so it is read from the NCBI_API_KEY environment
# variable instead of being committed with the notebook. When the variable is
# unset the key stays None and requests are sent without a key.
Entrez.api_key = os.environ.get('NCBI_API_KEY')
# Contact address sent along with each request; override with NCBI_EMAIL.
Entrez.email = os.environ.get('NCBI_EMAIL', 'yinsenm@gmail.com')

In [None]:
# Load the CORD-19 metadata and keep only the rows with a unique PubMed ID.
file_path = 'CORD-19-research-challenge'
BATCH_SIZE = 100  # number of PubMed IDs grouped into one request batch

df = pd.read_csv('./%s/metadata.csv' % file_path)

# keep=False drops *every* row whose pubmed_id occurs more than once, not just
# the later duplicates. The trailing .copy() makes the result an independent
# frame, so the column assignments made on it below and in later cells do not
# raise SettingWithCopyWarning.
df_has_pubmedid = (
    df.dropna(subset=['pubmed_id'])
      .drop_duplicates(subset=['pubmed_id'], keep=False)
      .copy()
)

# Normalise the IDs to plain Python ints and store them back on the frame.
pubmedid_list = [int(pmid) for pmid in df_has_pubmedid['pubmed_id'].tolist()]
df_has_pubmedid['pubmed_id'] = pubmedid_list

# Consecutive chunks of at most BATCH_SIZE IDs, in the same order as the frame.
jobs_batch = [
    pubmedid_list[start:start + BATCH_SIZE]
    for start in range(0, len(pubmedid_list), BATCH_SIZE)
]

In [None]:
# Look up, batch by batch, the PubMed IDs referenced by each paper.
# `refs` collects one list of referenced PMIDs per link set returned by Entrez.
# It is later assigned as a column of df_has_pubmedid, so this relies on Entrez
# returning one link set per submitted id, in submission order.
refs = []
for ids in tqdm(jobs_batch):
    handle = Entrez.elink(dbfrom='pubmed', id=ids, linkname='pubmed_pubmed_refs')
    try:
        results = Entrez.read(handle)
    finally:
        # Always release the HTTP handle, even if parsing the response fails.
        handle.close()

    for res in results:
        link_sets = res['LinkSetDb']
        # An empty LinkSetDb means no reference links were returned for this paper.
        if link_sets:
            pmids = [int(link['Id']) for link in link_sets[0]['Link']]
        else:
            pmids = []
        refs.append(pmids)

In [None]:
# Attach the fetched reference lists to the papers and flatten them into one
# long list of every cited PMID (with repeats).
df_has_pubmedid['refs'] = refs
allrefs = []
for reflist in df_has_pubmedid['refs'].tolist():
    allrefs.extend(reflist)

# Count how often each reference is cited across the data set.
# `seen` maps PMID -> number of occurrences; `commonrefs` lists every PMID that
# occurs at least twice, in the order in which its second occurrence is met.
seen = {}
commonrefs = []
for pmid in allrefs:
    occurrences = seen.get(pmid, 0) + 1
    seen[pmid] = occurrences
    if occurrences == 2:
        commonrefs.append(pmid)

print('There are %i refs that occur more than once.' % len(commonrefs))

In [None]:
# For every article, keep only the references that matter for the atlas:
# those cited by more than one article (shared) or that are themselves articles
# in our data set (cross references). References cited by a single article only
# are dropped, since they add nothing to the map.
cross = dict.fromkeys(pubmedid_list, 1)  # membership lookup of our own PMIDs
shared = [
    [ref for ref in reflist if seen[ref] > 1 or ref in cross]
    for reflist in df_has_pubmedid['refs']
]

In [None]:
# Store the filtered reference lists and their sizes on the frame, then keep
# only the papers that have at least one shared/cross reference.
df_has_pubmedid["sharedrefs"] = shared
df_has_pubmedid['nsharedrefs'] = df_has_pubmedid['sharedrefs'].apply(len)
has_shared_refs = df_has_pubmedid['nsharedrefs'] > 0
df_sub_has_pubmedid = df_has_pubmedid[has_shared_refs]

In [None]:
# Build the node table: one node per paper in our data set (Type 1) followed by
# one node per reference cited more than once (Type 0).
#
# The per-paper attributes are taken from df_has_pubmedid, the same frame
# pubmedid_list was built from. Reading them from the unfiltered `df` pairs each
# PubMed ID with the title/date of an unrelated row, because `df` still contains
# the rows without a (unique) PubMed ID.
#
# NOTE(review): a PMID that is both a paper in the data set and a common
# reference appears twice in `nodes` -- confirm the graph tool handles that.
allids = pubmedid_list
nodes = allids + commonrefs
n_papers = len(allids)
n_refs = len(commonrefs)

node_types = [1] * n_papers + [0] * n_refs
node_titles = df_has_pubmedid['title'].tolist() + [''] * n_refs
# Plain string comparison against '2020-01-01'; 1 marks papers dated 2020 or later.
nodes_new_paper = (
    (df_has_pubmedid['publish_time'] >= '2020-01-01').astype('int64').tolist()
    + [0] * n_refs
)
nodes_time = df_has_pubmedid['publish_time'].tolist() + [''] * n_refs

# The dict constructor raises if the columns ever differ in length, instead of
# silently truncating the way zip() does.
df_nodes = pd.DataFrame({
    'Id': nodes,
    'Type': node_types,
    'Label': node_titles,
    'new_paper': nodes_new_paper,
})
df_nodes.to_csv('full_nodes.csv', index=False)

In [None]:
# Build the edge table: one (Source, Target) pair per shared reference, pointing
# from the citing paper's PubMed ID to the referenced PubMed ID.
edge_pairs = [
    (int(pmid), ref)
    for pmid, reflist in zip(df_sub_has_pubmedid['pubmed_id'],
                             df_sub_has_pubmedid['sharedrefs'])
    for ref in reflist
]
sources = [source for source, _ in edge_pairs]
targets = [target for _, target in edge_pairs]

df_edges = pd.DataFrame(edge_pairs, columns=['Source', 'Target'])
df_edges.to_csv('full_edges.csv', index=False)