In [1]:
import pandas as pd
import os
import psycopg2 
from urllib.parse import urlparse
from dotenv import load_dotenv
from uuid import NAMESPACE_URL, uuid5
import requests

In [2]:
load_dotenv()
DATABASE_URL = os.getenv("DATABASE_URL")
result = urlparse(DATABASE_URL)
username = result.username
password = result.password
database = result.path[1:]
hostname = result.hostname
port = result.port

In [3]:
connection = psycopg2.connect(
    database = database,
    user = username,
    password = password,
    host = hostname,
    port = port
)

In [4]:
cols = ['id', 'Program Full Name', 'Program', 'Description',  'URL', 'logo link']

In [7]:
dcc_df = pd.read_csv("data/DCC.tsv", sep="\t")
dcc_df.id = ""
for i,row in dcc_df.iterrows():
    dcc_df.at[i, "id"] = uuid5(NAMESPACE_URL, row["Program"])
    extension = dcc_df.at[i, "logo link"].split("/")[-1].split(".")[-1]
    r = requests.get(dcc_df.at[i, "logo link"], allow_redirects=True)
    open("../drc-portals/public/img/%s.%s"%(row["Program"], extension), 'wb').write(r.content)
    dcc_df.at[i, "logo link"] = "img/%s.%s"%(row["Program"], extension)
dcc_df = dcc_df[cols]
dcc_df.to_csv('output/DCC.tsv', sep="\t", header=False, index=False)
dcc_df.head()

  dcc_df.at[i, "id"] = uuid5(NAMESPACE_URL, row["Program"])


Unnamed: 0,id,Program Full Name,Program,Description,URL,logo link
0,d6bb00c3-7224-5001-b9c5-9838622fba40,4D Nucleome,4DN,The Common Fund’s 4D Nucleome program aims to ...,https://www.4dnucleome.org/,img/4DN.png
1,f65babf7-2875-5725-9635-210d654533f1,Extracellular RNA Communication,ExRNA,The Extracellular RNA Communication program is...,https://exrna.org/,img/ExRNA.png
2,65af85ae-82d5-5b81-bc66-6bddaa6420ce,Gabriella Miller Kids First Pediatric Research,Kids First,The goal of the Gabriella Miller Kids First Pe...,https://kidsfirstdrc.org/,img/Kids First.png
3,b3028db2-209c-5862-8f4d-33c5b312332e,Genotype Tissue Expression,GTEx,The Genotype-Tissue Expression (GTEx) project ...,https://www.gtexportal.org/home/,img/GTEx.png
4,803ad44d-e7a2-550a-95c6-57855bf06be8,Human BioMolecular Atlas Program,HuBMAP,The goal of the Human BioMolecular Atlas Progr...,https://hubmapconsortium.org/,img/HuBMAP.png


In [8]:
cur = connection.cursor()
cur.execute('''
  create table dcc_tmp
  as table dccs
  with no data;
''')

with open('output/DCC.tsv', 'r') as fr:
    cur.copy_from(fr, 'dcc_tmp',
      columns=('id', 'label', 'short_label', 'description', 'homepage', 'icon'),
      null='',
      sep='\t',
    )

In [9]:
cur.execute('''
    insert into dccs (id, label, short_label, description, homepage, icon)
      select id, label, short_label, description, homepage, icon
      from dcc_tmp
      on conflict (id)
        do update
        set label = excluded.label,
            short_label = excluded.short_label,
            description = excluded.description,
            homepage = excluded.homepage,
            icon = excluded.icon
    ;
  ''')
cur.execute('drop table dcc_tmp;')
connection.commit()

In [10]:
cur.close()

In [11]:
dcc_mapper = {
    'KF': '65af85ae-82d5-5b81-bc66-6bddaa6420ce',
    'GTEx': 'b3028db2-209c-5862-8f4d-33c5b312332e',
    'HuBMAP': '803ad44d-e7a2-550a-95c6-57855bf06be8',
    'IDG': 'a1289ebb-0306-59a1-b0fc-e4d03a4790d7',
    'LINCS': 'f3f490cf-fd69-579c-8ea3-472c7cf3fb59',
    'Metabolomics': '089d8d63-3364-526f-9706-80d62d0ec88c',
    'SPARC': '2399794e-74c6-5735-a039-0782cdeeb1e2',
}

In [61]:
publications_df = pd.read_csv("data/publications.tsv", sep="\t")
cols = ["id", "Title", "Year", "Page", "Volume", "Issue", "Journal", "pmid", "PMCID", "DOI", "Authors"]
publications_df.id = ""
for i, row in publications_df.iterrows():
    publications_df.at[i, "id"] = uuid5(NAMESPACE_URL, row["Title"])
    publications_df.at[i, "program"] = dcc_mapper[publications_df.at[i, "program"]]
publications_df[cols].groupby('id').first().to_csv("output/publications.tsv", sep="\t", header=False)
publications_df[cols].head()

Unnamed: 0,id,Title,Year,Page,Volume,Issue,Journal,pmid,PMCID,DOI,Authors
0,b2edba84-bdd1-5f3f-855f-8a45e88d98cf,Toxicology knowledge graph for structural birt...,2023,98,3,1,Communications medicine,37460679,PMC10352311,10.1038/s43856-023-00329-2,"Evangelista JE, Clarke DJB, Xie Z, Marino GB, ..."
1,b3b12752-8f5c-5e5e-8dd0-e46afbc83b4a,Pan-African genome demonstrates how population...,2022,4384,13,1,Nature communications,35927245,PMC9352875,10.1038/s41467-022-31724-3,"Tetikol HS, Turgut D, Narci K, Budak G, Kalay ..."
2,aca50121-fcde-5541-9277-8adf75152db6,RNAget: an API to securely retrieve RNA quanti...,2023,,39,4,"Bioinformatics (Oxford, England)",36897015,PMC10081869,10.1093/bioinformatics/btad126,"Upchurch S, Palumbo E, Adams J, Bujold D, Bour..."
3,2337025f-3e71-50ef-850a-a37a4c7fe7ce,3D virtual reality vs. 2D desktop registration...,2021,e0258103,16,10,PloS one,34705835,PMC8550408,10.1371/journal.pone.0258103,"Bueckle A, Buehling K, Shih PC, Borner K"
4,7d2732f5-4a7e-5b28-a096-7db93ad1946b,Tissue registration and exploration user inter...,2022,1369,5,1,Communications biology,36513738,PMC9747802,10.1038/s42003-022-03644-x,"Borner K, Bueckle A, Herr BW 2nd, Cross LE, Qu..."


In [66]:
publications_df[["id", "program"]].to_csv("output/dcc_publications.tsv", sep="\t", header=False, index=False)

In [56]:
cur = connection.cursor()
cur.execute('''
  create table publication_tmp
  as table publications
  with no data;
''')

with open('output/publications.tsv', 'r') as fr:
    cur.copy_from(fr, 'publication_tmp',
      columns=("id", "title", "year", "page", "volume", "issue", "journal", "pmid", "pmcid", "doi", "authors"),
      null='',
      sep='\t',
    )

In [57]:
cur.execute('''
    insert into publications (id, title, year, page, volume, issue, journal, pmid, pmcid, doi, authors)
      select id, title, year, page, volume, issue, journal, pmid, pmcid, doi, authors
      from publication_tmp
      on conflict (id)
        do update
        set id = excluded.id,
            title = excluded.title,
            year = excluded.year,
            page = excluded.page,
            volume = excluded.volume,
            issue = excluded.issue,
            journal = excluded.journal,
            pmid = excluded.pmid,
            pmcid = excluded.pmcid,
            doi = excluded.doi,
            authors = excluded.authors
    ;
  ''')
cur.execute('drop table publication_tmp;')
connection.commit()

In [58]:
cur.close()


In [59]:
publications_df = pd.read_csv("data/publications.tsv", sep="\t")
publications_df.head()    


Unnamed: 0,program,coreproject,AwardType,Title,Year,Page,Volume,Issue,Journal,pmid,PMCID,DOI,Authors
0,KF,OT2OD030162,CFDE,Toxicology knowledge graph for structural birt...,2023,98,3,1,Communications medicine,37460679,PMC10352311,10.1038/s43856-023-00329-2,"Evangelista JE, Clarke DJB, Xie Z, Marino GB, ..."
1,KF,OT2OD030162,CFDE,Pan-African genome demonstrates how population...,2022,4384,13,1,Nature communications,35927245,PMC9352875,10.1038/s41467-022-31724-3,"Tetikol HS, Turgut D, Narci K, Budak G, Kalay ..."
2,GTEx,OT2OD030161,CFDE,RNAget: an API to securely retrieve RNA quanti...,2023,,39,4,"Bioinformatics (Oxford, England)",36897015,PMC10081869,10.1093/bioinformatics/btad126,"Upchurch S, Palumbo E, Adams J, Bujold D, Bour..."
3,HuBMAP,OT2OD030545,CFDE,3D virtual reality vs. 2D desktop registration...,2021,e0258103,16,10,PloS one,34705835,PMC8550408,10.1371/journal.pone.0258103,"Bueckle A, Buehling K, Shih PC, Borner K"
4,HuBMAP,OT2OD030545,CFDE,Tissue registration and exploration user inter...,2022,1369,5,1,Communications biology,36513738,PMC9747802,10.1038/s42003-022-03644-x,"Borner K, Bueckle A, Herr BW 2nd, Cross LE, Qu..."


In [72]:
cur = connection.cursor()
cur.execute('''
  create table dcc_publication_tmp
  as table dcc_publications
  with no data;
''')

with open('output/dcc_publications.tsv', 'r') as fr:
    cur.copy_from(fr, 'dcc_publication_tmp',
      columns=("publication_id", "dcc_id"),
      null='',
      sep='\t',
    )

In [73]:
cur.execute('''
    insert into dcc_publications (publication_id, dcc_id)
      select publication_id, dcc_id
      from dcc_publication_tmp
      on conflict 
        do nothing
    ;
  ''')
cur.execute('drop table dcc_publication_tmp;')
connection.commit()

In [13]:
cur.close()
connection.close()

In [25]:
import json

In [35]:
dcc = pd.read_csv('ingest/DCC.tsv', sep="\t", header=None)
mapper = {}
for k,v in dcc.iterrows():
    mapper[v[2]] = v[0]
mapper

{'4DN': 'd6bb00c3-7224-5001-b9c5-9838622fba40',
 'ExRNA': 'f65babf7-2875-5725-9635-210d654533f1',
 'Kids First': '65af85ae-82d5-5b81-bc66-6bddaa6420ce',
 'GTEx': 'b3028db2-209c-5862-8f4d-33c5b312332e',
 'HuBMAP': '803ad44d-e7a2-550a-95c6-57855bf06be8',
 'IDG': 'a1289ebb-0306-59a1-b0fc-e4d03a4790d7',
 'LINCS': 'f3f490cf-fd69-579c-8ea3-472c7cf3fb59',
 'Metabolomics': '089d8d63-3364-526f-9706-80d62d0ec88c',
 'MoTrPAC': 'a9aeab22-4fbc-5329-aef6-21110f463c23',
 'SPARC': '2399794e-74c6-5735-a039-0782cdeeb1e2',
 'HMP': 'cbfd44b8-684d-56b9-bfd4-45c0e259f896',
 'Glycoscience': 'e31052b0-ac50-5ede-9828-698ff3610427',
 'UDN': '7be6ce16-142e-508e-a31f-108e70ce72c2',
 'KOMP2': '830ddbac-bf21-5612-af1a-75c713045299',
 'A2CPS': 'e332dadd-8084-5fbc-be41-29d75775aab3',
 'SenNet': 'dd66e8a5-0e05-5a43-a0ca-18cc3698bb36',
 'Bridge2AI': '75b3be39-a021-5d80-b7e2-2a7938a1e11a',
 'iHMP': 'dafdfc72-42c6-5c35-ad56-0b2d284431d1',
 'H3Africa': '138b48df-8867-5d61-87c5-98cf924c60b9'}

In [47]:

outreach = pd.read_csv('data/outreach.tsv', sep="\t")
outreach.id = ""
outreach["active"] = True

for k,v in outreach.iterrows():
    outreach.at[k,"id"] = str(uuid5(NAMESPACE_URL, v["title"]))
    outreach.at[k, "dcc"] = mapper[v["dcc"]]
    outreach.at[k, "tag"] = json.dumps([v["tag"]])
outreach

  outreach.at[k,"id"] = str(uuid5(NAMESPACE_URL, v["title"]))


Unnamed: 0,title,short_description,description,tag,featured,start_date,end_date,start_time,end_time,link,image,dcc,active,id
0,2024 HuBMAP Underrepresented Student Internshi...,The Underrepresented Student Internship Progra...,Applications will be accepted from November 27...,"[""internship""]",True,05/20/2024,08/16/2024,,,https://hubmapconsortium.org/internship-program/,/img/hubmap_internship.png,803ad44d-e7a2-550a-95c6-57855bf06be8,True,64647877-fecd-5514-bdda-dcdbbe6fa4ba
1,Summer Research Program in Biomedical Big Data...,LINCS’ Summer Research Training Program in Bio...,LINCS’ Summer Research Training Program in Bio...,"[""internship""]",False,06/03/2024,08/09/2024,,,https://labs.icahn.mssm.edu/maayanlab/summer-r...,/img/lincs_internship.png,f3f490cf-fd69-579c-8ea3-472c7cf3fb59,True,2adfed62-834f-5984-bd23-2f1029dda967
2,Coursera Course: Big Data Science with the BD2...,In this course the LINCS DCC covers computatio...,In this course the LINCS DCC introduces the da...,"[""online course""]",False,,,,,https://www.coursera.org/learn/bd2k-lincs,/img/lincs_coursera.png,f3f490cf-fd69-579c-8ea3-472c7cf3fb59,True,914748db-5159-5fde-96a6-a966bdfa70b6
3,SenNet + HOA - Hacking the Human Vasculature i...,The goal of this competition is to segment blo...,The “SenNet + HOA - Hacking the Human Vasculat...,"[""online competition""]",False,11/07/2023,02/06/2024,,,https://www.kaggle.com/competitions/blood-vess...,/img/blood-vessel-segmentation.png,dd66e8a5-0e05-5a43-a0ca-18cc3698bb36,True,bf9a64ab-51a9-566a-8f27-1bd2694ab18b
4,GlyGen & CFDE Workshop Summer 2023,This workshop provides an opportunity to lear...,The purpose of this workshop is to bring toget...,"[""workshop""]",True,08/07/2023,,,,https://wiki.glygen.org/GlyGen_CFDE_Workshop_S...,/img/glygen-2023-workshop.png,e31052b0-ac50-5ede-9828-698ff3610427,True,cda5c73a-3bcb-55e5-80bf-d91d3f220bbf
5,HuBMAP Visible Human MOOC (VHMOOC),This 10h course introduces the HuBMAP project ...,This 10h course introduces the HuBMAP project ...,"[""online course""]",False,,,,,https://expand.iu.edu/browse/sice/cns/courses/...,/img/hubmap_mooc.png,803ad44d-e7a2-550a-95c6-57855bf06be8,True,79cf307e-bad9-5594-9c2c-a07a6453b5a0


In [48]:
cols = ['title', 'short_description', 'description', 'tag', 'featured',
       'start_date', 'end_date', 'start_time', 'end_time', 'link', 'image',
       'dcc', 'id', 'active']
outreach[['id', 'title', 'short_description', 'description', 'tag', 'featured','active',
       'start_date', 'end_date', 'start_time', 'end_time', 'link', 'image']].to_csv("output/outreach.tsv", sep="\t", header=None, index=None)

In [49]:
outreach[["id", "dcc"]].to_csv("output/dcc_outreach.tsv", sep="\t", header=None, index=None)

In [50]:
outreach[["id", "dcc"]]

Unnamed: 0,id,dcc
0,64647877-fecd-5514-bdda-dcdbbe6fa4ba,803ad44d-e7a2-550a-95c6-57855bf06be8
1,2adfed62-834f-5984-bd23-2f1029dda967,f3f490cf-fd69-579c-8ea3-472c7cf3fb59
2,914748db-5159-5fde-96a6-a966bdfa70b6,f3f490cf-fd69-579c-8ea3-472c7cf3fb59
3,bf9a64ab-51a9-566a-8f27-1bd2694ab18b,dd66e8a5-0e05-5a43-a0ca-18cc3698bb36
4,cda5c73a-3bcb-55e5-80bf-d91d3f220bbf,e31052b0-ac50-5ede-9828-698ff3610427
5,79cf307e-bad9-5594-9c2c-a07a6453b5a0,803ad44d-e7a2-550a-95c6-57855bf06be8


In [13]:
import boto3
from botocore.exceptions import ClientError
from glob import glob


In [14]:
def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = os.path.basename(file_name)

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        response = s3_client.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        print(e)
        return False
    return True


In [15]:
bucket = 'cfde-drc'

In [22]:
for f in glob("output/*"):
    filename = f.replace('output', 'database/110723')
    print(filename)
    upload_file(f,bucket, filename)

database/110723/DCC.tsv
database/110723/dcc_outreach.tsv
database/110723/dcc_publications.tsv
database/110723/outreach.tsv
database/110723/publications.tsv


## Database Ingestion

You can run this part to ingest the db entries. Make sure you download the contents of cfde-drc/database/<current date> and put it in ingest/ folder

In [84]:
import pandas as pd
import os
import psycopg2
import pathlib
from urllib.parse import urlparse
from dotenv import load_dotenv
from uuid import NAMESPACE_URL, uuid5

In [None]:
# load .env from drc-portals potentially
load_dotenv('../drc-portals/.env')
load_dotenv()
DATABASE_URL = os.getenv("DATABASE_URL")
result = urlparse(DATABASE_URL)
username = result.username
password = result.password
database = result.path[1:]
hostname = result.hostname
port = result.port

In [None]:
connection = psycopg2.connect(
    database = database,
    user = username,
    password = password,
    host = hostname,
    port = port
)

In [None]:
# Fetch data for ingest
if not pathlib.Path('ingest').exists():
  pathlib.Path('ingest').mkdir()
if not pathlib.Path('ingest/DCC.tsv').exists():
  import urllib.request
  urllib.request.urlretrieve('https://cfde-drc.s3.amazonaws.com/database/110723/DCC.tsv', 'ingest/DCC.tsv')
if not pathlib.Path('ingest/dcc_publications.tsv').exists():
  import urllib.request
  urllib.request.urlretrieve('https://cfde-drc.s3.amazonaws.com/database/110723/dcc_publications.tsv', 'ingest/dcc_publications.tsv')
if not pathlib.Path('ingest/publications.tsv').exists():
  import urllib.request
  urllib.request.urlretrieve('https://cfde-drc.s3.amazonaws.com/database/110723/publications.tsv', 'ingest/publications.tsv')

In [None]:
cur = connection.cursor()
cur.execute('''
  create table dcc_tmp
  as table dccs
  with no data;
''')

with open('ingest/DCC.tsv', 'r') as fr:
    cur.copy_from(fr, 'dcc_tmp',
      columns=('id', 'label', 'short_label', 'description', 'homepage', 'icon'),
      null='',
      sep='\t',
    )

In [None]:
cur.execute('''
    insert into dccs (id, label, short_label, description, homepage, icon)
      select id, label, short_label, description, homepage, icon
      from dcc_tmp
      on conflict (id)
        do update
        set label = excluded.label,
            short_label = excluded.short_label,
            description = excluded.description,
            homepage = excluded.homepage,
            icon = excluded.icon
    ;
  ''')
cur.execute('drop table dcc_tmp;')
connection.commit()

In [None]:
cur = connection.cursor()
cur.execute('''
  create table publication_tmp
  as table publications
  with no data;
''')

with open('ingest/publications.tsv', 'r') as fr:
    cur.copy_from(fr, 'publication_tmp',
      columns=("id", "title", "year", "page", "volume", "issue", "journal", "pmid", "pmcid", "doi", "authors"),
      null='',
      sep='\t',
    )

In [None]:
cur.execute('''
    insert into publications (id, title, year, page, volume, issue, journal, pmid, pmcid, doi, authors)
      select id, title, year, page, volume, issue, journal, pmid, pmcid, doi, authors
      from publication_tmp
      on conflict (id)
        do update
        set id = excluded.id,
            title = excluded.title,
            year = excluded.year,
            page = excluded.page,
            volume = excluded.volume,
            issue = excluded.issue,
            journal = excluded.journal,
            pmid = excluded.pmid,
            pmcid = excluded.pmcid,
            doi = excluded.doi,
            authors = excluded.authors
    ;
  ''')
cur.execute('drop table publication_tmp;')
connection.commit()

In [None]:
cur = connection.cursor()
cur.execute('''
  create table dcc_publication_tmp
  as table dcc_publications
  with no data;
''')

with open('ingest/dcc_publications.tsv', 'r') as fr:
    cur.copy_from(fr, 'dcc_publication_tmp',
      columns=("publication_id", "dcc_id"),
      null='',
      sep='\t',
    )

In [None]:
cur.execute('''
    insert into dcc_publications (publication_id, dcc_id)
      select publication_id, dcc_id
      from dcc_publication_tmp
      on conflict 
        do nothing
    ;
  ''')
cur.execute('drop table dcc_publication_tmp;')
connection.commit()

cur.close()
connection.close()

In [8]:
program = "Human Heredity & Health in Africa"
uuid5(NAMESPACE_URL, program)

UUID('138b48df-8867-5d61-87c5-98cf924c60b9')

In [1]:
import pandas as pd

In [4]:
outreach = pd.read_csv("output/outreach.tsv", sep="\t", header=None)
outreach

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,64647877-fecd-5514-bdda-dcdbbe6fa4ba,2024 HuBMAP Underrepresented Student Internshi...,The Underrepresented Student Internship Progra...,Applications will be accepted from November 27...,"[""internship""]",True,True,05/20/2024,08/16/2024,11/07/2023,02/01/2024,https://hubmapconsortium.org/internship-program/,/img/hubmap_internship.png
1,2adfed62-834f-5984-bd23-2f1029dda967,Summer Research Program in Biomedical Big Data...,LINCS’ Summer Research Training Program in Bio...,LINCS’ Summer Research Training Program in Bio...,"[""internship""]",False,True,06/03/2024,08/09/2024,,02/01/2024,https://labs.icahn.mssm.edu/maayanlab/summer-r...,/img/lincs_internship.png
2,914748db-5159-5fde-96a6-a966bdfa70b6,Coursera Course: Big Data Science with the BD2...,In this course the LINCS DCC covers computatio...,In this course the LINCS DCC introduces the da...,"[""online course""]",False,True,,,,,https://www.coursera.org/learn/bd2k-lincs,/img/lincs_coursera.png
3,bf9a64ab-51a9-566a-8f27-1bd2694ab18b,SenNet + HOA - Hacking the Human Vasculature i...,The goal of this competition is to segment blo...,The “SenNet + HOA - Hacking the Human Vasculat...,"[""online competition""]",False,True,11/07/2023,02/06/2024,,01/30/2024,https://www.kaggle.com/competitions/blood-vess...,/img/blood-vessel-segmentation.png
4,cda5c73a-3bcb-55e5-80bf-d91d3f220bbf,GlyGen & CFDE Workshop Summer 2023,This workshop provides an opportunity to lear...,The purpose of this workshop is to bring toget...,"[""workshop""]",True,True,08/07/2023,,,,https://wiki.glygen.org/GlyGen_CFDE_Workshop_S...,/img/glygen-2023-workshop.png
5,79cf307e-bad9-5594-9c2c-a07a6453b5a0,HuBMAP Visible Human MOOC (VHMOOC),This 10h course introduces the HuBMAP project ...,This 10h course introduces the HuBMAP project ...,"[""online course""]",False,True,,,,,https://expand.iu.edu/browse/sice/cns/courses/...,/img/hubmap_mooc.png


In [9]:
outreach[7] = pd.to_datetime(outreach[7]).apply(lambda dt: dt.tz_localize(tz='America/New_York').isoformat())
outreach[8] = pd.to_datetime(outreach[8]).apply(lambda dt: dt.tz_localize(tz='America/New_York').isoformat())
outreach[9] = pd.to_datetime(outreach[9]).apply(lambda dt: dt.tz_localize(tz='America/New_York').isoformat())
outreach[10] = pd.to_datetime(outreach[10]).apply(lambda dt: dt.tz_localize(tz='America/New_York').isoformat())

In [20]:
outreach.to_csv("output/outreach.tsv", header=None, sep="\t")

In [65]:
landmark = pd.read_csv("data/landmark_publications.tsv", sep="\t")
landmark.head()

Unnamed: 0,DCC,Title,Journal,Year,pmid,doi,citation
0,4DN,The 4D nucleome project,Nature,2017,28905911,10.1038/nature23884,"Dekker J, Belmont AS, Guttman M, Leshyk VO, Li..."
1,ExRNA,The NIH Extracellular RNA Communication Consor...,Journal of Extracellular Vesicles,2015,26320938,10.3402/jev.v4.27493,"Ainsztein AM, Brooks PJ, Dugan VG, Ganguly A, ..."
2,GTEx,The Genotype-Tissue Expression (GTEx) project,Nature Genetics,2013,23715323,10.1038/ng.2653,GTEx Consortium. The Genotype-Tissue Expressio...
3,GTEx,Human genomics. The Genotype-Tissue Expression...,Science,2015,25954001,10.1126/science.1262110,GTEx Consortium. Human genomics. The Genotype-...
4,HuBMAP,The human body at cellular resolution: the NIH...,Nature,2019,31597973,10.1038/s41586-019-1629-x,HuBMAP Consortium. The human body at cellular ...


In [66]:
pub_cols = ["id", "title", "year", "page", "volume", "issue", "journal", "pmid", "pmcid", "doi", "authors", "landmark"]
dcc_pub_cols = ["publication_id", "dcc_id"]

In [67]:
publications = pd.read_csv('output/publications.tsv', sep="\t", header=None)
publications["landmark"] = False
publications.columns = pub_cols
publications = publications.set_index("id")
publications.head()

Unnamed: 0_level_0,title,year,page,volume,issue,journal,pmid,pmcid,doi,authors,landmark
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2337025f-3e71-50ef-850a-a37a4c7fe7ce,3D virtual reality vs. 2D desktop registration...,2021,e0258103,16,10.0,PloS one,34705835,PMC8550408,10.1371/journal.pone.0258103,"Bueckle A, Buehling K, Shih PC, Borner K",False
27147bd8-9969-5750-a964-698000f6944a,Computational screen to identify potential tar...,2023,e13809,22,6.0,Aging cell,37082798,PMC10265163,10.1111/acel.13809,"Deng EZ, Fleishman RH, Xie Z, Marino GB, Clark...",False
29ddbfb6-4131-5874-ba94-75b664fc8335,Contribution of Circulating Host and Microbial...,2023,11786469231182510,16,,International journal of tryptophan research :...,37441265,PMC10334013,10.1177/11786469231182510,"Morgan EW, Dong F, Annalora AJ, Murray IA, Wol...",False
3370a33a-d28e-5027-a3a0-e8489a4259ad,Lactate-dependent transcriptional regulation c...,2023,4129,14,1.0,Nature communications,37452018,PMC10349100,10.1038/s41467-023-39672-2,"Takata N, Miska JM, Morgan MA, Patel P, Billin...",False
33d76b34-d7db-505e-a551-233d36740154,Modular and mechanistic changes across stages ...,2022,436,22,1.0,BMC cancer,35448980,PMC9022252,10.1186/s12885-022-09479-3,"Rahiminejad S, Maurya MR, Mukund K, Subramaniam S",False


In [68]:
dccs = pd.read_csv('output/DCC.tsv', sep="\t", header=None)
dcc_mapper = {}
for i, row in dccs.iterrows():
    dcc_mapper[row[2]] = row[0]
dcc_mapper

{'ExRNA': 'f65babf7-2875-5725-9635-210d654533f1',
 'Kids First': '65af85ae-82d5-5b81-bc66-6bddaa6420ce',
 'GTEx': 'b3028db2-209c-5862-8f4d-33c5b312332e',
 'HuBMAP': '803ad44d-e7a2-550a-95c6-57855bf06be8',
 'IDG': 'a1289ebb-0306-59a1-b0fc-e4d03a4790d7',
 'LINCS': 'f3f490cf-fd69-579c-8ea3-472c7cf3fb59',
 'Metabolomics': '089d8d63-3364-526f-9706-80d62d0ec88c',
 'MoTrPAC': 'a9aeab22-4fbc-5329-aef6-21110f463c23',
 'SPARC': '2399794e-74c6-5735-a039-0782cdeeb1e2',
 'HMP': 'cbfd44b8-684d-56b9-bfd4-45c0e259f896',
 'Glycoscience': 'e31052b0-ac50-5ede-9828-698ff3610427',
 'UDN': '7be6ce16-142e-508e-a31f-108e70ce72c2',
 'KOMP2': '830ddbac-bf21-5612-af1a-75c713045299',
 'A2CPS': 'e332dadd-8084-5fbc-be41-29d75775aab3',
 'SenNet': 'dd66e8a5-0e05-5a43-a0ca-18cc3698bb36',
 'Bridge2AI': '75b3be39-a021-5d80-b7e2-2a7938a1e11a',
 'iHMP': 'dafdfc72-42c6-5c35-ad56-0b2d284431d1',
 'H3Africa': '138b48df-8867-5d61-87c5-98cf924c60b9',
 '4DN': 'd6bb00c3-7224-5001-b9c5-9838622fba40'}

In [69]:
dcc_pubs = pd.read_csv('output/dcc_publications.tsv', sep="\t", header=None)
dcc_pubs.columns = dcc_pub_cols
dcc_pubs.head()

Unnamed: 0,publication_id,dcc_id
0,b2edba84-bdd1-5f3f-855f-8a45e88d98cf,65af85ae-82d5-5b81-bc66-6bddaa6420ce
1,b3b12752-8f5c-5e5e-8dd0-e46afbc83b4a,65af85ae-82d5-5b81-bc66-6bddaa6420ce
2,aca50121-fcde-5541-9277-8adf75152db6,b3028db2-209c-5862-8f4d-33c5b312332e
3,2337025f-3e71-50ef-850a-a37a4c7fe7ce,803ad44d-e7a2-550a-95c6-57855bf06be8
4,7d2732f5-4a7e-5b28-a096-7db93ad1946b,803ad44d-e7a2-550a-95c6-57855bf06be8


In [70]:
len(dcc_pubs.index)

30

In [71]:
lm_pmids = ",".join([str(i) for i in list(landmark.pmid)])
res = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=%s&retmode=json"%lm_pmids)
res.ok

True

In [72]:
info = res.json()["result"]

In [73]:
dcc_pub_index = len(dcc_pubs.index)
for i, row in landmark.iterrows():
    pmid = row["pmid"]
    dcc = row["DCC"]
    dcc_id = dcc_mapper[dcc]
    meta = info[str(pmid)]
    title = meta["title"]
    uid = uuid5(NAMESPACE_URL, title)
    year = meta["sortpubdate"].split("/")[0]
    page = meta["pages"]
    volume = meta["volume"]
    issue = meta["issue"]
    journal = meta["fulljournalname"]
    pmcid = ''
    doi = ''
    for i in meta["articleids"]:
        if i["idtype"] == 'pmcid':
            pmcid = i['value'].split(";")[0].replace("pmc-id: ","").strip()
        if i["idtype"] == 'doi':
            doi = i['value']
    authors = ", ".join([i["name"] for i in meta["authors"]])
    publications.loc[uid] = [
        title,
        year,
        page,
        volume,
        issue,
        journal,
        pmid,
        pmcid,
        doi,
        authors,
        True
    ]
    dcc_pubs.loc[dcc_pub_index] = [
        uid,
        dcc_id
    ]
    dcc_pub_index += 1
    

In [74]:
dcc_pubs.shape

(44, 2)

In [76]:
publications.head()

Unnamed: 0_level_0,title,year,page,volume,issue,journal,pmid,pmcid,doi,authors,landmark
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2337025f-3e71-50ef-850a-a37a4c7fe7ce,3D virtual reality vs. 2D desktop registration...,2021,e0258103,16,10.0,PloS one,34705835,PMC8550408,10.1371/journal.pone.0258103,"Bueckle A, Buehling K, Shih PC, Borner K",False
27147bd8-9969-5750-a964-698000f6944a,Computational screen to identify potential tar...,2023,e13809,22,6.0,Aging cell,37082798,PMC10265163,10.1111/acel.13809,"Deng EZ, Fleishman RH, Xie Z, Marino GB, Clark...",False
29ddbfb6-4131-5874-ba94-75b664fc8335,Contribution of Circulating Host and Microbial...,2023,11786469231182510,16,,International journal of tryptophan research :...,37441265,PMC10334013,10.1177/11786469231182510,"Morgan EW, Dong F, Annalora AJ, Murray IA, Wol...",False
3370a33a-d28e-5027-a3a0-e8489a4259ad,Lactate-dependent transcriptional regulation c...,2023,4129,14,1.0,Nature communications,37452018,PMC10349100,10.1038/s41467-023-39672-2,"Takata N, Miska JM, Morgan MA, Patel P, Billin...",False
33d76b34-d7db-505e-a551-233d36740154,Modular and mechanistic changes across stages ...,2022,436,22,1.0,BMC cancer,35448980,PMC9022252,10.1186/s12885-022-09479-3,"Rahiminejad S, Maurya MR, Mukund K, Subramaniam S",False


In [78]:
publications.to_csv('output/publications.tsv', header=None, sep="\t")
dcc_pubs.to_csv('output/dcc_publications.tsv', header=None, index=None, sep="\t")

In [81]:
dccs.head()

Unnamed: 0,0,1,2,3,4,5,6
0,f65babf7-2875-5725-9635-210d654533f1,Extracellular RNA Communication,ExRNA,The Extracellular RNA Communication program is...,https://exrna.org/,/img/exRNA.png,True
1,65af85ae-82d5-5b81-bc66-6bddaa6420ce,Gabriella Miller Kids First Pediatric Research,Kids First,The goal of the Gabriella Miller Kids First Pe...,https://kidsfirstdrc.org/,/img/Kids First.png,True
2,b3028db2-209c-5862-8f4d-33c5b312332e,Genotype Tissue Expression,GTEx,The Common Fund's Genotype-Tissue Expression (...,https://www.gtexportal.org/home/,/img/GTEx.png,True
3,803ad44d-e7a2-550a-95c6-57855bf06be8,Human BioMolecular Atlas Program,HuBMAP,The goal of the Human BioMolecular Atlas Progr...,https://hubmapconsortium.org/,/img/HuBMAP.png,True
4,a1289ebb-0306-59a1-b0fc-e4d03a4790d7,Illuminating the Druggable Genome,IDG,To improve our scientific understanding of und...,https://druggablegenome.net/,/img/IDG.png,True


In [82]:
for i, row in dccs.iterrows():
    dccs.at[i, 3] = "%s (Description was taken from: https://commonfund.nih.gov.)"%row[3]

In [89]:
dccs.to_csv('output/DCC.tsv', index=None, header=None, sep="\t")

In [87]:
dccs.shape

(19, 7)