**Gathers studies from the [HCA Publication list](https://www.humancellatlas.org/publications/#), [CZ CELLxGENE Discover](https://cellxgene.cziscience.com/), the [HCA Data Portal](https://data.humancellatlas.org/), and (unofficial) [Bionetwork-specific dataset lists](https://docs.google.com/spreadsheets/d/1AHsovASiXq_woqhUDE-_Yl69qPpIWdRvWImy_3P_-K4/edit?usp=sharing).<br>
Identifies outdated or invalid DOIs associated with those studies, and then unifies across resources based on common DOIs.**

A version of the output can be found in [this Google Sheet](https://docs.google.com/spreadsheets/d/1o68QT8HiDihx2dMK-zR8KLgL0gJa_750-Py4cv6vXk8/edit?usp=sharing).

In [None]:
import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote

In [None]:
def doi_checker(doi):
    """Look up a DOI in Crossref and summarise the work it resolves to.

    When Crossref lists the record as a preprint of another work
    ('is-preprint-of' relation), the published work is fetched and its
    metadata is reported instead; if that second lookup fails, the
    preprint's own metadata is kept.

    Args:
        doi: the DOI to look up (converted with str() for the request).

    Returns:
        dict with:
          'DOI'           the value passed in
          'invalid DOI'   'yes' if Crossref did not answer 200, else ''
        and, for a valid DOI:
          'updated_doi'   only present when Crossref reports a different
                          DOI string or a published version of a preprint
          'Title'         first title with markup removed ('' if none)
          'Journal'       container title, else institution name, else
                          group title (None if none of these exist)
          'Year'          only present when 'published' is reported
          'First authors' comma-joined names of authors whose sequence
                          is 'first'
    """
    works_url = 'https://api.crossref.org/works/'
    out = {'DOI': doi, 'invalid DOI': ''}

    r = requests.get(works_url + str(doi))
    if r.status_code != 200:
        out['invalid DOI'] = 'yes'
        return out
    message = r.json()['message']

    if message['DOI'].lower() != str(doi).lower():
        out['updated_doi'] = message['DOI']

    # 'relation' is not guaranteed to be present, so never index it directly.
    published = (message.get('relation') or {}).get('is-preprint-of')
    if published:
        published_doi = published[0]['id']
        out['updated_doi'] = published_doi
        r = requests.get(works_url + str(published_doi))
        # Only switch to the published record if it could actually be fetched.
        if r.status_code == 200:
            message = r.json()['message']

    titles = message.get('title') or ['']
    out['Title'] = cleanhtml(titles[0])

    if message.get('container-title'):
        out['Journal'] = message['container-title'][0]
    elif message.get('institution'):
        out['Journal'] = message['institution'][0]['name']
    else:
        out['Journal'] = message.get('group-title')

    if 'published' in message:
        out['Year'] = message['published']['date-parts'][0][0]

    first_auths = []
    for a in message.get('author', []):
        if a.get('sequence') != 'first':
            continue
        # Some authors carry a single 'name'; others may lack 'given' or 'family'.
        name = a.get('name') or ' '.join(
            part for part in (a.get('given'), a.get('family')) if part
        )
        if name:
            first_auths.append(name)
    out['First authors'] = ','.join(first_auths)

    return out

In [None]:
# Matches any single tag-like span "<...>" (non-greedy) so it can be removed.
CLEANR = re.compile(r'<.*?>')


def cleanhtml(raw_html):
    """Return raw_html with tags removed and whitespace/dashes normalised.

    Tags matched by CLEANR are deleted, newlines are removed, runs of
    whitespace collapse to a single space, and en dashes become '-'.

    Args:
        raw_html: string that may contain markup.

    Returns:
        The cleaned string.
    """
    cleantext = CLEANR.sub('', raw_html).replace('\n', '')
    cleantext = ' '.join(cleantext.split())
    # '‚Äì' looks like an en dash that was mis-decoded; it is still replaced
    # for backward compatibility, alongside the real en dash (U+2013).
    for dash in ('\u2013', '‚Äì'):
        cleantext = cleantext.replace(dash, '-')
    return cleantext

In [None]:
def check_pubs():
    """Scrape the DOIs listed on the HCA publications page.

    Every <p class="links"> element whose text contains 'DOI: ' contributes
    the text between that marker and the next space.

    Returns:
        DataFrame with a 'DOI' column and an 'HCA Publication' column set
        to 'yes' on every row.
    """
    response = requests.get('https://www.humancellatlas.org/publications/#')
    soup = BeautifulSoup(response.content, 'html.parser')

    DOIs = []
    for paragraph in soup.find_all('p', class_='links'):
        _, marker, remainder = paragraph.get_text().partition('DOI: ')
        if marker:
            # keep only the token directly after the marker
            DOIs.append(remainder.split(' ')[0])

    pubs = pd.DataFrame(DOIs, columns=['DOI'])
    pubs['HCA Publication'] = 'yes'
    return pubs

In [None]:
def check_cxg():
    """Fetch all collections from the CELLxGENE curation API.

    For each collection, links whose URL points at an HCA Data Portal
    project page are gathered, together with their link types.

    Returns:
        DataFrame with one row per collection: 'DOI', 'CELLxGENE ID',
        'CELLxGENE URL', and the comma-joined portal URLs / link types.
    """
    endpoint = 'https://api.cellxgene.cziscience.com/curation/v1/collections'
    response = requests.get(
        endpoint,
        headers={'Content-Type': 'application/json'},
        params={},
    )
    portal_marker = 'data.humancellatlas.org/explore/projects/'

    rows = []
    for collection in response.json():
        portal_links = [
            link for link in collection['links']
            if portal_marker in link['link_url']
        ]
        rows.append({
            'DOI': collection['doi'],
            'CELLxGENE ID': collection['collection_id'],
            'CELLxGENE URL': collection['collection_url'],
            'HCA Portal URL (via CxG)': ','.join(
                link['link_url'] for link in portal_links
            ),
            'HCA Portal URL types (via CxG)': ','.join(
                link['link_type'] for link in portal_links
            ),
        })

    return pd.DataFrame(rows)

In [None]:
#https://service.azul.data.humancellatlas.org/#/
def check_dcp():
    """List HCA Data Portal projects (catalog dcp31) with their publication DOIs.

    Follows the 'next' pagination link of the Azul projects index until it
    is empty. A project yields one row per publication; a project without
    publications yields a single row with no 'DOI' value.

    Returns:
        DataFrame with 'DOI' (where available), 'HCA Portal ID' and
        'HCA Portal URL'.
    """
    portal_prefix = 'https://data.humancellatlas.org/explore/projects/'
    proj_endpoint = 'https://service.azul.data.humancellatlas.org/index/projects?catalog=dcp31'

    page = requests.get(proj_endpoint).json()
    projects = page['hits']
    while page['pagination']['next']:
        page = requests.get(page['pagination']['next']).json()
        projects.extend(page['hits'])

    rows = []
    for hit in projects:
        project = hit['projects'][0]
        project_id = project['projectId']
        portal_fields = {
            'HCA Portal ID': project_id,
            'HCA Portal URL': portal_prefix + project_id,
        }
        if not project['publications']:
            rows.append(portal_fields)
            continue
        for publication in project['publications']:
            # str() turns a missing DOI into the text 'None'
            rows.append({'DOI': str(publication['doi']).strip('.'), **portal_fields})

    return pd.DataFrame(rows)

In [None]:
# DOIs scraped from the HCA publications page
hcapubs = check_pubs()
print(f'{len(hcapubs)} HCA Publications')

In [None]:
# one row per CELLxGENE collection
cxg = check_cxg()
print(f'{len(cxg)} CELLxGENE Collections')

In [None]:
dcp = check_dcp()

# Manual DOI fixes keyed by HCA Portal project ID. 'current' is the string
# form of the DOI value the fix applies to: 'None' is what check_dcp stores
# for a publication without a DOI, and 'nan' is how a row with no DOI value
# at all reads after str conversion. 'updated' is the DOI to store instead.
doi_corrections_dcp = {
    '77dedd59-1376-4887-9bca-dc42b56d5b7a': {
        'current': '0.1016/j.cell.2022.09.039',
        'updated': '10.1016/j.cell.2022.09.039'
    },
    'c844538b-8854-4a95-bd01-aacbaf86d97f': {
        'current': '10.1073/pnas.22042691',
        'updated': '10.1073/pnas.2204269120'
    },
    'ede2e0b4-6652-464f-abbc-0b2d964a25a0': {
        'current': '10.1073',
        'updated': '10.1073/pnas.2025813118'
    },
    'e255b1c6-1143-4fa6-83a8-528f15b41038': {
        'current': 'None',
        'updated': '10.1038/s41556-023-01108-w'
    },
    '8559a8ed-5d8c-4fb6-bde8-ab639cebf03c': {
        'current': 'None',
        'updated': '10.1371/journal.pone.0263005'
    },
    '41fb1734-a121-4616-95c7-3b732c9433c7': {
        'current': 'nan',
        'updated': '10.1126/sciimmunol.aba9953'
    },
    'f83165c5-e2ea-4d15-a5cf-33f3550bffde': {
        'current': 'nan',
        'updated': '10.1038/s41586-018-0698-6'
    },
    '77780d56-03c0-481f-aade-2038490cef9f': {
        'current': 'nan',
        'updated': '10.1016/j.xgen.2022.100142'
    },
    '116965f3-f094-4769-9d28-ae675c1b569c': {
        'current': 'nan',
        'updated': '10.1126/sciimmunol.abg5539'
    },
    'a2a2f324-cf24-409e-a859-deaee871269c': {
        'current': 'nan',
        'updated': '10.1016/j.crmeth.2022.100325'
    },
    'aff9c3cd-6b84-4fc2-abf2-b9c0b3038277': {
        'current': 'nan',
        'updated': '10.1016/j.crmeth.2022.100325'
    },
    'c302fe54-d22d-451f-a130-e24df3d6afca': {
        'current': 'nan',
        'updated': '10.1016/j.gpb.2020.11.002'
    },
    '5b910a43-7fb5-4ea7-b9d6-43dbd1bf2776': {
        'current': 'None',
        'updated': '10.3389/fendo.2022.874915'
    },
    '9a23ac2d-93dd-4bac-9bb8-040e6426db9d': {
        'current': 'nan',
        'updated': '10.1101/2022.02.14.480397'
    },
    '78b2406d-bff2-46fc-8b61-20690e602227': {
        'current': 'None',
        'updated': '10.1016/j.cell.2016.11.010'
    },
    '3cfcdff5-dee1-4a7b-a591-c09c6e850b11': {
        'current': 'None',
        'updated': '10.3389/fimmu.2021.689019'
    },
    'e526d91d-cf3a-44cb-80c5-fd7676b55a1d': {
        'current': 'None',
        'updated': '10.1038/s41591-021-01332-7'
    }
}

for project_id, fix in doi_corrections_dcp.items():
    # Touch only the rows of this project that still hold the known-bad value:
    # other publications of the same project keep their DOI, and a project
    # that is no longer in the catalog simply reports "no update".
    stale = (dcp['HCA Portal ID'] == project_id) & (dcp['DOI'].astype(str) == fix['current'])
    if stale.any():
        print('updating DOI for DCP project ' + project_id)
        dcp.loc[stale, 'DOI'] = fix['updated']
    else:
        print('no update for DCP project ' + project_id)

# any remaining textual 'None' DOI becomes a real missing value
dcp.replace({'DOI': {'None': np.nan}}, inplace=True)
print(str(dcp.shape[0]) + ' HCA Portal Projects')

In [None]:
# pub_info collects doi_checker() results; checked_dois records every DOI
# already looked up so later cells do not query it again
pub_info, checked_dois = [], []

In [None]:
#pull publication info from CrossRef
merged = hcapubs.merge(cxg, on='DOI', how='outer').merge(dcp, on='DOI', how='outer')

# dropna(): a missing DOI (NaN) is truthy and would otherwise be looked up
# as the literal DOI "nan"
for doi in merged['DOI'].dropna().unique():
    if doi and doi not in checked_dois:
        pub_info.append(doi_checker(doi))
        checked_dois.append(doi)

# also look up the DOIs that the first pass reported as replacements;
# the 'updated_doi' column only exists if at least one lookup produced one
updated_dois = pd.DataFrame(pub_info).get('updated_doi', pd.Series(dtype=object))
for doi in updated_dois.dropna().unique():
    if doi and doi not in checked_dois:
        pub_info.append(doi_checker(doi))
        checked_dois.append(doi)
pub_df = pd.DataFrame(pub_info)
pub_df

In [None]:
#inspect all invalid DOIs
merged.loc[
    merged['DOI'].isin(
        pub_df.loc[pub_df['invalid DOI'] == 'yes', 'DOI'].tolist()
    )
]

In [None]:
#update each resource with updated DOIs
update_df = pub_df[pub_df['updated_doi'].notna()]
# maps each looked-up DOI to the DOI that should replace it
need_update = dict(zip(update_df['DOI'], update_df['updated_doi']))

for resource, label in (
    (hcapubs, 'HCA Publication'),
    (cxg, 'CELLxGENE'),
    (dcp, 'HCA Portal'),
):
    update_count = int(resource['DOI'].isin(need_update.keys()).sum())
    if update_count != 0:
        print(f"Updating {update_count} {label} DOIs")
        resource.replace({'DOI': need_update}, inplace=True)

In [None]:
#add identifiers for pre-publication studies shared across resources
# each label stands in for a DOI and names the CELLxGENE collection ('cxg')
# and HCA Portal project ('dcp') that receive it
pseudo_dois = {
    'AIDA': {
        'cxg': 'ced320a1-29f3-47c1-a735-513c7084d508',
        'dcp': 'f0f89c14-7460-4bab-9d42-22228a91f185'
    },
    'Tumor_Nephrectomy_UofM': {
        'cxg': 'a98b828a-622a-483a-80e0-15703678befd',
        'dcp': '29ed827b-c539-4f4c-bb6b-ce8f9173dfb7'
    },
    'Pediatric_Pilocytic_Astrocytomas': {
        'cxg': '10bf5c50-8d85-4c5f-94b4-22c1363d9f31',
        'dcp': '575c0ad9-c78e-469b-9fdf-9a68dd881137'
    }
}

for label, ids in pseudo_dois.items():
    cxg.loc[cxg['CELLxGENE ID'].eq(ids['cxg']), 'DOI'] = label
    dcp.loc[dcp['HCA Portal ID'].eq(ids['dcp']), 'DOI'] = label

In [None]:
#collapse each on DOI
# rows sharing a non-missing DOI are merged into one row whose other columns
# are comma-joined; rows with a unique or missing DOI are kept unchanged
cxg_has_doi = cxg['DOI'].notna()
cxg_repeated = cxg.duplicated(subset='DOI', keep=False)
mult = cxg[cxg_repeated & cxg_has_doi].groupby('DOI').agg(','.join).reset_index()
cxg = pd.concat([cxg[~cxg_repeated | ~cxg_has_doi], mult])

dcp_has_doi = dcp['DOI'].notna()
dcp_repeated = dcp.duplicated(subset='DOI', keep=False)
mult = dcp[dcp_repeated & dcp_has_doi].groupby('DOI').agg(','.join).reset_index()
dcp = pd.concat([dcp[~dcp_repeated | ~dcp_has_doi], mult])

hcapubs.drop_duplicates(inplace=True)

In [None]:
#inspect CELLxGENE Collections that need HCA Portal URL added
cxg.loc[
    cxg['DOI'].isin(dcp['DOI']) & cxg['HCA Portal URL (via CxG)'].eq('')
].merge(dcp[['DOI', 'HCA Portal ID']], on='DOI', how='left')

In [None]:
#bring in the bionetwork lists
atlas_sheet = '1AHsovASiXq_woqhUDE-_Yl69qPpIWdRvWImy_3P_-K4'
tissue_frames = []
for tissue in ['kidney-1','lung-1','retina-1','gut-1','blood-1','brain-1','skin-2','craniofacial-2','breast','adipose','tonsil']:
    url = f'https://docs.google.com/spreadsheets/d/{atlas_sheet}/gviz/tq?tqx=out:csv&sheet={quote(tissue)}'
    df = pd.read_csv(url)[['DOI']].dropna(subset=['DOI']).drop_duplicates()
    # the tab name minus any '-<n>' suffix is used as the atlas label
    df['tissue atlas(es)'] = tissue.split('-')[0]
    tissue_frames.append(df)
# one row per DOI, with every atlas that lists it comma-joined
atlas_df = pd.concat(tissue_frames).groupby('DOI', as_index=False).agg(','.join)
print(f'{atlas_df.shape[0]} studies from Bionetwork lists')

In [None]:
# look up atlas DOIs that have not been checked yet; pseudo DOI labels are
# skipped
for doi in atlas_df['DOI'].unique():
    if not doi or doi in checked_dois or doi in pseudo_dois.keys():
        continue
    pub_info.append(doi_checker(doi))
    checked_dois.append(doi)
pub_df = pd.DataFrame(pub_info)
pub_df

In [None]:
#check for DOI updates needed in atlas sheets
pub_df.loc[
    pub_df['DOI'].isin(atlas_df['DOI'])
    & (pub_df['updated_doi'].notna() | pub_df['invalid DOI'].eq('yes'))
]

In [None]:
#create the merged table
final_cols = ['DOI','Title','Journal','Year','First authors',
              'HCA Publication','CELLxGENE ID','CELLxGENE URL',
              'HCA Portal ID','HCA Portal URL', 'tissue atlas(es)']

# rows without a DOI cannot be joined on it, so they are set aside and
# appended after the merge
cxg_has_doi = cxg['DOI'].notna()
no_nan_cxg = cxg[cxg_has_doi]
nan_cxg = cxg[~cxg_has_doi]

dcp_has_doi = dcp['DOI'].notna()
no_nan_dcp = dcp[dcp_has_doi]
nan_dcp = dcp[~dcp_has_doi]

all_dois = no_nan_cxg.merge(no_nan_dcp, on='DOI', how='outer')
for resource in (hcapubs, atlas_df):
    all_dois = all_dois.merge(resource, on='DOI', how='outer')
all_dois = all_dois.merge(pub_df, on='DOI', how='left')
all_dois = pd.concat([all_dois, nan_cxg, nan_dcp]).fillna('')

# rows with a pseudo DOI or no DOI are flagged so they sort first
flagged_dois = list(pseudo_dois.keys()) + ['']
all_dois.loc[all_dois['DOI'].isin(flagged_dois), 'top'] = 'yes'
all_dois.sort_values('top', inplace=True)
all_dois = all_dois[final_cols]
all_dois