In [None]:
import csv
import fsspec
import h5py
import pandas as pd
import requests
import sys
import time
from anndata.experimental import read_elem
from cellxgene_mods import CxG_API
from datetime import datetime
from urllib.parse import quote

sys.path.append('..')
from public_resources.public_resources import doi_checker,pubtator_search


# --- Session setup: API configuration, S3 filesystem handle, run date ---
CxG_API.config()
fs = fsspec.filesystem('s3')
today = datetime.today()

# --- Public Collections and the lookup tables derived from them ---
pub_collections = CxG_API.get_collections()
# dataset_id -> collection_id
pub_dataset_ids = {
    ds['dataset_id']: coll['collection_id']
    for coll in pub_collections
    for ds in coll['datasets']
}
pub_collection_ids = [coll['collection_id'] for coll in pub_collections]
# collection_id -> doi
pub_collection_dois = {coll['collection_id']: coll['doi'] for coll in pub_collections}
# dataset_id -> dataset_version_id
pub_dataset_ver_ids = {
    ds['dataset_id']: ds['dataset_version_id']
    for coll in pub_collections
    for ds in coll['datasets']
}

# --- Private Collections and the lookup tables derived from them ---
priv_collections = CxG_API.get_collections(visibility='PRIVATE')
# dataset_id -> collection_id
priv_dataset_ids = {
    ds['dataset_id']: coll['collection_id']
    for coll in priv_collections
    for ds in coll['datasets']
}
priv_collection_ids = [coll['collection_id'] for coll in priv_collections]

# Private Collections with no truthy 'revision_of' value
nonrev_priv_collections = list(filter(lambda coll: not coll.get('revision_of'), priv_collections))

# dataset_id -> collection_id across both visibilities; private entries are
# applied last, so they win when a dataset_id appears in both
ds_coll = {**pub_dataset_ids, **priv_dataset_ids}

# --- Shared constants ---
uuid_pattern = '[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}'
outpre = 'CxGmonthly'

# DOI review
**check for preprints that have been published**

In [None]:
# Run the DOI checker on every Collection (public or private) whose publisher
# metadata lists one of these preprint servers as the journal.
preprint_servers = ['bioRxiv', 'medRxiv']
pub_info = [
    doi_checker(coll['doi'])
    for coll in pub_collections + priv_collections
    if coll.get('publisher_metadata') and coll['publisher_metadata']['journal'] in preprint_servers
]
pub_df = pd.DataFrame(pub_info)

# Only show rows where the checker filled in the 'invalid DOI' column
if 'invalid DOI' in pub_df.columns:
    display(pub_df[pub_df['invalid DOI'].notna()])

**check for Collections that might now have DOIs**

In [None]:
# Known hits that are data reuse rather than the data-generating publication:
#   2f75d249-1bec-459b-bf2b-b86221097ced appears in 10.24272/j.issn.2095-8137.2022.531
#   6b701826-37bb-4356-9792-ff41fc4c3161 appears in 10.1038/s43587-024-00762-5

# TODO: add a bioRxiv search as well (https://api.biorxiv.org/)

# Search PubTator for each public Collection that has no DOI recorded
for coll in pub_collections:
    if coll.get('doi'):
        continue
    dois = pubtator_search(coll['collection_id'])
    if dois:
        print(coll['collection_id'], ','.join(dois))
    # pause between requests
    time.sleep(0.5)

# long-term private Collections
**pull private Collections older than a specified cut-off**

In [None]:
# Private, non-revision Collections created more than `year_cutoff` years ago
# are written to a CSV report, oldest first.
year_cutoff = 1.5

# Cut-off expressed in days so it can be compared against a timedelta's .days
day_cutoff = 365.25 * year_cutoff

sorted_collections = sorted(nonrev_priv_collections, key=lambda coll: coll['created_at'])

# `with` guarantees the file is flushed and closed even if a row fails;
# newline='' is what the csv module requires so it controls line endings itself.
with open(f'{outpre}_long_private_collections.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow([
        'collection',
        'name',
        'doi',
        'contact name',
        'contact email',
        'created at',
        'number of datasets'
    ])

    for collection in sorted_collections:
        # Only the date portion (before the 'T') of created_at is used
        created_on = datetime.strptime(collection['created_at'].split('T')[0], '%Y-%m-%d')
        gap = (today - created_on).days

        if gap > day_cutoff:
            writer.writerow([
                collection['collection_url'],
                collection['name'],
                collection['doi'],
                collection['contact_name'],
                collection['contact_email'],
                collection['created_at'],
                len(collection['datasets'])
            ])

# private URLs made public
**looked for private Collection URLs in PubMed**

In [None]:
# TODO: add a bioRxiv search as well (https://api.biorxiv.org/)

# One record per private Collection ID that PubTator returns results for;
# 'source' holds the comma-joined search results.
pubtator_res = []
for coll_id in priv_collection_ids:
    hits = pubtator_search(coll_id)
    if hits:
        pubtator_res.append({'collection_id': coll_id, 'source': ','.join(hits)})
    # pause between requests
    time.sleep(0.5)

**filter AirTable report for possible private URL sharing**

In [None]:
# Build a per-Collection report of traffic to private Collection/Dataset URLs
# from the AirTable export, merged with the PubTator hits in `pubtator_res`.
at_export = 'Sources of private collections_datasets-Grid view.csv'
df = pd.read_csv(at_export)

# The time period of the first row is the one that gets reported
month = df.loc[0]['time period']
ignore_sources = ['lattice.atlassian.net','lattice-data.org','Direct / None']

# Keep rows for that period, from a non-ignored source, whose URL contains a UUID.
# na=False makes rows with a missing URL drop out explicitly; .drop() returns a
# new frame instead of mutating a filtered slice in place.
df = df[
    (df['time period'] == month) &
    ~df['source'].isin(ignore_sources) &
    df['url'].str.contains(uuid_pattern, na=False)
].drop(columns=['dataset name','time period'])

# Collection page hits: '/collections/<collection_id>...'
# .assign() builds a new frame, so no column is set on a slice of `df`.
act = (
    df[df['url'].str.startswith('/collections/')]
    .assign(collection_id=lambda d: d['url'].str.split('/').str[2])
)
act = act[act['collection_id'].isin(priv_collection_ids)]

# Dataset hits: '/e/<dataset_id>.<ext>...', mapped to their Collection via ds_coll
ds_df = (
    df[df['url'].str.startswith('/e/')]
    .assign(dataset_id=lambda d: d['url'].str.split('/').str[2].str.split('.').str[0])
)
ds_df = ds_df[ds_df['dataset_id'].isin(priv_dataset_ids.keys())]
ds_df = ds_df.assign(collection_id=ds_df['dataset_id'].map(ds_coll))

# Collapse to one row per Collection. Values are de-duplicated and sorted so the
# report is reproducible, and the '' placeholder used for rows without a
# dataset_id is dropped before joining so it cannot leave a stray ',' mid-string.
act = (
    pd.concat([act, ds_df, pd.DataFrame(pubtator_res)])
    .fillna({'visitors':0, 'dataset_id': ''})
    .groupby('collection_id')
    .agg({
        'source': lambda x: ','.join(sorted(set(x))),
        'dataset_id': lambda x: ','.join(sorted(v for v in set(x) if v)),
        'visitors': 'sum'
    })
    .sort_values('visitors', ascending=False)
    .reset_index()
)
act['collection_url'] = 'https://cellxgene.cziscience.com/collections/' + act['collection_id'].astype(str)
act = act[['collection_url','source','visitors','dataset_id']]

act.to_csv(f'{outpre}_private_URL.csv', index=False)

# is_primary_data evaluation
**read in the Sheets**

In [None]:
sheet = '1ax9b5sxmxSJgrjncXG5WGilgIGKm2EEWmILpoN6pLzY'


def _read_sheet_tab(sheet_id, tab):
    """Load one tab of a Google Sheet as a DataFrame via the gviz CSV export.

    Columns whose header contains 'Unnamed' are dropped.
    """
    url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={quote(tab)}'
    tab_df = pd.read_csv(url)
    return tab_df[[c for c in tab_df.columns if 'Unnamed' not in c]]


pri_df = _read_sheet_tab(sheet, 'published Collection DOIs')
reuse_df = _read_sheet_tab(sheet, 'reused data')

#probably need to isolate the published reused (away from the in progress jira tickets)
# Rows whose 'Dataset' starts with a ticket prefix are excluded; the `== False`
# comparison also excludes rows where 'Dataset' is missing.
pub_reuse_df = reuse_df[
    (reuse_df['Dataset'].str.startswith('CXG-') == False) &
    (reuse_df['Dataset'].str.startswith('WRN-') == False)
]

# Membership look-ups used by the checks below, built once instead of being
# re-derived from the DataFrames on every loop iteration. Missing values are
# dropped so a NaN can never match, as with the array-based checks they replace.
pub_collection_id_set = set(pub_collection_ids)
pri_collection_set = set(pri_df['Collection'].dropna())
reuse_collection_set = set(pub_reuse_df['Collection'].dropna())
all_reused_doi_set = set(reuse_df['DOI of reused data'].dropna())
true_reused_doi_set = set(
    pub_reuse_df[pub_reuse_df['is_primary_data'] == True]['DOI of reused data'].dropna()
)
no_original_df = pri_df[pri_df['original data is_primary_data'].isna()]

#Confirm that all Collection IDs are valid
invalid_coll_id = [
    c for c in set(pri_df['Collection'].tolist() + pub_reuse_df['Collection'].tolist())
    if c not in pub_collection_id_set
]
if invalid_coll_id:
    print('Invalid Collection IDs',invalid_coll_id)

#Confirm that all Collection IDs have the appropriate DOI documented in the table
for _, row in pri_df[['Collection','DOI of Collection']].iterrows():
    if row['Collection'] in invalid_coll_id:
        continue
    current_doi = pub_collection_dois[row['Collection']]
    if current_doi and row['DOI of Collection'] != current_doi:
        print('DOI not up-to-date',row['Collection'])

#Confirm that Dataset IDs are found in the corresponding Collection (reused tab)
for _, row in pub_reuse_df[['Dataset','Collection']].drop_duplicates().iterrows():
    if row['Dataset'] not in pub_dataset_ids:
        print('Invalid Dataset ID', row['Dataset'])
    elif row['Collection'] != pub_dataset_ids[row['Dataset']]:
        print('Invalid Dataset/Collection pairing',row['Dataset'],row['Collection'])

#Confirm that all Published Collections are represented in the primary tab
missing = [c for c in pub_collection_ids if c not in pri_collection_set]
if missing:
    print('Add to primary tab', missing)

#Any primary tab w/ "NA" should have all cells accounted for in reused tab
for collection_id in no_original_df['Collection'].unique():
    if collection_id not in reuse_collection_set:
        print('Collection annotated as no original data generated, but no reuse noted',collection_id)

#check that any n/a do not have DOIs in the reused tab (no original data to reuse)
for doi in no_original_df['DOI of Collection'].unique():
    if doi in all_reused_doi_set:
        print('DOI annotated as no original data generated, but marked as reused',doi)

#Any primary tab w/ "FALSE" should have some accounted for in reused tab as TRUE
for doi in pri_df[pri_df['original data is_primary_data'] == False]['DOI of Collection'].unique():
    if doi not in true_reused_doi_set:
        print('DOI is FALSE in primary Collection, but no TRUE reuse accounted for',doi)

#Fill in empty feature_count
# Look up and print the feature count for each reused Dataset row that lacks one
for _, row in pub_reuse_df[pub_reuse_df['Feature count'].isna()][['Collection','Dataset']].drop_duplicates().iterrows():
    ds = CxG_API.get_dataset(row['Collection'],row['Dataset'])
    print('Feature counts:',row['Dataset'],ds['feature_count'])

**Check that reused obs are accurately annotated**

In [None]:
def _flag(row, message):
    """Return a copy of a sheet row with `message` stored under 'error'."""
    flagged = row.copy()
    flagged['error'] = message
    return flagged


# For every Dataset in the reused tab, read its obs table from S3 and verify
# each sheet row against it. Failing rows are collected in `errors`.
errors = []
for dataset_id in pub_reuse_df['Dataset'].unique():
    ds_df = pub_reuse_df[pub_reuse_df['Dataset'] == dataset_id]
    reused_obs_indices = []

    dataset_version_id = pub_dataset_ver_ids[dataset_id]
    # Both the S3 handle and the HDF5 file are context-managed so neither leaks
    with fs.open(f'corpora-data-prod/{dataset_version_id}/local.h5ad') as remote, h5py.File(remote) as f:
        obs = read_elem(f['obs'])

    for _, row in ds_df.iterrows():
        prop = row['obs field']
        if prop != 'all' and prop not in obs.columns:
            errors.append(_flag(row, 'obs field not in obs'))
            continue

        raw_values = row['obs field values']
        values = None if pd.isna(raw_values) else raw_values.split(',')
        obs_count = row['observation count']

        if prop == 'all':
            # 'all' means the whole obs table, so no values may be listed
            if values:
                errors.append(_flag(row, 'obs field:"all" should not have values annotated'))
                continue
            obs_by_this_row = obs
        else:
            # A named obs field needs values to select on; report the row
            # instead of failing on the missing cell
            if values is None:
                errors.append(_flag(row, 'obs field values missing'))
                continue
            # Computed once per row rather than once per value
            observed = obs[prop].unique()
            not_in_obs = [v for v in values if v not in observed]
            if not_in_obs:
                errors.append(_flag(row, 'value not in obs column'))
                continue
            obs_by_this_row = obs[obs[prop].isin(values)]

        if obs_by_this_row.shape[0] != obs_count:
            errors.append(_flag(row, f'inconsistent cell count {obs_count} vs {obs_by_this_row.shape[0]}'))
            continue

        # Every selected obs must carry exactly the annotated is_primary_data value
        if list(obs_by_this_row['is_primary_data'].unique()) != [row['is_primary_data']]:
            errors.append(_flag(row, 'inconsistent is_primary_data'))
            continue

        reused_obs_indices.extend(obs_by_this_row.index)

    # The same obs index selected by more than one row means the rows overlap
    if len(reused_obs_indices) != len(set(reused_obs_indices)):
        print('Overlapping reused data',dataset_id)
pd.DataFrame(errors)

**Check all DOIs in the sheet to confirm they are valid & up-to-date**

In [None]:
# Every distinct DOI mentioned in either tab of the sheet
sheet_dois = set(
    list(pri_df['DOI of Collection'].unique())
    + list(reuse_df['DOI of reused data'].unique())
)
pub_info = [doi_checker(sheet_doi) for sheet_doi in sheet_dois]
pub_df = pd.DataFrame(pub_info)

# Only show rows where the checker filled in the 'invalid DOI' column
if 'invalid DOI' in pub_df.columns:
    display(pub_df[pub_df['invalid DOI'].notna()].sort_values('DOI'))

**Check for DOIs that are marked `True` in multiple places**

In [None]:
# Allow-list of DOIs that may be marked True in more than one Collection.
# Maps each DOI to the Collection IDs it is expected to appear in; the check
# below skips a DOI only when the Collections found for it match this list
# exactly. The trailing comment on each entry records why it is acceptable.
ok = {
    '10.1002/hep4.1854': [ #1 Visium slide in a 2ry Collection
        '0c8a364b-97b5-4cc8-a593-23c38c6f0ac5','44531dd9-1388-4416-a117-af0a99de2294'
    ],
    '10.1038/s41467-018-06318-7': [ #4 donors in a 2ry Collection, 1 donor in another 2ry Collection
        '0c8a364b-97b5-4cc8-a593-23c38c6f0ac5','44531dd9-1388-4416-a117-af0a99de2294'
    ],
    '10.1038/s41467-024-49037-y': [ #mouse & human in separate Collections
        '67ba665e-0611-4b53-a522-40c2e0dc6df7','71f4bccf-53d4-4c12-9e80-e73bfb89e398'
    ],
    '10.1101/2020.11.20.20227355': [ #2 studies from 1 preprint
        '0434a9d4-85fd-4554-b8e3-cf6c582bb2fa','eb735cc9-d0a7-48fa-b255-db726bf365af'
    ],
    '10.1016/j.devcel.2020.11.010': [ #organoid data in a 2ry Collection
        '6282a908-f162-44a2-99a3-8a942e4271b2','17481d16-ee44-49e5-bcf0-28c0780d8c4a'
    ],
    '10.1016/j.cell.2021.04.028': [ #organoid data in a 2ry Collection
        '6282a908-f162-44a2-99a3-8a942e4271b2','dfc09a93-bce0-4c77-893d-e153d1b7f9fa'
    ],
    '10.1016/j.devcel.2020.01.033': [ #organoid data & tissue data in separate 2ry Collections
        '6282a908-f162-44a2-99a3-8a942e4271b2','dfc09a93-bce0-4c77-893d-e153d1b7f9fa'
    ],
    '10.1016/j.devcel.2020.07.023': [ #organoid data & tissue data in separate 2ry Collections
        '6282a908-f162-44a2-99a3-8a942e4271b2','dfc09a93-bce0-4c77-893d-e153d1b7f9fa'
    ],
    '10.1016/j.stem.2020.11.008': [ #organoid data & tissue data in separate 2ry Collections
        '6282a908-f162-44a2-99a3-8a942e4271b2','dfc09a93-bce0-4c77-893d-e153d1b7f9fa'
    ],
    '10.1038/s41586-019-1373-2': [ #organoid data & tissue data in separate 2ry Collections
        '6282a908-f162-44a2-99a3-8a942e4271b2','854c0855-23ad-4362-8b77-6b1639e7a9fc'
    ],
    '10.1038/s41467-021-25968-8': [ #different tissues in separate 2ry Collections
        '0f7d022a-46c7-4e64-be4c-e34adbb78089','48c15b0c-6039-4e0f-9668-b6b3c0b830ad'
    ],
    '10.1016/j.molmet.2022.101595': [ #mouse data is False in Collection with DOI, more mouse data from that study is in 2ry Collection
        '0a77d4c0-d5d0-40f0-aa1a-5e1429bcbd7e','296237e2-393d-4e31-b590-b03f74ac5070'
    ]
}

# Every (DOI, Collection) annotated as is_primary_data == True, drawn from both
# the reused tab and the primary tab of the sheet.
comb_pri_df = pd.concat([
    pub_reuse_df[pub_reuse_df['is_primary_data'] == True].rename(columns={'DOI of reused data':'DOI'})[['DOI','is_primary_data','Collection']],
    pri_df[pri_df['original data is_primary_data'] == True].rename(columns={'DOI of Collection':'DOI'})[['DOI','original data is_primary_data','Collection']]
])

# drop_duplicates() returns a new frame, so comb_pri_df itself is left intact.
# It already compares the DOI and Collection columns, so no combined helper
# column is needed.
poss_dup_true = comb_pri_df.drop_duplicates()

# Keep only DOIs that are present and occur on more than one remaining row
poss_dup_true = poss_dup_true[
    poss_dup_true.duplicated(subset='DOI', keep=False) &
    poss_dup_true['DOI'].notna()
]

# A duplicated DOI is acceptable only when the Collections it appears in match
# its entry in the `ok` allow-list exactly
ok_list = [
    doi for doi in poss_dup_true['DOI'].unique()
    if doi in ok
    and set(poss_dup_true.loc[poss_dup_true['DOI'] == doi, 'Collection'].unique()) == set(ok[doi])
]

poss_dup_true[~poss_dup_true['DOI'].isin(ok_list)].sort_values('DOI')

**Check for DOIs that are not marked `True` anywhere**

In [None]:
# Known exception: 10.1186/s13059-020-02210-0 - coming in megagut submission

# Rows annotated is_primary_data == False in the reused tab ...
reuse_false = (
    pub_reuse_df.loc[pub_reuse_df['is_primary_data'] == False, ['DOI of reused data','is_primary_data']]
    .rename(columns={'DOI of reused data':'DOI'})
)
# ... and in the primary tab
pri_false = (
    pri_df.loc[pri_df['original data is_primary_data'] == False, ['DOI of Collection','original data is_primary_data']]
    .rename(columns={'DOI of Collection':'DOI'})
)
comb_nonpri_df = pd.concat([reuse_false, pri_false])

# Show the False rows whose DOI is not annotated True anywhere
true_dois = comb_pri_df['DOI'].unique()
comb_nonpri_df[~comb_nonpri_df['DOI'].isin(true_dois)]

**IN PROGRESS - Compare instances of each DOI that is present multiple places**
- obs count
- feature count
- feature reference

**Determine which should be `True`**

# in progress DOIs
**generate a report of DOIs currently in private Collections**

In [None]:
# Write the distinct DOIs of private, non-revision Collections to a dated text
# file, one DOI per line.
date = today.strftime('%Y_%m_%d')
priv_dois = {c['doi'] for c in nonrev_priv_collections if c.get('doi')}
filename = f'{outpre}_cxg_private_dois_{date}.txt'
# Encoding is explicit, and the DOIs are sorted because set iteration order is
# arbitrary; sorting keeps the report identical across runs.
with open(filename, 'w', encoding='UTF8') as file:
    for doi in sorted(priv_dois):
        file.write(f'{doi}\n')