In [None]:
import csv
import pandas as pd
import requests
import time
from cellxgene_mods import CxG_API
from datetime import datetime
from pub_check import *


# Configure the CxG_API helper (defined in cellxgene_mods) before any calls.
CxG_API.config()


def _dataset_to_collection(collections):
    """Return {dataset_id: collection_id} for every dataset listed in `collections`.

    When a dataset_id occurs more than once, the last Collection seen wins.
    """
    lookup = {}
    for coll in collections:
        for dataset in coll['datasets']:
            lookup[dataset['dataset_id']] = coll['collection_id']
    return lookup


# Collections returned by the default get_collections() call.
pub_collections = CxG_API.get_collections()
pub_dataset_ids = _dataset_to_collection(pub_collections)
pub_collection_ids = [coll['collection_id'] for coll in pub_collections]

# Collections returned when PRIVATE visibility is requested.
priv_collections = CxG_API.get_collections(visibility='PRIVATE')
priv_dataset_ids = _dataset_to_collection(priv_collections)
priv_collection_ids = [coll['collection_id'] for coll in priv_collections]

# Private Collections with no truthy 'revision_of' value.
nonrev_priv_collections = list(
    filter(lambda coll: not coll.get('revision_of'), priv_collections)
)

# Combined lookup across both lists; private entries are processed last.
ds_coll = _dataset_to_collection(pub_collections + priv_collections)

# Directory prefix for files read/written below (trailing slash required,
# since paths are built by plain string concatenation).
wrk_dir = '/Users/jason/Downloads/'
uuid_pattern = '[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}'

# DOI review
**check for preprints that have been published**

In [None]:
# Run doi_checker on every Collection (public or private) whose recorded
# journal is one of the preprint servers below.
preprint_servers = ('bioRxiv', 'medRxiv')
pub_info = [
    doi_checker(coll['doi'])
    for coll in pub_collections + priv_collections
    if coll.get('publisher_metadata')
    and coll['publisher_metadata']['journal'] in preprint_servers
]
pub_df = pd.DataFrame(pub_info)

# Only show rows where doi_checker populated the 'invalid DOI' column.
if 'invalid DOI' in pub_df.columns:
    display(pub_df[pub_df['invalid DOI'].notna()])

**check for Collections that might now have DOIs**

In [None]:
#2f75d249-1bec-459b-bf2b-b86221097ced is found in 10.24272/j.issn.2095-8137.2022.531 but this is reuse, not the data generation
# Search for each public Collection that has no DOI recorded and print any DOIs returned.
collections_without_doi = (coll for coll in pub_collections if not coll.get('doi'))
for coll in collections_without_doi:
    found_dois = pubtator_search(coll['collection_id'])
    if found_dois:
        print(coll['collection_id'], ','.join(found_dois))
    # Pause between consecutive searches.
    time.sleep(0.5)

# long-term private Collections
**pull private Collections older than a specified cut-off**

In [None]:
# Age threshold (in years) beyond which a private Collection is reported.
year_cutoff = 1.5

day_cutoff = 365.25 * year_cutoff
today = datetime.today()

# Oldest first; 'created_at' is compared as a string.
sorted_collections = sorted(nonrev_priv_collections, key=lambda coll: coll['created_at'])

# `with` guarantees the file is closed even if a row raises part-way through;
# newline='' is what the csv module requires of file objects it writes to.
with open(f'{wrk_dir}long_private_collections.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow([
        'collection',
        'name',
        'doi',
        'contact name',
        'contact email',
        'created at',
        'number of datasets'
    ])

    for collection in sorted_collections:
        # Only the date part (before the 'T') is used to compute the age.
        date1 = datetime.strptime(collection['created_at'].split('T')[0], '%Y-%m-%d')
        gap = (today - date1).days

        if gap > day_cutoff:
            writer.writerow([
                collection['collection_url'],
                collection['name'],
                # Other cells treat 'doi' as optional (c.get('doi')); a missing
                # DOI is written as an empty cell instead of raising KeyError.
                collection.get('doi'),
                collection['contact_name'],
                collection['contact_email'],
                collection['created_at'],
                len(collection['datasets'])
            ])

# private URLs made public
**look for private Collection URLs in PubMed**

In [None]:
# One record per private Collection ID for which pubtator_search returns anything.
pubtator_res = []
for collection_id in priv_collection_ids:
    hits = pubtator_search(collection_id)
    if hits:
        pubtator_res.append({'collection_id': collection_id, 'source': ','.join(hits)})
    # Pause between consecutive searches.
    time.sleep(0.5)

**filter AirTable report for possible private URL sharing**

In [None]:
at_export = 'Sources of private collections_datasets-Grid view.csv'
df = pd.read_csv(f'{wrk_dir}{at_export}')

# The report period is taken from the first row of the export.
month = df.loc[0]['time period']
ignore_sources = ['lattice.atlassian.net','lattice-data.org','Direct / None']

# Keep rows for this period, from a non-ignored source, whose url contains a UUID.
# na=False makes a missing url count as "no match" rather than putting NaN in the mask.
df = df[
    (df['time period'] == month) &
    (~df['source'].isin(ignore_sources)) &
    (df['url'].str.contains(uuid_pattern, na=False))
].drop(columns=['dataset name','time period'])

# Collection page hits: /collections/<collection_id>...
# .copy() so the new column is added to an independent frame, not a slice of df.
act = df[df['url'].str.startswith('/collections/')].copy()
act['collection_id'] = act['url'].apply(lambda x: x.split('/')[2])
act = act[act['collection_id'].isin(priv_collection_ids)]

# Dataset explorer hits: /e/<dataset_id>.<ext>
ds_df = df[df['url'].str.startswith('/e/')].copy()
ds_df['dataset_id'] = ds_df['url'].apply(lambda x: x.split('/')[2].split('.')[0])
ds_df = ds_df[ds_df['dataset_id'].isin(priv_dataset_ids.keys())].copy()
ds_df['collection_id'] = ds_df['dataset_id'].map(ds_coll)

# Merge the URL hits with the PubTator hits and roll up to one row per Collection.
act = pd.concat([act, ds_df, pd.DataFrame(pubtator_res)]).fillna({'visitors':0, 'dataset_id': ''})
act = act.groupby('collection_id').agg({
    # sorted() gives a stable order between runs; missing sources are skipped
    # so str.join never receives a NaN.
    'source': lambda x: ','.join(sorted(set(x.dropna()))),
    # Drop the '' placeholder explicitly: stripping commas from the joined
    # string only removes it at the ends, leaving e.g. 'a,,b'.
    'dataset_id': lambda x: ','.join(sorted(v for v in set(x) if v)),
    'visitors': 'sum'
}).sort_values('visitors', ascending=False)
act = act.reset_index()
act['collection_url'] = 'https://cellxgene.cziscience.com/collections/' + act['collection_id'].astype(str)
act = act[['collection_url','source','visitors','dataset_id']]

act.to_csv(f'{wrk_dir}private_URL_{month}.csv', index=False)

# is_primary_data evaluation
**read in the Sheets**

In [None]:
sheet = '1ax9b5sxmxSJgrjncXG5WGilgIGKm2EEWmILpoN6pLzY'

# Download each tab as CSV and discard columns whose header contains 'Unnamed'.
sheet_frames = {}
for tab in ('primary', 'reused'):
    url = f'https://docs.google.com/spreadsheets/d/{sheet}/gviz/tq?tqx=out:csv&sheet={tab}'
    raw_tab = pd.read_csv(url)
    named_columns = [col for col in raw_tab.columns if 'Unnamed' not in col]
    sheet_frames[tab] = raw_tab[named_columns]

pri_df = sheet_frames['primary']
reuse_df = sheet_frames['reused']

#probably need to isolate the published reused (away from the in progress jira tickets)
# Rows whose 'dataset' starts with a ticket prefix (CXG- / WRN-) are excluded.
reuse_dataset = reuse_df['dataset']
not_cxg_ticket = reuse_dataset.str.startswith('CXG-') == False
not_wrn_ticket = reuse_dataset.str.startswith('WRN-') == False
pub_reuse_df = reuse_df[not_cxg_ticket & not_wrn_ticket]

#Confirm that all Collection IDs are valid
# Build the lookup set once rather than scanning the list for every ID.
known_pub_collections = set(pub_collection_ids)
sheet_collections = set(pri_df['collection'].tolist() + pub_reuse_df['collection'].tolist())
invalid_coll_id = [c for c in sheet_collections if c not in known_pub_collections]
if invalid_coll_id:
    print('Invalid Collection IDs',invalid_coll_id)

#Confirm that Dataset IDs are found in the corresponding Collection (reused tab)
for _, row in pub_reuse_df[['dataset','collection']].drop_duplicates().iterrows():
    if row['dataset'] not in pub_dataset_ids:
        print('Invalid Dataset ID', row['dataset'])
    elif row['collection'] != pub_dataset_ids[row['dataset']]:
        # The row only carries 'dataset' and 'collection'; the former
        # 'dataset_id'/'collection_id' lookups raised KeyError here.
        print('Invalid Dataset/Collection pairing',row['dataset'],row['collection'])

#Confirm that all Published Collections are represented in the primary tab
primary_collections = set(pri_df['collection'].tolist())
missing = [c for c in pub_collection_ids if c not in primary_collections]
if missing:
    print('Add to primary tab', missing)

# Rows of the primary tab whose 'original data is_primary_data' cell is empty.
no_original_df = pri_df[pri_df['original data is_primary_data'].isna()]

#Any primary tab w/ "NA" should have all cells accounted for in reused tab
reused_collections = pub_reuse_df['collection'].unique()
for collection_id in no_original_df['collection'].unique():
    if collection_id not in reused_collections:
        print('Collection annotated as no original data generated, but no reuse noted',collection_id)

#check that any n/a do not have DOIs in the reused tab (no original data to reuse)
all_reused_dois = reuse_df['DOI'].unique()
for doi in no_original_df['DOI'].unique():
    if doi in all_reused_dois:
        print('DOI annotated as no original data generated, but marked as reused',doi)

#Any primary tab w/ "FALSE" should have some accounted for in reused tab as TRUE
true_reuse_dois = pub_reuse_df[pub_reuse_df['is_primary_data'] == True]['DOI'].unique()
for doi in pri_df[pri_df['original data is_primary_data'] == False]['DOI'].unique():
    if doi not in true_reuse_dois:
        print('DOI is FALSE in primary Collection, but no TRUE reuse accounted for',doi)

#ATTN - Fill in empty feature_count? Validate feature_count?

**Check that reused obs are accurately annotated**

In [None]:
errors = []


def _record_error(row, message):
    """Append a copy of `row` tagged with `message` to `errors`.

    A copy is stored so that a row failing several checks yields one entry
    per failure, instead of several references to one Series that all show
    only the last message written.
    """
    flagged = row.copy()
    flagged['error'] = message
    errors.append(flagged)


for dataset_id in pub_reuse_df['dataset'].unique():
    ds_df = pub_reuse_df[pub_reuse_df['dataset'] == dataset_id]
    reused_obs_indices = []
    #ATTN - obs is this thing
    # NOTE(review): `obs` is not defined anywhere in this notebook; it
    # presumably needs to be loaded per dataset_id before this loop body
    # can run - TODO confirm and add the loading step.
    for _, row in ds_df.iterrows():
        prop = row['obs field']
        if prop not in obs.columns and prop != 'all':
            _record_error(row, 'obs field not in obs')
            continue

        values = row['obs field values'].split(',') if not pd.isna(row['obs field values']) else None
        obs_count = row['observation count']
        if prop == 'all':
            obs_by_this_row = obs
            if values:
                # Previously written to a stray 'errors' key, unlike every other check.
                _record_error(row, 'obs field:"all" should not have values annotated')
        elif values is None:
            # Without values there is nothing to select on; iterating None
            # below would raise TypeError.
            _record_error(row, 'obs field values missing')
            continue
        else:
            known_values = set(obs[prop].unique())
            not_in_obs = [v for v in values if v not in known_values]
            if not_in_obs:
                _record_error(row, 'value not in obs column')
            obs_by_this_row = obs[obs[prop].isin(values)]

        if obs_by_this_row.shape[0] != obs_count:
            _record_error(row, f"inconsistent cell count {obs_count} vs {obs_by_this_row.shape[0]}")

        # Compare list to list: the selected obs must carry exactly the one
        # annotated value. Comparing the list to the bare scalar was always
        # unequal, so every row was flagged.
        if list(obs_by_this_row['is_primary_data'].unique()) != [row['is_primary_data']]:
            _record_error(row, 'inconsistent is_primary_data')

        reused_obs_indices.extend(obs_by_this_row.index)
    if len(reused_obs_indices) != len(set(reused_obs_indices)):
        #dup_indices = [i for i in reused_obs_indices if reused_obs_indices.count(i) > 1]
        print('Overlapping reused data',dataset_id)
pd.DataFrame(errors)

**Check all DOIs in the sheet to confirm they are valid & up-to-date**

In [None]:
# Every distinct DOI value appearing in either tab of the sheet.
sheet_dois = set(list(pri_df['DOI'].unique()) + list(reuse_df['DOI'].unique()))
pub_info = [doi_checker(sheet_doi) for sheet_doi in sheet_dois]
pub_df = pd.DataFrame(pub_info)

# Only show rows where doi_checker populated the 'invalid DOI' column.
if 'invalid DOI' in pub_df.columns:
    flagged_dois = pub_df[pub_df['invalid DOI'].notna()]
    display(flagged_dois.sort_values('DOI'))

**Check for DOIs that are marked `True` in multiple places**

In [None]:
# Rows marked True in the reused tab, then rows marked True in the primary tab.
reuse_true = pub_reuse_df.loc[
    pub_reuse_df['is_primary_data'] == True, ['DOI','is_primary_data']
]
primary_true = pri_df.loc[
    pri_df['original data is_primary_data'] == True, ['DOI','original data is_primary_data']
]
comb_pri_df = pd.concat([reuse_true, primary_true])
#ATTN - THINK THIS WILL FLAG DOI REUSED & PRIMARY IN MULTIPLE DATASETS WITHIN 1 COLLECTION
# Show every occurrence of a non-missing DOI that appears more than once.
doi_repeated = comb_pri_df.duplicated(subset='DOI', keep=False)
doi_present = comb_pri_df['DOI'].notna()
comb_pri_df[doi_repeated & doi_present].sort_values('DOI')

**Check for DOIs that are not marked `True` anywhere**

In [None]:
#get all DOIs that are not is_primary_data TRUE anywhere
# Rows marked False in the reused tab, then rows marked False in the primary tab.
reuse_false = pub_reuse_df.loc[
    pub_reuse_df['is_primary_data'] == False, ['DOI','is_primary_data']
]
primary_false = pri_df.loc[
    pri_df['original data is_primary_data'] == False, ['DOI','original data is_primary_data']
]
comb_nonpri_df = pd.concat([reuse_false, primary_false])

# Keep only the False rows whose DOI never shows up among the True rows.
dois_marked_true = comb_pri_df['DOI'].unique()
comb_nonpri_df[~comb_nonpri_df['DOI'].isin(dois_marked_true)]

In [None]:
#COMPARE ALL INSTANCES OF DOIs PRESENT MULTIPLE TIMES
#OBS COUNT
#FEATURE COUNT
#FEATURE REFERENCE

# in progress DOIs
**generate a report of DOIs currently in private Collections**

In [None]:
# `today` is set in the long-term private Collections cell.
date = today.strftime('%Y_%m_%d')

# Distinct DOIs across the private Collections that are not revisions.
priv_dois = {coll['doi'] for coll in nonrev_priv_collections if coll.get('doi')}

# One DOI per line.
filename = f'{wrk_dir}cxg_private_dois_{date}.txt'
with open(filename, 'w') as file:
    file.writelines(f'{priv_doi}\n' for priv_doi in priv_dois)