In [None]:
import json
import os
import pandas as pd
from urllib.parse import quote
import requests
import sys

In [None]:
# One-off build of the per-collection, per-donor mapping
# {collection: {donor_id: new dev_stage}} from our curated spreadsheet

donor_updates = {}

sheet_id = '1bELrjC18WH7wVyxlfKPvWjvUKKqy7y4iFav9ddNooAg'
tab_name = 'migrate only'
url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={quote(tab_name)}'
donor_meta = pd.read_csv(url)
for row_idx, record in donor_meta[['collection','donor_id','new dev_stage']].iterrows():
    collection_key, donor, new_stage = record['collection'], record['donor_id'], record['new dev_stage']
    # create the per-collection dict on first sight, then record this donor's new stage
    donor_updates.setdefault(collection_key, {})[donor] = new_stage

#with open('/Users/jason/GitClones/CZI/single-cell-curation/cellxgene_schema_cli/cellxgene_schema/donor_updates.json', 'w', encoding='utf-8') as f:
#    json.dump(donor_updates, f, ensure_ascii=False, indent=4)

In [None]:
# Parent directory of the local single-cell-curation checkout; used below to extend sys.path
scc_repo_loc = os.path.expanduser('~/GitClones/CZI/')
# Path of the file handed to set_api_access_config() for Curation API access
api_key_file_path = os.path.expanduser('~/Documents/keys/cxg-api-key.txt')

In [None]:
# Make the curation API helpers (the `src.*` package imported below) importable from the local checkout.
# os.path.join keeps the path correct whether or not scc_repo_loc ends with a separator, and the
# membership check stops re-runs of this cell from stacking duplicate entries on sys.path.
curation_api_dir = os.path.abspath(
    os.path.join(scc_repo_loc, 'single-cell-curation/notebooks/curation_api/python/')
)
if curation_api_dir not in sys.path:
    sys.path.append(curation_api_dir)


from src.utils.config import set_api_access_config
from src.collection import get_collections, get_collection
from src.dataset import get_dataset,get_datasets

In [None]:
set_api_access_config(api_key_file_path)

# Aggregate, per collection_id, every donor_id and every development_stage term
# ({ontology_term_id: label}) found across the datasets returned by get_datasets().
collections = {}
datasets = get_datasets()
for d in datasets:
    # Start each collection with its OWN list/dict. Storing d['donor_id'] directly would
    # alias the dataset's list, so the later extend() would silently modify `datasets`.
    summary = collections.setdefault(d['collection_id'], {'donor_id': [], 'development_stage': {}})
    summary['donor_id'].extend(d['donor_id'])
    summary['development_stage'].update(
        {t['ontology_term_id']: t['label'] for t in d['development_stage']}
    )
print(f'{len(collections)} Public Collections')

# name -> collection_id lookup for the collections returned by the default get_collections() call
pub_cs = get_collections()
public_names = {c['name']:c['collection_id'] for c in pub_cs}

# name -> full collection record for private collections, excluding revisions of existing collections
priv_cs = get_collections(visibility='PRIVATE')
priv_collections = {c['name']:c for c in priv_cs if not c.get('revision_of')}
print(f'{len(priv_collections)} Private Collections')

In [None]:
# Build the list of currently valid (non-obsolete) development stage term IDs:
# 'unknown' plus every obo_id OLS returns for the HsapDv and MmusDv ontologies.
current_terms = ['unknown']

for ont in ['hsapdv','mmusdv']:
    url = f'https://www.ebi.ac.uk/ols4/api/ontologies/{ont}/terms?obsoletes=false&size=500'
    # Walk every result page: a single request only returns the first `size` terms, and a
    # truncated list would make valid terms look deprecated in the checks that follow.
    page_url = url
    while page_url:
        response = requests.get(page_url, timeout=60)
        # fail loudly on an HTTP error instead of hitting a confusing KeyError below
        response.raise_for_status()
        r = response.json()
        terms = [t['obo_id'] for t in r['_embedded']['terms']]
        current_terms.extend(terms)
        # the link to the following page is absent on the last page
        page_url = r.get('_links', {}).get('next', {}).get('href')
len(current_terms)

In [None]:
# looks like json files still exist in our repo so commenting out dir change for now
# mig_loc = scc_repo_loc + 'single-cell-curation/cellxgene_schema_cli/cellxgene_schema/'
# Load both migration maps from the working directory. Context managers close the file handles
# promptly, and the explicit encoding avoids depending on the platform's default locale.
with open('automigrate_terms.json', encoding='utf-8') as f:
    automigrate_terms = json.load(f)
with open('donor_updates.json', encoding='utf-8') as f:
    donor_updates = json.load(f)

In [None]:
# Flag any old (to-be-migrated) term from automigrate_terms.json that is still a current term,
# i.e. one that is not actually deprecated
[old_term for old_term in automigrate_terms if old_term in current_terms]

In [None]:
# Flag any replacement term from automigrate_terms.json that is missing from the new version
list(filter(lambda new_term: new_term not in current_terms, automigrate_terms.values()))

In [None]:
# Flag any replacement term from donor_updates.json that is missing from the new version
[
    new_stage
    for per_donor in donor_updates.values()
    for new_stage in per_donor.values()
    if new_stage not in current_terms
]

In [None]:
# Confirm every key from donor_updates.json is either a public Collection ID or a private
# Collection name, and that every donor listed under it exists in that Collection
for coll_key, donor_map in donor_updates.items():
    if coll_key in collections:
        # public Collection referenced by ID: donors were already gathered
        known_donors = collections[coll_key]['donor_id']
    elif coll_key in priv_collections:
        # private Collection referenced by name: fetch each dataset to gather its donors
        priv_id = priv_collections[coll_key]['collection_id']
        known_donors = []
        for ds in priv_collections[coll_key]['datasets']:
            ds_record = get_dataset(priv_id, ds['dataset_id'])
            known_donors.extend(ds_record['donor_id'])
    elif coll_key in public_names:
        # a public Collection was referenced by name instead of by ID
        print(f'ERROR: update to {public_names[coll_key]} for {coll_key}')
        continue
    else:
        print(f'ERROR: {coll_key} not a public collection_id or private name')
        continue

    missing = [donor for donor in donor_map if donor not in known_donors]
    if missing:
        print(f'ERROR: {",".join(missing)} not in {coll_key}')

In [None]:
# Gather the IDs of private collections that are not revisions, then pull each one's JSON
# metadata into a master list of dicts.
# get_datasets() has no param for private collections, so get_collection() is called once
# per private collection id.
priv_cs_list = []
for priv in priv_cs:
    if not priv.get('revision_of'):
        priv_cs_list.append(priv['collection_id'])
private_datasets = list(map(get_collection, priv_cs_list))

In [None]:
# Aggregate, per private collection_id, every donor_id and every development_stage term
# ({ontology_term_id: label}) across that collection's successfully processed datasets.
private_collections = {}
for c in private_datasets:
    c_id = c['collection_id']
    datasets = c['datasets']
    for d in datasets:
        # skip datasets that have not met validation
        if d['processing_status'] != 'SUCCESS':
            print(f"Current processing error with dataset {d['dataset_version_id']} in collection {c_id}")
            continue
        dev_stages = {t['ontology_term_id']:t['label'] for t in d['development_stage']}
        # Start each collection with its OWN list. Storing d['donor_id'] directly would alias
        # the dataset's list inside private_datasets, so extend() would modify the fetched
        # records and every re-run of this cell would append the same donors again.
        summary = private_collections.setdefault(c_id, {'donor_id': [], 'development_stage': {}})
        summary['donor_id'].extend(d['donor_id'])
        summary['development_stage'].update(dev_stages)

In [None]:
# function to browse all private or public Collections and see what deprecated terms are not covered in this migration
def uncovered_terms(collection_dict, collection_names=None):
    '''
    Print every Collection that still has deprecated development_stage terms
    which are handled by neither automigrate_terms nor donor_updates.

    collection_dict format:
    {collection_id:
        {
            donor_id: list[donors],
            development_stage: dict{dev_term_id: term}
        }
    }

    collection_names: optional {collection_id: collection name}. donor_updates
    is keyed by Collection ID for public Collections but by Collection *name*
    for private ones, so a Collection counts as covered when either its ID or
    its name is a key of donor_updates. When omitted, the mapping is derived
    from the notebook-level priv_collections ({name: collection}) if defined.

    Reads the notebook-level current_terms, automigrate_terms and donor_updates.
    Returns None; the findings are printed.
    '''
    if collection_names is None:
        known_private = globals().get('priv_collections', {})
        collection_names = {meta['collection_id']: name for name, meta in known_private.items()}

    for k,v in collection_dict.items():
        dev_stages = v['development_stage']
        # UBERON-prefixed IDs are left out of the deprecation check
        deprecated = [t for t in dev_stages if t not in current_terms and not t.startswith('UBERON:')]
        not_migrated = [t for t in deprecated if t not in automigrate_terms]
        # checking the ID alone would never match a private Collection listed by name
        covered = k in donor_updates or collection_names.get(k) in donor_updates
        if not_migrated and not covered:
            print(k)
            for t in not_migrated:
                print('--',t,':',dev_stages[t])
            print('')

In [None]:
# Print the public collections (the `collections` summary built above) that still have
# deprecated terms not covered in this migration
uncovered_terms(collections)

In [None]:
# Print the private collections (the `private_collections` summary built above) that still have
# deprecated terms not covered in this migration
uncovered_terms(private_collections)

In [None]:
#for each Collection in donor_updates, confirm that each specified donor's current dev_stage is deprecated
#for each Collection in donor_updates, confirm that no other donor has a deprecated dev_stage