In [None]:
import os
import pandas as pd
import sys

**Specify the directory that contains your clone of the [single-cell-curation repo](https://github.com/chanzuckerberg/single-cell-curation) (keep the trailing `/`) and the path to your API key file**

In [None]:
# Edit these two locations for your machine; '~' is expanded to the current user's home directory.
# Directory that CONTAINS the single-cell-curation clone (not the clone itself); keep the trailing slash.
_repo_parent_dir = '~/GitClones/CZI/'
# File holding your API key.
_api_key_file = '~/Documents/keys/cxg-api-key.txt'

scc_repo_loc = os.path.expanduser(_repo_parent_dir)
api_key_file_path = os.path.expanduser(_api_key_file)

In [None]:
# Make the `src` package shipped inside the cloned repo importable.
# os.path.join works whether or not scc_repo_loc ends with a path separator
# (plain string concatenation silently produced a wrong path without it), and the
# membership check keeps sys.path free of duplicates when this cell is re-run.
curation_api_python_dir = os.path.abspath(
    os.path.join(scc_repo_loc, 'single-cell-curation', 'notebooks', 'curation_api', 'python')
)
if curation_api_python_dir not in sys.path:
    sys.path.append(curation_api_python_dir)


from src.utils.config import set_api_access_config
from src.collection import get_collection

In [None]:
# Configure Curation API access using the key file path defined earlier in this notebook.
set_api_access_config(api_key_file_path) # pass env='dev' to work on the dev site

**Specify the Published ID of the Collection (not the Revision ID)**

In [None]:
# Paste the ID of the *published* Collection between the quotes (not the ID of its revision).
collection_id = ''

In [None]:
# Fail early with a readable message instead of an opaque error from the API helper.
if not collection_id:
    raise ValueError('collection_id is empty - set it to the published Collection ID')

# Published Collection as returned by get_collection.
collection = get_collection(collection_id)

# 'revising_in' is used as the ID of the revision to compare against;
# without it there is nothing to fetch or compare.
rev_id = collection.get('revising_in')
if not rev_id:
    raise ValueError('Collection ' + str(collection_id) + ' has no revision (revising_in is empty)')

# The revision, fetched the same way as the published Collection.
rev = get_collection(rev_id)

In [None]:
# Collection-level fields that are expected to differ between published and revision.
should_differ = [
    'collection_id', 'collection_url', 'collection_version_id',
    'created_at', 'revising_in', 'revision_of', 'visibility'
]
# Fields that may appear on the revision without appearing on the published Collection.
should_be_absent = [
    'processing_status'
]
# Fields that are only flagged as different here; their values are not printed.
values_not_printed = ['datasets', 'publisher_metadata']

# Walk every field of the revision and report unexpected differences from the published Collection.
for k, rev_val in rev.items():
    if k not in collection:
        # Field exists on the revision only.
        if k not in should_be_absent:
            print('not present: ' + k)
        continue

    pub_val = collection[k]
    if pub_val != rev_val and k not in should_differ:
        print('not same: ' + k)
        if k not in values_not_printed:
            # pub_val comes from `collection` (published) and rev_val from `rev` (revision);
            # str() keeps None / list / dict values from raising TypeError on concatenation.
            print('--- published: ' + str(pub_val))
            print('----- revised: ' + str(rev_val))

**If `datasets` differ then continue to parse out how they differ**

In [None]:
# Dataset-level fields that are expected to differ between published and revision.
should_differ = [
    'dataset_version_id','explorer_url','assets','revised_at','citation'
]

# Fields whose values are lists of dicts carrying a 'label' key; compared by label only.
ont_fields = [
    'assay','cell_type','development_stage','disease',
    'self_reported_ethnicity','sex','tissue'
]


def _comparable(prop, val):
    """Return `val` in an order-insensitive form suitable for an equality check.

    Values of ontology fields are reduced to their labels, and lists are returned
    as sorted *copies* so the dicts fetched from the API are never mutated.
    Anything that is not a list (including None for a missing property) is
    returned unchanged.
    """
    if prop in ont_fields and isinstance(val, list):
        val = [t['label'] for t in val]
    if isinstance(val, list):
        try:
            return sorted(val)
        except TypeError:
            # Elements that cannot be ordered (e.g. dicts): compare in original order.
            return list(val)
    return val


def _is_number(val):
    """Return True for int/float values; bool is excluded although it subclasses int."""
    return isinstance(val, (int, float)) and not isinstance(val, bool)


# Index both sides by dataset_id so datasets can be matched up.
pub_datasets = {d['dataset_id']: d for d in collection['datasets']}
rev_datasets = {d['dataset_id']: d for d in rev['datasets']}

# comp maps dataset_id -> {'title': ..., '<prop>_REV': ..., '<prop>_PUB': ...} for differing props.
comp = {}
for ds_id, rev_ds in rev_datasets.items():
    comp[ds_id] = {'title': rev_ds['title']}
    pub_ds = pub_datasets.get(ds_id)
    if pub_ds is None:
        comp[ds_id]['in published'] = 'not present'
        continue

    for prop, rev_raw in rev_ds.items():
        if prop in should_differ:
            continue
        rev_val = _comparable(prop, rev_raw)
        # .get() yields None when the published dataset lacks the property;
        # _comparable passes None through, so it is reported as a difference.
        pub_val = _comparable(prop, pub_ds.get(prop))
        if pub_val == rev_val:
            continue
        # Treat mean_genes_per_cell values that agree to 5 decimal places as equal.
        if (prop == 'mean_genes_per_cell'
                and _is_number(rev_val) and _is_number(pub_val)
                and round(rev_val, 5) == round(pub_val, 5)):
            continue
        comp[ds_id][prop + '_REV'] = rev_val
        comp[ds_id][prop + '_PUB'] = pub_val

# Datasets that exist only in the published Collection (i.e. missing from the revision).
for ds_id, pub_ds in pub_datasets.items():
    if ds_id not in rev_datasets:
        comp[ds_id] = {'title': pub_ds.get('title'), 'in revision': 'not present'}

# One row per dataset; keep only rows that have at least one recorded difference.
comp_df = pd.DataFrame(comp).transpose()
comp_df = comp_df.dropna(subset=[c for c in comp_df.columns if c != 'title'], how='all')
# Displayed with blanks instead of NaN; comp_df itself keeps NaN for later filtering.
comp_df.fillna('')

**Specify the properties whose long lists of values should be compared item by item**

In [None]:
# Property names (without the _REV/_PUB suffix) whose list values should be diffed item by item.
fields = ['tissue']

In [None]:
# For every selected property, print the items that appear on only one side
# (published vs. revision) for each dataset where the two lists differ.
for f in fields:
    rev_col = f + '_REV'
    pub_col = f + '_PUB'

    # These columns only exist when at least one dataset differed on this property.
    if rev_col not in comp_df.columns or pub_col not in comp_df.columns:
        print('no differences recorded for: ' + f)
        continue

    # Keep rows where both sides hold a value.
    temp = comp_df[comp_df[pub_col].notna() & comp_df[rev_col].notna()]
    for i, row in temp.iterrows():
        p = row[pub_col]
        r = row[rev_col]
        # Only list values can be diffed item by item; identical lists need no report.
        if not isinstance(p, list) or not isinstance(r, list) or p == r:
            continue
        print(str(i) + '-' + f)
        only_in_pub = [e for e in p if e not in r]
        only_in_rev = [e for e in r if e not in p]
        # str() so that non-string items do not break the join.
        print('only in pub:' + ','.join(str(e) for e in only_in_pub))
        print('only in rev:' + ','.join(str(e) for e in only_in_rev))
        print('---------')