In [None]:
import pandas as pd
import sys
from IPython.display import display
from cellxgene_mods import CxG_API,compare_revision,report

# Add the parent directory to the module search path so the `public_resources`
# package imported on the next line can be resolved. This must stay above that import.
sys.path.append('..')
from public_resources.public_resources import detect_sequence_data


# Configure the CxG API helper before any API call is made in later cells.
CxG_API.config() # set env='dev' or 'staging' if working in either of those test environments

**Specify the Collection**

In [None]:
# Identifier of the Collection to QA; passed to CxG_API.get_collection() in the
# General QA cell. Fill this in before running the rest of the notebook.
collection_id = ''
# NOTE(review): presumably a local working directory; it is not referenced in the
# cells shown here -- confirm where it is consumed.
my_dir = ''

**General QA**
- If this is a Revision, review the updates
- Check for consortia
- Check for duplicate Dataset titles
- Review is_primary_data curation within Collection

In [None]:
# Fetch the Collection record for the id set in the parameter cell.
collection = CxG_API.get_collection(collection_id)

# A Collection that is either being revised or is itself a revision gets
# passed through compare_revision, whose return value replaces `collection`.
if any(collection.get(key) for key in ('revising_in', 'revision_of')):
    collection = compare_revision(collection)

# A missing/empty consortia field is not an error, but should be confirmed.
if not collection.get('consortia'):
    report('No consortia - confirm this is correct', 'WARNING')

# Per-Dataset summary table restricted to the fields reviewed in this QA step.
summary_columns = [
    'title',
    'dataset_id',
    'is_primary_data',
    'cell_count',
    'primary_cell_count',
]
df = pd.DataFrame(collection['datasets'])[summary_columns]

# keep=False marks every member of a duplicated group, so all clashing
# titles are shown together rather than only the repeats.
duplicate_titles = df[df.duplicated(subset='title', keep=False)]
if not duplicate_titles.empty:
    report('non-unique Dataset titles','ERROR')
    display(duplicate_titles.sort_values('title'))

# Last expression of the cell: render the table, ordered by is_primary_data
# (descending), for manual review of is_primary_data curation.
df.sort_values('is_primary_data', ascending=False)

**Validate links for presence of raw sequence data**

In [None]:
# Becomes True once raw sequence data has been positively detected at any link.
raw_data_link = False

for link in collection['links']:
    url = link['link_url']
    detected = detect_sequence_data(url)

    # A 2-tuple result is unpacked as (detection result, error message);
    # the error is reported and the detection result is evaluated as usual.
    if isinstance(detected, tuple) and len(detected) == 2:
        detected, error = detected
        report(error, 'ERROR')

    if not detected:
        # Nothing detected: only a problem when the link claims to be raw data.
        if link['link_type'] == 'RAW_DATA':
            report(f'link_type:RAW DATA but raw data not found at {url}', 'ERROR')
    elif detected != 'undetermined':
        # Raw data confirmed: the link must be typed RAW_DATA.
        raw_data_link = True
        if link['link_type'] == 'RAW_DATA':
            report(f'raw data found at {url}, link_type:RAW DATA', 'GOOD')
        else:
            report(f'raw data found at {url}, expecting link_type:RAW DATA, not {link["link_type"]}', 'ERROR')
    # NOTE(review): an 'undetermined' result produces no report here and does
    # not count as a raw data link -- confirm this is the intended handling.

if not raw_data_link:
    report('No raw data link present', 'WARNING')