In [1]:
from datetime import datetime
import lattice
import json
import requests
import pandas as pd
from urllib.parse import urljoin


# Open a lattice connection for the selected mode; its server URL and auth
# are reused by the requests made in the cells below.
mode = 'prod'
connection = lattice.Connection(mode)
prod_server = connection.server

In [5]:
# Base URL of the demo instance that is compared against prod_server.
# NOTE(review): hard-coded host - presumably needs updating for each new demo.
demo_server = 'http://ec2-13-57-3-12.us-west-1.compute.amazonaws.com/'

In [3]:
def compare_search(add_url, audit=False):
    """Compare facet term counts for one search between prod and demo.

    Args:
        add_url: Search path (including query string) that is joined onto
            both ``prod_server`` and ``demo_server``.
        audit: When True, collect the terms of every facet whose ``field``
            starts with ``'audit'``; otherwise only the terms of the first
            facet in the response are used.

    Returns:
        A DataFrame indexed by ``type`` with ``prod`` and ``demo`` count
        columns, one row per term whose counts differ (a term missing on one
        side counts as 0), or the string ``'No difference'`` when every
        count matches.
    """
    def facet_counts(server):
        # Run the search against `server` and flatten the selected facets
        # into {term key: doc_count}.
        results = requests.get(urljoin(server, add_url), auth=connection.auth).json()
        if audit:
            return {
                t['key']: t['doc_count']
                for f in results['facets']
                if f['field'].startswith('audit')
                for t in f['terms']
            }
        return {t['key']: t['doc_count'] for t in results['facets'][0]['terms']}

    prod = facet_counts(prod_server)
    demo = facet_counts(demo_server)

    issues = []
    # Terms known to prod whose demo count differs (absent on demo == 0).
    for k, v in prod.items():
        demo_count = demo.get(k, 0)
        if v != demo_count:
            issues.append({'type': k, 'prod': v, 'demo': demo_count})
    # Terms that exist only on demo.
    issues.extend(
        {'type': k, 'prod': 0, 'demo': v}
        for k, v in demo.items()
        if k not in prod
    )

    if issues:
        return pd.DataFrame(issues).set_index('type')
    return 'No difference'

In [6]:
#ensure that the demo is done indexing
# Query the demo's _indexer endpoint and print its status, followed by either
# the time elapsed since 'started' (any status other than 'waiting') or the
# first result's 'cycle_took' value (when waiting and results are present).
url = urljoin(demo_server, '_indexer')
r = requests.get(url, auth=connection.auth).json()
print(r['status'])
if r['status'] != 'waiting':
    # 'started' is parsed as a naive timestamp and subtracted from naive UTC now
    start = datetime.strptime(r['started'], '%Y-%m-%dT%H:%M:%S.%f')
    now = datetime.utcnow()
    elapsed = now - start
    print(elapsed)
elif r['results']:
    print(r['results'][0]['cycle_took'])

waiting


In [None]:
# Compare the term counts of the first facet of an all-types search between prod and demo.
print('COMPARE total object counts per type')
compare_search('search/?type=*')

In [None]:
# Same comparison, restricted to objects matching audit.INTERNAL_ACTION=*.
print('COMPARE object counts with INTERNAL ACTION')
compare_search('search/?type=*&audit.INTERNAL_ACTION=*')

In [None]:
# audit=True compares the terms of every facet whose field starts with 'audit'.
print('COMPARE object counts with AUDITS')
compare_search('search/?type=*', audit=True)

In [None]:
#check properties of OntologyTerm objects
# Fetch the slim properties of every OntologyTerm from prod and from demo and
# list each (term, slim) whose prod value differs on demo.
slim_fields = [
    'organ_slims',
    'system_slims',
    'cell_slims',
    'development_slims',
    'disease_slims',
    'ethnicity_slims',
    'qa_slims'
]

# The same search path is issued against both servers.
slim_search = 'search/?type=OntologyTerm&limit=all&field=' + '&field='.join(slim_fields)

url = urljoin(prod_server, slim_search)
results = requests.get(url, auth=connection.auth).json()
# {term @id: {slim field: value}}, keeping only the slim fields present on the term
prod_terms = {
    r['@id']: {sf: r[sf] for sf in slim_fields if sf in r}
    for r in results['@graph']
}

# This request must go to demo_server: it previously queried prod_server a
# second time, so prod was compared with itself and no difference could show.
url = urljoin(demo_server, slim_search)
results = requests.get(url, auth=connection.auth).json()
demo_terms = {
    r['@id']: {sf: r[sf] for sf in slim_fields if sf in r}
    for r in results['@graph']
}

# NOTE(review): only prod terms and prod slim fields are iterated, so a term
# or slim that exists only on demo is not reported - confirm that is intended.
issues = []
for k,v in prod_terms.items():
    if k in demo_terms:
        for k2,v2 in v.items():
            if v2 != demo_terms[k].get(k2):
                issues.append({
                    'term': k,
                    'slim': k2,
                    'prod': v2,
                    'demo': demo_terms[k].get(k2)
                })
    else:
        issues.append({
            'term': k,
            'demo': 'absent'
        })
pd.DataFrame(issues)

# Validate S3 URIs

In [None]:
# Check that the s3_uri of every matching file object resolves to an existing
# S3 object, then report the counts and list the URIs that failed.
obj_type = 'RawSequenceFile'
#RawMatrixFile, etc...
#CAN I JUST DO A SEARCH WITH THE FILE/DATAFILE CLASS?

errors = []
success = 0

#DEMO?
# `server` is only assigned in a later cell, so this cell raised NameError on
# a fresh kernel; query prod_server explicitly instead.
# NOTE(review): switch to demo_server if the demo is the intended target.
url = urljoin(prod_server, f'search/?limit=all&format=json&type={obj_type}&status!=deleted&s3_uri=*&field=s3_uri&dataset=%2Fdatasets%2FLATDS692ZDC%2F&audit.ERROR.category=file+not+validated')

# NOTE(review): `s3` and `botocore` are not imported or defined in the cells
# shown - presumably `import boto3, botocore` and `s3 = boto3.resource('s3')`
# are needed before this cell can run; confirm.
results = requests.get(url, auth=connection.auth).json()
for o in results['@graph']:
    uri = o['s3_uri']
    # s3://<bucket>/<key> -> the bucket is the third '/'-separated piece
    bucket = uri.split('/')[2]
    file = uri.replace('s3://' + bucket + '/', '')
    try:
        s3.Object(bucket, file).load()
    except botocore.exceptions.ClientError as e:
        # Every client error marks the URI invalid; non-404 codes are also flagged.
        if e.response['Error']['Code'] != "404":
            print('some other error')
        errors.append((o['@id'], uri))
    else:
        success += 1

print(f'{success} valid URIs')
print(f'{len(errors)} invalid URIs')
for e in errors:
    print(e[0] + ': ' + e[1])

# always enforce unique arrays

In [None]:
# Print every array property (top-level, or one level inside an object
# property) that is not marked notSubmittable and lacks a truthy uniqueItems.
server = 'https://www.lattice-data.org/' #DEMO?
url = urljoin(server, 'profiles/?format=json')
results = requests.get(url).json()
addprops = [] #WHAT IS THIS FOR
for schema_name, schema in results.items():
    # these two keys of the profiles response are skipped
    if schema_name in ('_subtypes', '@type'):
        continue
    for prop_name, prop in schema['properties'].items():
        if prop['type'] == 'object':
            for sub_name, sub_prop in prop['properties'].items():
                is_checked_array = (
                    sub_prop['type'] == 'array'
                    and sub_name != '@type'
                    and sub_prop.get('notSubmittable') != True
                )
                if is_checked_array and not sub_prop.get('uniqueItems'):
                    print('.'.join((schema_name, prop_name, sub_name)))
        elif (
            prop['type'] == 'array'
            and prop_name != '@type'
            and prop.get('notSubmittable') != True
            and not prop.get('uniqueItems')
        ):
            print('.'.join((schema_name, prop_name)))

# additional props qa

In [None]:
# For every object property (or array whose items are objects), print its
# name and 'required' list, and collect those whose additionalProperties is
# anything other than False.
server = 'https://www.lattice-data.org/' #DEMO?
url = urljoin(server, 'profiles/?format=json')
results = requests.get(url).json()
addprops = []
for schema_name, schema in results.items():
    if schema_name in ('_subtypes', '@type'):
        continue
    for prop_name, prop in schema['properties'].items():
        # Pick the sub-schema that describes the embedded object, if any.
        if prop['type'] == 'object':
            spec = prop
        elif prop['type'] == 'array' and prop['items']['type'] == 'object':
            spec = prop['items']
        else:
            continue
        label = schema_name + '.' + prop_name
        print(label)
        print(spec.get('required'))
        if spec.get('additionalProperties') != False:
            addprops.append(label)
        print('--------')
addprops

# validate slim labels

In [None]:
# Locations of ontology_term.py: a local checkout path (machine-specific) and
# the same file on GitHub. The string below is only displayed, never used.
# NOTE(review): presumably the module that defines the slim term dictionaries - confirm.
'/Users/jason/GitClones/Lattice-Data/encoded/src/encoded/types/ontology_term.py'
#https://github.com/Lattice-Data/encoded/blob/dev/src/encoded/types/ontology_term.py
#https://raw.githubusercontent.com/Lattice-Data/encoded/refs/heads/dev/src/encoded/types/ontology_term.py

In [18]:
from ..cellxgene_resources.cellxgene_mods import *

ImportError: attempted relative import with no known parent package

In [8]:
# Attempt to import system_slim_terms directly from a sibling checkout of
# encoded by putting its types/ directory on sys.path.
# The recorded run of this cell failed with
# "ModuleNotFoundError: No module named 'snovault'".
import sys

sys.path.append('../../encoded/src/encoded/types/')
from ontology_term import system_slim_terms


system_slim_terms

ModuleNotFoundError: No module named 'snovault'

In [None]:
# TODO: download py?
#https://raw.githubusercontent.com/Lattice-Data/encoded/refs/heads/dev/src/encoded/types/ontology_term.py
# then check things
# then remove?

In [20]:
# System slims: UBERON term ID -> label. Trailing comments name the other
# slim term(s) that an entry is a subclass of.
# NOTE(review): presumably copied by hand from encoded's ontology_term.py - confirm it is in sync.
system_slim_terms = {
    'UBERON:0000363': 'reticuloendothelial system', #subclass of UBERON:0002405
    'UBERON:0002405': 'immune system',
    'UBERON:0004535': 'cardiovascular system', #subclass of UBERON:0001009
    'UBERON:0001009': 'circulatory system',
    'UBERON:0001017': 'central nervous system', #subclass of UBERON:0001016
    'UBERON:0000010': 'peripheral nervous system', #subclass of UBERON:0001016
    'UBERON:0001016': 'nervous system',
    'UBERON:0000383': 'musculature of body', #subclass of UBERON:0002204
    'UBERON:0001434': 'skeletal system', #subclass of UBERON:0002204
    'UBERON:0002204': 'musculoskeletal system',
    'UBERON:0001032': 'sensory system', #subclass of UBERON:0004456
    'UBERON:0004456': 'entire sense organ system',
    'UBERON:0001007': 'digestive system',
    'UBERON:0000949': 'endocrine system',
    'UBERON:0002330': 'exocrine system',
    'UBERON:0002390': 'hematopoietic system',
    'UBERON:0002416': 'integumental system',
    'UBERON:0001008': 'renal system',
    'UBERON:0000990': 'reproductive system',
    'UBERON:0001004': 'respiratory system'
}

# Organ slims: UBERON term ID -> label. Trailing comments name the other
# slim term(s) that an entry is a subclass of.
organ_slim_terms = {
    'UBERON:0001155': 'colon', #subclass of UBERON:0000059,UBERON:0000160,UBERON:0001555
    'UBERON:0000059': 'large intestine', #subclass of UBERON:0000160,UBERON:0001555
    'UBERON:0002108': 'small intestine', #subclass of UBERON:0000160,UBERON:0001555
    'UBERON:0000160': 'intestine', #subclass of UBERON:0001555
    'UBERON:0001723': 'tongue', #subclass of UBERON:0000165,UBERON:0001555
    'UBERON:0001829': 'major salivary gland', #subclass of UBERON:0000165,UBERON:0002365,UBERON:0001555
    'UBERON:0000165': 'mouth', #subclass of UBERON:0001555
    'UBERON:0001043': 'esophagus', #subclass of UBERON:0001555
    'UBERON:0000945': 'stomach', #subclass of UBERON:0001555
    'UBERON:0006562': 'pharynx', #subclass of UBERON:0001555
    'UBERON:0001350': 'coccyx', #subclass of UBERON:0001474,UBERON:0004288
    'UBERON:0004288': 'skeleton',
    'UBERON:0002371': 'bone marrow', #subclass of UBERON:0001474
    'UBERON:0001474': 'bone element',
    'UBERON:0000007': 'pituitary gland', #subclass of UBERON:0000955,UBERON:0002368
    'UBERON:0003547': 'brain meninx', #subclass of UBERON:0000955
    'UBERON:0000955': 'brain',
    'UBERON:0002182': 'main bronchus', #subclass of UBERON:0002185
    'UBERON:0002185': 'bronchus',
    'UBERON:0000998': 'seminal vesicle', #subclass of UBERON:0000991,UBERON:0000473
    'UBERON:0000473': 'testis', #subclass of UBERON:0000991
    'UBERON:0000992': 'ovary', #subclass of UBERON:0000991
    'UBERON:0000991': 'gonad',
    'UBERON:0002073': 'hair follicle', #subclass of UBERON:0002097,UBERON:0000483
    'UBERON:0001820': 'sweat gland', #subclass of UBERON:0002097,UBERON:0000483,UBERON:0002365
    'UBERON:0001821': 'sebaceous gland', #subclass of UBERON:0002097,UBERON:0000483,UBERON:0002365
    'UBERON:0002097': 'skin of body',
    'UBERON:0001817': 'lacrimal gland', #subclass of UBERON:0000970,UBERON:0002365
    'UBERON:0000970': 'eye',
    'UBERON:0000029': 'lymph node', #subclass of UBERON:0005057
    'UBERON:0002106': 'spleen', #subclass of UBERON:0005057
    'UBERON:0002370': 'thymus', #subclass of UBERON:0002368,UBERON:0005057
    'UBERON:0002107': 'liver', #subclass of UBERON:0002368,UBERON:0002365
    'UBERON:0002369': 'adrenal gland', #subclass of UBERON:0002368
    'UBERON:0002046': 'thyroid gland', #subclass of UBERON:0002368
    'UBERON:0001132': 'parathyroid gland', #subclass of UBERON:0002368
    'UBERON:0002368': 'endocrine gland',
    'UBERON:0001911': 'mammary gland', #subclass of UBERON:0002365
    'UBERON:0000414': 'mucous gland', #subclass of UBERON:0002365
    'UBERON:0002365': 'exocrine gland',
    'UBERON:0000178': 'blood', #subclass of UBERON:0006314
    'UBERON:0006314': 'bodily fluid',
    'UBERON:0003509': 'arterial blood vessel', #subclass of UBERON:0001981,UBERON:0002049
    'UBERON:0001638': 'vein', #subclass of UBERON:0001981,UBERON:0002049
    'UBERON:0001981': 'blood vessel', #subclass of UBERON:0002049
    'UBERON:0001473': 'lymphatic vessel', #subclass of UBERON:0002049
    'UBERON:0000043': 'tendon', #subclass of UBERON:0002384
    'UBERON:0001013': 'adipose tissue', #subclass of UBERON:0002384
    'UBERON:0001987': 'placenta', #subclass of UBERON:0016887
    'UBERON:0000310': 'breast',
    'UBERON:0001103': 'diaphragm',
    'UBERON:0001690': 'ear',
    'UBERON:0000922': 'embryo',
    'UBERON:0003889': 'fallopian tube',
    'UBERON:0002110': 'gallbladder',
    'UBERON:0000948': 'heart',
    'UBERON:0002113': 'kidney',
    'UBERON:0001737': 'larynx',
    'UBERON:0002101': 'limb',
    'UBERON:0002048': 'lung',
    'UBERON:0001744': 'lymphoid tissue',
    'UBERON:0001021': 'nerve',
    'UBERON:0000004': 'nose',
    'UBERON:0001264': 'pancreas',
    'UBERON:0000989': 'penis',
    'UBERON:0002407': 'pericardium',
    'UBERON:0002367': 'prostate gland',
    'UBERON:0002240': 'spinal cord',
    'UBERON:0003126': 'trachea',
    'UBERON:0000056': 'ureter',
    'UBERON:0000057': 'urethra',
    'UBERON:0001255': 'urinary bladder',
    'UBERON:0000995': 'uterus',
    'UBERON:0000996': 'vagina',
    'UBERON:0001555': 'digestive tract',
    'UBERON:0002049': 'vasculature',
    'UBERON:0002384': 'connective tissue',
    'UBERON:0005057': 'immune organ',
    'UBERON:0007844': 'cartilage element',
    'UBERON:0016887': 'entire extraembryonic component',
    'UBERON:0000483': 'epithelium'
}

# Cell slims: CL (plus two EFO) term ID -> label. Trailing comments name the
# other slim term(s) that an entry is a subclass of.
cell_slim_terms = {
    'CL:0000236': 'B cell', #subclass of CL:0000542,CL:0000738,CL:0000988
    'CL:0000084': 'T cell', #subclass of CL:0000542,CL:0000738,CL:0000988
    'CL:0000542': 'lymphocyte', #subclass of CL:0000763,CL:0000738,CL:0000988
    'CL:0000094': 'granulocyte', #subclass of CL:0000763,CL:0000738,CL:0000988
    'CL:0000576': 'monocyte', #subclass of CL:0000763,CL:0000738,CL:0000988
    'CL:0000763': 'myeloid cell', #subclass of CL:0000988
    'CL:0000738': 'leukocyte', #subclass of CL:0000988
    'CL:0000988': 'hematopoietic cell',
    'CL:0000312': 'keratinocyte', #subclass of CL:0000066
    'CL:0000115': 'endothelial cell', #subclass of CL:0000066
    'CL:0000066': 'epithelial cell',
    'CL:0000057': 'fibroblast', #subclass of CL:0002320
    'CL:0000669': 'pericyte', #subclass of CL:0002320
    'CL:0002320': 'connective tissue cell',
    'CL:0002321': 'embryonic cell (metazoa)',
    'CL:0002494': 'cardiocyte',
    'CL:0000148': 'melanocyte',
    'CL:0000056': 'myoblast',
    'CL:0002319': 'neural cell',
    'CL:0000192': 'smooth muscle cell',
    'CL:0000034': 'stem cell',
    'EFO:0004905': 'induced pluripotent stem cell', #subclass of CL:0000034
    'EFO:0002886': 'stem cell derived cell line' #subclass of CL:0000034
}

# Disease slims: MONDO term ID -> label. Trailing comments name the other
# slim term(s) that an entry is a subclass of.
disease_slim_terms = {
    'MONDO:0005015': 'diabetes mellitus', #subclass of MONDO:0004335,MONDO:0005066
    'MONDO:0004335': 'digestive system disorder',
    'MONDO:0005066': 'metabolic disease',
    'MONDO:0002280': 'anemia',
    'MONDO:0005578': 'arthritic joint disease',
    'MONDO:0005113': 'bacterial infectious disease',
    'MONDO:0004992': 'cancer',
    'MONDO:0005044': 'hypertensive disorder',
    'MONDO:0005240': 'kidney disorder',
    'MONDO:0005084': 'mental disorder',
    'MONDO:0100081': 'sleep disorder',
    'MONDO:0007179': 'autoimmune disease'
}

# Development slims: HsapDv term ID -> stage label.
development_slim_terms = {
    "HsapDv:0000002": "embryonic stage",
    "HsapDv:0000037": "fetal stage",
    "HsapDv:0000262": "newborn stage (0-28 days)",
    "HsapDv:0000260": "nursing stage (0-11 months)",
    "HsapDv:0000265": "child stage (1-4 yo)",
    "HsapDv:0000271": "juvenile stage (5-14 yo)",
    "HsapDv:0000258": "adult stage"
}

# Ethnicity slims: HANCESTRO term ID -> label.
ethnicity_slim_terms = {
    "HANCESTRO:0009": "East Asian",
    "HANCESTRO:0006": "South Asian",
    "HANCESTRO:0007": "South East Asian",
    "HANCESTRO:0008": "Asian",
    "HANCESTRO:0005": "European",
    "HANCESTRO:0014": "Hispanic or Latin American",
    "HANCESTRO:0010": "African",
    "HANCESTRO:0017": "Oceanian",
    "HANCESTRO:0016": "African American or Afro-Caribbean",
    "HANCESTRO:0015": "Greater Middle Eastern (Middle Eastern, North African or Persian)",
    "HANCESTRO:0013": "Native American"
}

# QA slims: term ID (CL, EFO and NCIT entries) -> label.
qa_slim_terms = {
    "CL:0000000": "cell",
    "EFO:0010183": "single cell library construction",
    'NCIT:C81239': 'Cause of Death',
    'NCIT:C7057': 'Disease, Disorder or Finding',
    'NCIT:C3394': 'Suicide',
    'NCIT:C28554': 'Dead'
}

In [None]:
# TODO: no need to redo this search - start from the results fetched above.
# Fetches every OntologyTerm on demo, restricted to the slim fields.
url = urljoin(demo_server, 'search/?type=OntologyTerm&limit=all&field=' + '&field='.join(slim_fields))
results = requests.get(url, auth=connection.auth).json()

# TODO: actually we just need to see if any slims are missing OntologyTerm objects.
# We check audits above, right?

In [21]:
#BETTER WAY TO GET THE SLIM TERMS?
# Walk each slim dictionary in turn and print every term ID followed by its
# label, each on its own line.
all_slim_dicts = (
    system_slim_terms,
    organ_slim_terms,
    cell_slim_terms,
    disease_slim_terms,
    development_slim_terms,
    ethnicity_slim_terms,
    qa_slim_terms,
)
for slim_dict in all_slim_dicts:
    for term_id, label in slim_dict.items():
        print(term_id, label, sep='\n')

UBERON:0000363
reticuloendothelial system
UBERON:0002405
immune system
UBERON:0004535
cardiovascular system
UBERON:0001009
circulatory system
UBERON:0001017
central nervous system
UBERON:0000010
peripheral nervous system
UBERON:0001016
nervous system
UBERON:0000383
musculature of body
UBERON:0001434
skeletal system
UBERON:0002204
musculoskeletal system
UBERON:0001032
sensory system
UBERON:0004456
entire sense organ system
UBERON:0001007
digestive system
UBERON:0000949
endocrine system
UBERON:0002330
exocrine system
UBERON:0002390
hematopoietic system
UBERON:0002416
integumental system
UBERON:0001008
renal system
UBERON:0000990
reproductive system
UBERON:0001004
respiratory system
UBERON:0001155
colon
UBERON:0000059
large intestine
UBERON:0002108
small intestine
UBERON:0000160
intestine
UBERON:0001723
tongue
UBERON:0001829
major salivary gland
UBERON:0000165
mouth
UBERON:0001043
esophagus
UBERON:0000945
stomach
UBERON:0006562
pharynx
UBERON:0001350
coccyx
UBERON:0004288
skeleton
UBERON: