In [None]:
import ftplib
import io
import json
import os
import sys
import tarfile
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import requests

**Specify the directory that contains your clone of the [single-cell-curation repo](https://github.com/chanzuckerberg/single-cell-curation) and the path to your API key file**

In [None]:
# Directory that CONTAINS the single-cell-curation clone. Keep the trailing slash:
# the repo-relative path is appended to this value by plain string concatenation.
scc_repo_loc = os.path.expanduser('~/GitClones/CZI/')
# Path to the API key file ('~' is expanded to the current user's home directory).
api_key_file_path = os.path.expanduser('~/Documents/keys/cxg-api-key-dev.txt')

In [None]:
# Put the curation_api python directory of the clone on sys.path so that its
# `src` package can be imported below.
sys.path.append(os.path.abspath(scc_repo_loc + 'single-cell-curation/notebooks/curation_api/python/'))


from src.utils.config import set_api_access_config
from src.collection import get_collection


# NOTE(review): api_key_file_path is defined in the previous cell but is not passed
# to this call — confirm whether set_api_access_config should receive it.
set_api_access_config() #use env='dev' to work on the dev site

In [None]:
# URL fragment -> short code of the repository that hosts the link.
# parse_url walks this mapping in insertion order and stops at the first hit.
base_urls = {
    'data.humancellatlas.org/explore/projects/': 'hca',
    'ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GS': 'geo',
    'ncbi.nlm.nih.gov/projects/gap': 'dbgap',
    'ncbi.nlm.nih.gov/bioproject/?term=PRJ': 'bioproj',
    'ega-archive.org': 'ega',
    'ebi.ac.uk/ena/browser/view/': 'ena',
    'ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB': 'arrex',
    'assets.nemoarchive.org/': 'nemo',
}


def parse_url(url):
    """Classify a link URL by the repository it points to.

    Returns the code of the first `base_urls` fragment that occurs in `url`,
    or 'other' when none of the known fragments is present.
    """
    matching_codes = (code for fragment, code in base_urls.items() if fragment in url)
    return next(matching_codes, 'other')

In [None]:
# SRA CloudFile `filetype` values that are treated as raw sequence data
ncbi_raw_data_formats = ['fastq','TenX','bam']

# Base URL shared by every NCBI E-utilities request made below
EUTILS_BASE = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'


def _eutils_get(path_and_query):
    """GET an E-utilities resource, raising requests.HTTPError on a non-2xx status.

    Without the status check an error document would be parsed like a normal
    answer and could be reported as "no raw data found".
    """
    response = requests.get(f'{EUTILS_BASE}/{path_and_query}')
    response.raise_for_status()
    return response


def _bioproject_from_geo(acc):
    """Return the BioProject accession found for a GEO accession, or None."""
    found = _eutils_get(f'esearch.fcgi?db=bioproject&term={acc}[Project Accession]&retmode=json').json()
    idlist = found['esearchresult']['idlist']
    if not idlist:
        return None
    #list of ids, ideally - only search entry type:Series; the first id is used
    record = ET.fromstring(_eutils_get(f'efetch.fcgi?db=bioproject&id={idlist[0]}').text)
    prj = None
    for archive_id in record.iter('ArchiveID'):
        # keep the last ArchiveID that carries an accession; skip ones without it
        prj = archive_id.attrib.get('accession', prj)
    return prj


def _sra_term_from_gds(acc):
    """Return the accession on the GDS record's 'SRA Run Selector:' line, or None."""
    found = _eutils_get(f'esearch.fcgi?db=gds&term={acc}[GEO Accession]&retmode=json').json()
    idlist = found['esearchresult']['idlist']
    if not idlist:
        return None
    prj = None
    for line in _eutils_get(f'efetch.fcgi?db=gds&id={idlist[0]}').text.split('\n'):
        if line.startswith('SRA Run Selector:'):
            prj = line.split('acc=')[-1]
    return prj


def validate_raw_ncbi(acc):
    """Report whether SRA lists raw-data files for an NCBI accession.

    Parameters
    ----------
    acc : str
        Accession taken from the link URL. Accessions starting with 'GS' are
        first resolved through the bioproject database and, if that yields
        nothing, through the gds database. Any other accession is used directly
        as the SRA search term.

    Returns
    -------
    bool
        True as soon as one RUN has a CloudFile whose filetype is listed in
        `ncbi_raw_data_formats`; False when no search term could be resolved or
        no such file is found.

    Raises
    ------
    requests.HTTPError
        If any E-utilities request answers with a non-2xx status.
    """
    if acc.startswith('GS'):
        prj = _bioproject_from_geo(acc)
        #some GSE don't return any results searching bioproject - e.g. GSM5027160
        if prj is None:
            prj = _sra_term_from_gds(acc)
    else:
        prj = acc

    if prj is None:
        return False

    found = _eutils_get(f'esearch.fcgi?db=sra&term={prj}&retmode=json&retmax=100000').json()
    idlist = found['esearchresult']['idlist']
    # fetch the SRA records in batches of 500 ids per request
    for start in range(0, len(idlist), 500):
        ids = ','.join(idlist[start:start + 500])
        records = ET.fromstring(_eutils_get(f'efetch.fcgi?db=sra&id={ids}').text)
        for ep in records.iter('EXPERIMENT_PACKAGE'):
            for run in ep.iter('RUN'):
                for cf in run.iter('CloudFile'):
                    # .get: a CloudFile without a filetype attribute is simply not a match
                    if cf.attrib.get('filetype') in ncbi_raw_data_formats:
                        return True
    return False

In [None]:
# File `format` values in the HCA files index that are treated as raw sequence data
hca_raw_data_formats = ['fastq.gz','fastq','fq.gz']


def validate_raw_hca(url):
    """Report whether an HCA project lists any raw-data files.

    Parameters
    ----------
    url : str
        HCA Data Explorer project URL; the last path segment is used as the
        project id (a trailing slash is ignored).

    Returns
    -------
    bool
        True as soon as a page of the project's file index contains a format
        listed in `hca_raw_data_formats`; False once all pages were read
        without a match.
    """
    api_base = 'https://service.azul.data.humancellatlas.org'
    # rstrip: '.../projects/<id>/' would otherwise give an empty project id
    pj_id = url.rstrip('/').split('/')[-1]
    query = {
        'projectId': {'is': [pj_id]}
    }
    q_url = f'{api_base}/index/files/?filters={json.dumps(query)}&size=250'
    page = requests.get(q_url, headers={'Content-Type': 'application/json'}).json()
    while True:
        formats_on_page = {f['format'] for h in page['hits'] for f in h['files']}
        # stop paginating on the first hit instead of downloading the whole index
        if formats_on_page.intersection(hca_raw_data_formats):
            return True
        next_endpoint = page['pagination']['next']
        if not next_endpoint:
            return False
        page = requests.get(next_endpoint).json()

In [None]:
def validate_raw_arrex(url):
    """Report whether an ArrayExpress study's SDRF lists ENA experiments.

    The study's FTP location is looked up through the BioStudies API, then
    `<accession>.sdrf.txt` is read from its `Files` directory.

    Parameters
    ----------
    url : str
        ArrayExpress study URL; the last path segment is used as the accession
        (a trailing slash is ignored).

    Returns
    -------
    bool
        True if any SDRF column whose name contains '[ENA_EXPERIMENT]' holds at
        least one non-null value, otherwise False.
    """
    acc = url.rstrip('/').split('/')[-1]
    api_base = 'https://www.ebi.ac.uk/biostudies/api/v1'
    q_url = f'{api_base}/studies/{acc}/info'
    ftp_link = requests.get(q_url).json()['ftpLink']

    filename = f'{acc}.sdrf.txt' #tab-delimited, with some fastq names
    # Download into memory: nothing is written to the working directory, and the
    # `with` block closes the FTP connection even if a command fails.
    sdrf_buffer = io.BytesIO()
    with ftplib.FTP('ftp.ebi.ac.uk', 'anonymous', 'anonymous@') as ftp:
        ftp.cwd(ftp_link.replace('ftp://ftp.ebi.ac.uk','') + '/Files')
        ftp.retrbinary(f'RETR {filename}', sdrf_buffer.write)
    sdrf_buffer.seek(0)

    sdrf = pd.read_csv(sdrf_buffer, sep='\t')
    for column in sdrf.columns:
        if '[ENA_EXPERIMENT]' in column and sdrf[column].notna().any():
            return True
    return False

In [None]:
# File-name suffixes in a NeMO bundle's fetch.txt that are treated as raw sequence data
nemo_raw_data_formats = ['fastq.tar']


def validate_raw_nemo(url, _visited=None):
    """Report whether a NeMO asset page leads to raw-data files.

    Every HTML table on the page is inspected. A table with a
    'Dataset Collection URL' row pointing at a '.tgz' bundle has that bundle
    downloaded and its 'fetch.txt' member(s) read; the first tab-separated
    field of each line is compared against `nemo_raw_data_formats`. A table
    without that row has its 'Identifier' value followed to another
    assets.nemoarchive.org page, which is checked the same way.

    Parameters
    ----------
    url : str
        URL of the NeMO asset page.
    _visited : set, optional
        Internal: URLs already checked in this call chain. It stops a page whose
        Identifier points back to an already-checked page from recursing
        forever. Callers should leave it unset.

    Returns
    -------
    bool
        True if a listed file ends with one of `nemo_raw_data_formats`,
        otherwise False.
    """
    visited = set() if _visited is None else _visited
    visited.add(url)

    for df in pd.read_html(url):
        if 'Dataset Collection URL' in df['Field'].unique():
            coll_url = df.loc[df['Field'] == 'Dataset Collection URL']['Value'].iloc[0]
            if str(coll_url).endswith('.tgz'):
                r = requests.get(coll_url)
                file_list = []
                # Read the bundle from memory: no temp.tgz or extracted files are
                # left in the working directory, the archive is always closed, and
                # member paths from the remote archive are never written to disk.
                with tarfile.open(fileobj=io.BytesIO(r.content), mode='r:gz') as tar:
                    for item in tar:
                        if item.name.endswith('fetch.txt'):
                            manifest = tar.extractfile(item)
                            if manifest is None:
                                # extractfile returns None for non-regular members
                                continue
                            for line in manifest.read().decode('utf-8').split('\n'):
                                file_list.append(line.split('\t')[0])
                if any(f.endswith(tuple(nemo_raw_data_formats)) for f in file_list):
                    return True
        else:
            i = df.loc[df['Field'] == 'Identifier']['Value'].iloc[0].split(':')[1]
            next_url = 'https://assets.nemoarchive.org/' + i
            if next_url in visited:
                # already checked (or currently being checked) in this call chain
                continue
            if validate_raw_nemo(next_url, visited):
                return True
    return False

In [None]:
#https://metadata.ega-archive.org/spec
# File `extension` values in the EGA metadata API that are treated as raw sequence data
ega_raw_data_formats = ['fastq.gz','bam','cram']


def validate_raw_ega(url):
    """Report whether an EGA study or dataset lists any raw-data files.

    The last URL segment is the accession and the segment before it is the
    object type. For 'studies' every dataset of the study is checked; any other
    object type is treated as a single dataset accession.

    Returns True at the first dataset that has a file whose extension is in
    `ega_raw_data_formats`, otherwise False.
    """
    api_base = 'https://metadata.ega-archive.org'
    segments = url.split('/')
    acc = segments[-1]
    obj_type = segments[-2]

    if obj_type != 'studies':
        datasets = [acc]
    else:
        study_datasets = requests.get(f'{api_base}/studies/{acc}/datasets?limit=100000').json()
        datasets = [entry['accession_id'] for entry in study_datasets]

    for dataset_acc in datasets:
        dataset_files = requests.get(f'{api_base}/datasets/{dataset_acc}/files?limit=100000').json()
        raw_files = [entry for entry in dataset_files if entry['extension'] in ega_raw_data_formats]
        if len(raw_files) > 0:
            return True
    return False

**Specify the Collection to check**<br>
If the Collection is a Revision, use the Revision ID, not the Published ID

In [None]:
# ID of the Collection that is fetched and checked in the cells that follow;
# for a Revision, fill in the Revision ID rather than the Published ID.
collection_id = ''

In [None]:
#check is_primary_data
collection = get_collection(collection_id)
pd.DataFrame(collection['datasets'])[['title','dataset_id','is_primary_data','cell_count','primary_cell_count']]

In [None]:
# Check each Collection link: does its link_type agree with whether raw data is found at the URL?
# Links resolved through an accession (GEO, BioProject, ENA, dbGaP) must also be named after it.
# Open question: should ArrayExpress links be name-checked too?
# Note: dbGaP accessions shouldn't include the version (not checked here).
accession_resources = ['geo','bioproj','ena','dbgap']
url_validators = {
    'hca': validate_raw_hca,
    'arrex': validate_raw_arrex,
    'nemo': validate_raw_nemo,
    'ega': validate_raw_ega,
}

for link in collection['links']:
    url = link['link_url']
    ltype = link['link_type']
    name = link['link_name'] if link['link_name'] else 'NO NAME'
    resource = parse_url(url)

    if resource in accession_resources:
        # the accession is whatever follows the last '=' (and the last '/') in the URL
        acc = url.split('=')[-1].split('/')[-1]
        raw_present = validate_raw_ncbi(acc)
        if acc not in name:
            print(f'ERROR: expecting {url} to be named {acc}, not {name}')
    elif resource in url_validators:
        raw_present = url_validators[resource](url)
    else:
        # unrecognised host: no validator available, treated as "no raw data found"
        raw_present = False

    if raw_present:
        if ltype != 'RAW_DATA':
            print(f'ERROR: raw data found at {url}, expecting RAW DATA link_type, not {ltype}')
    elif ltype == 'RAW_DATA':
        print(f'ERROR: link_type:RAW DATA but raw data not found at {url}')