In [1]:
import json
import os
import shutil
import tempfile

from google.cloud import storage

## Create submission packages for each of the new datasets

### Make Directories

In [2]:
# Working directory for everything this notebook stages.
# tempfile.gettempdir() honours $TMPDIR when it is set and falls back to a platform
# default otherwise; os.path.join supplies the path separator that plain string
# concatenation dropped whenever $TMPDIR had no trailing slash.
testdir = os.path.join(tempfile.gettempdir(), 'process_10x_v2_datasets')
# exported so that the %%bash cells can refer to ${testdir}
os.environ['testdir'] = testdir
os.makedirs(testdir, exist_ok=True)

## Make the dependencies

In [36]:
# relative location of the skylab directory that holds the WDL files used below
skylab_directory = "../../../skylab"

In [37]:
# Static WDL dependencies, keyed by file name and mapped to their location
# inside the skylab directory.
task_wdl_names = [
    "StarAlignBamSingleEnd.wdl",
    "FastqToUBam.wdl",
    "Attach10xBarcodes.wdl",
    "SplitBamByCellBarcode.wdl",
    "TagGeneExon.wdl",
    "CorrectUmiMarkDuplicates.wdl",
    "CollectMultiplePicardMetrics.wdl",
    "MergeSortBam.wdl",
    "CreateCountMatrix.wdl",
]
workflow_dependencies = {
    wdl_name: "{0}/library/tasks/{1}".format(skylab_directory, wdl_name)
    for wdl_name in task_wdl_names
}
# the single sub-workflow lives in a different folder from the tasks
workflow_dependencies["AlignTagCorrectUmis.wdl"] = (
    "{0}/library/subworkflows/AlignTagCorrectUmis.wdl".format(skylab_directory)
)

In [4]:
# Stage every dependency WDL in one folder under the temp dir.
dependency_dir = '{}/dependencies'.format(testdir)
os.makedirs(dependency_dir, exist_ok=True)
for wdl_path in workflow_dependencies.values():
    shutil.copy(wdl_path, dependency_dir)

In [5]:
%%bash
# Bundle the staged dependency WDLs into dependencies.zip.
# Abort on the first failure so zip never runs in the wrong directory.
set -e
cd "${testdir}/dependencies"
# -q: quiet. The former -c flag makes zip ask for a one-line comment per entry
# on stdin, which serves no purpose here, so it is dropped.
zip -q dependencies.zip ./*

In [6]:
# path of the archive written by the zip step
dependencies_zip = '{}/dependencies/dependencies.zip'.format(testdir)

## Copy the workflow to the testing dir

In [39]:
%%bash
# Copy the main pipeline WDL into the working directory.
skylab_directory=../../../skylab
# both paths are quoted so a temp path containing spaces is not word-split
cp "${skylab_directory}/pipelines/optimus/Optimus.wdl" "${testdir}/"

## Make the inputs json files

### Make the 10x inputs

In [8]:
# Inputs shared by every sample: only the 10x whitelist file.
static_inputs = dict([
    ("Optimus.whitelist", "gs://broad-dsde-mint-dev-teststorage/10x/whitelist/737K-august-2016.txt"),
])

### Make the genome inputs

In [9]:
# STAR reference tarball, GTF annotation and FASTA for the human reference
grch38_reference_dir = "gs://hca-dcp-mint-test-data/reference/GRCh38_Gencode"
human_genome_inputs = {
    "Optimus.tar_star_reference": grch38_reference_dir + "/GRCh38_GencodeV27_Primary.tar",
    "Optimus.annotations_gtf": grch38_reference_dir + "/gencode.v27.primary_assembly.annotation.gtf",
    "Optimus.ref_genome_fasta": grch38_reference_dir + "/GRCh38.primary_assembly.genome.fa",
}

In [10]:
# STAR reference tarball, GTF annotation and FASTA for the mouse reference
grcm38_reference_dir = "gs://hca-dcp-mint-test-data/reference/GRCm38_Gencode"
mouse_genome_inputs = {
    "Optimus.tar_star_reference": "{}/GRCm38_star_genome.tar".format(grcm38_reference_dir),
    "Optimus.annotations_gtf": "{}/gencode.vM16.primary_assembly.annotation.gtf".format(grcm38_reference_dir),
    "Optimus.ref_genome_fasta": "{}/GRCm38.primary_assembly.genome.fa".format(grcm38_reference_dir),
}

In [11]:
# STAR reference tarball, GTF annotation and FASTA for the combined mouse + human reference
mmhg_reference_dir = "gs://hca-dcp-mint-test-data/reference/GRCm38_GRCh38_Gencode"
mouse_human_genome_inputs = {
    "Optimus.tar_star_reference": mmhg_reference_dir + "/GRCm38_GRCh38_star_genome.tar",
    "Optimus.annotations_gtf": mmhg_reference_dir + "/mmhg.gtf",
    "Optimus.ref_genome_fasta": mmhg_reference_dir + "/mmhg.fa",
}

In [12]:
# STAR reference tarball, GTF annotation and FASTA for the combined flu + human reference
hg_flu_reference_dir = "gs://hca-dcp-mint-test-data/reference/Flu_GRCh38_Gencode"
human_flu_genome_inputs = {
    "Optimus.tar_star_reference": "{}/Flu_GRCh38_star_genome.tar".format(hg_flu_reference_dir),
    "Optimus.annotations_gtf": "{}/hg_flu.gtf".format(hg_flu_reference_dir),
    "Optimus.ref_genome_fasta": "{}/hg_flu.fa".format(hg_flu_reference_dir),
}

In [13]:
# Notes on the flu data:
# the flu annotation file was not formatted properly; each record needs gene_name and transcript_name
# fields in addition to gene_id and transcript_id, and every field must be terminated with a semicolon.

### Make the sample inputs and combine with appropriate genomes

In [14]:
%%bash
# folder that receives one inputs JSON per sample;
# quoted so a temp path containing spaces is not word-split
mkdir -p "${testdir}/inputs"

In [15]:
# these samples have i1 files
sample = [
    ('A549_virus/10hr', human_flu_genome_inputs),
    ('A549_virus/6hr',  human_flu_genome_inputs),
    ('A549_virus/8hr',  human_flu_genome_inputs),
    ('A549_virus/8hr-rep2',  human_flu_genome_inputs),
    ('A549_virus/uninfected',  human_flu_genome_inputs),
    ('hgmm_12k', mouse_human_genome_inputs),
    ('hgmm_1k', mouse_human_genome_inputs),
    ('hgmm_6k', mouse_human_genome_inputs),
    ('pbmc4k', human_genome_inputs),
    ('pbmc8k', human_genome_inputs),
    ('t_3k', human_genome_inputs),
    ('t_4k', human_genome_inputs),
]


for s, g in sample:
    i1 = !gsutil ls gs://hca-dcp-mint-test-data/10x/$s/ | grep "I1"
    r1 = !gsutil ls gs://hca-dcp-mint-test-data/10x/$s/ | grep "R1"
    r2 = !gsutil ls gs://hca-dcp-mint-test-data/10x/$s/ | grep "R2"

    specific_inputs = {
        "Optimus.sample_id": s.replace('/', '_'),
        "Optimus.i1": i1,
        "Optimus.r1": r1,
        "Optimus.r2": r2
    }
    
    # build the sample inputs
    inputs_dict = {**static_inputs, **g, **specific_inputs}
    with open(testdir + '/inputs/{}_inputs.json'.format(s.replace('/', '_')), 'w') as f:
        json.dump(inputs_dict, f)

In [16]:
# these samples are r1 + r2 only (no i1 files)
sample = [
    ('kidney/human-organoid', human_genome_inputs),
    ('kidney/mouse-e18', mouse_genome_inputs),
    ('cd14mono/donor2', human_genome_inputs),
    ('rs_pbmc_barcoded/adt', mouse_human_genome_inputs),
    ('rs_pbmc_barcoded/hto', mouse_human_genome_inputs),
    ('rs_pbmc_barcoded/rna', mouse_human_genome_inputs)
]

for s, g in sample:
    r1 = !gsutil ls gs://hca-dcp-mint-test-data/10x/$s/ | grep "_1"
    r2 = !gsutil ls gs://hca-dcp-mint-test-data/10x/$s/ | grep "_2"

    specific_inputs = {
        "Optimus.sample_id": s.replace('/', '_'),
        "Optimus.r1": r1,
        "Optimus.r2": r2
    }
    
    # build the sample inputs
    inputs_dict = {**static_inputs, **g, **specific_inputs}
    with open(testdir + '/inputs/{}_inputs.json'.format(s.replace('/', '_')), 'w') as f:
        json.dump(inputs_dict, f)

In [40]:
# path of the top-level pipeline WDL inside the skylab directory
wdl = skylab_directory + '/pipelines/optimus/Optimus.wdl'

## Submit everything to Cromwell

In [17]:
import json
import os
from google.cloud import storage
import cromwell_manager as cwm

In [19]:
# Google Cloud Storage client bound to the dev project
storage_client = storage.Client(project='broad-dsde-mint-dev')

# The Cromwell connection settings are read from a JSON file in the user's
# home directory and passed to cwm.Cromwell as keyword arguments.
cromwell_config_path = os.path.expanduser('~/.ssh/mint_cromwell_config.json')
with open(cromwell_config_path) as config_file:
    cromwell_config = json.load(config_file)
cromwell_server = cwm.Cromwell(**cromwell_config)

In [20]:
input_dir = testdir + '/inputs/'
# Only pick up the generated "<sample>_inputs.json" files: a bare os.listdir
# also returns any stray entry in the folder (e.g. hidden files) and gives no
# ordering guarantee, so filter and sort for a reproducible submission list.
inputs = sorted(
    entry for entry in os.listdir(input_dir) if entry.endswith('_inputs.json')
)

In [27]:
# sample name -> workflow created for that sample
workflows = dict()

In [149]:
# Create one workflow per inputs file via cwm.Workflow.from_submission and
# keep the result keyed by sample name.
for inputs_filename in inputs:
    # "<sample>_inputs.json" -> "<sample>": cut at the last underscore
    stem = inputs_filename.rpartition('_')[0]
    sample_name = stem.split('/')[-1]
    workflows[sample_name] = cwm.Workflow.from_submission(
        wdl=wdl,
        inputs_json=input_dir + inputs_filename,
        cromwell_server=cromwell_server,
        workflow_dependencies=dependencies_zip,
        storage_client=storage_client,
    )

Successful Runs & Data Locations: 
```
A549_virus_10hr {'status': 'Succeeded', 'id': '4761f352-d5a4-4c10-852d-54cb24057021'}
A549_virus_6hr {'status': 'Succeeded', 'id': '01c3eb26-1d20-45dd-90d1-77b58faf43f4'}
A549_virus_8hr-rep2 {'status': 'Succeeded', 'id': '9866bf21-a96b-49c2-93d4-bbd87a5ea0f2'}
A549_virus_8hr {'status': 'Succeeded', 'id': '31c8a21e-e53a-4f04-88cb-6073cf811328'}
A549_virus_uninfected {'status': 'Succeeded', 'id': 'b4fe7d27-742f-4936-ab4f-60f14821846d'}
kidney_human-organoid {'status': 'Running', 'id': '1c988f53-9133-4e10-80b5-d9a5f602884b'}
kidney_mouse-e18 {'status': 'Running', 'id': 'ce9dda49-2b6e-462e-a323-78badc605ad3'}
cd14mono_donor2 {'status': 'Succeeded', 'id': '5a42e722-c674-47ea-ade5-5bd4ebfa0696'}
hgmm_12k {'status': 'Succeeded', 'id': '9b97f79c-8582-48d2-87b3-78936b4e45db'}
hgmm_1k {'status': 'Succeeded', 'id': 'f34bc386-5edf-4c63-b15f-ca19c3ca2a64'}
hgmm_6k {'status': 'Succeeded', 'id': 'cb6c3743-6640-443f-8981-db84430757f7'}
pbmc4k {'status': 'Succeeded', 'id': '12a865e4-3632-4b82-99c7-84dbbd73c39b'}
pbmc8k {'status': 'Succeeded', 'id': '32c84fc4-48dc-4aaa-943b-6a96aa7ff106'}
rs_pbmc_barcoded_adt {'status': 'Succeeded', 'id': '658e6717-65fb-42d5-a640-7a99e5521c1d'}
rs_pbmc_barcoded_hto {'status': 'Succeeded', 'id': 'aa8d8ca9-14b1-42b0-9bc5-a28bd13f0839'}
rs_pbmc_barcoded_rna {'status': 'Succeeded', 'id': 'b6d0d002-c597-4bb3-aff4-587673f284db'}
t_3k {'status': 'Succeeded', 'id': '92b813f7-748d-4727-a579-b7f737d01b8d'}
t_4k {'status': 'Succeeded', 'id': '8f17344c-6ef2-48c0-a6d3-9b7f39fa6f89'}
kidney_human-organoid {'status': 'Succeeded', 'id': '1c988f53-9133-4e10-80b5-d9a5f602884b'}
kidney_mouse-e18 {'status': 'Succeeded', 'id': 'ce9dda49-2b6e-462e-a323-78badc605ad3'}
```

## Move all the outputs to our public bucket. 

In [42]:
# these are the succeeding workflows
# key: the sample's sub-path under gs://hca-dcp-mint-test-data/10x/ (it is used
# as the copy destination); value: the Cromwell workflow id.
# The two kidney keys are spelled like the bucket folders ('kidney/human-organoid',
# 'kidney/mouse-e18'), so their outputs land beside the input data.
workflows = {
    "A549_virus/10hr":'4761f352-d5a4-4c10-852d-54cb24057021',
    "A549_virus/6hr":'01c3eb26-1d20-45dd-90d1-77b58faf43f4',
    "A549_virus/8hr-rep2":'9866bf21-a96b-49c2-93d4-bbd87a5ea0f2',
    "A549_virus/8hr":'31c8a21e-e53a-4f04-88cb-6073cf811328',
    "A549_virus/uninfected":'b4fe7d27-742f-4936-ab4f-60f14821846d',
    "cd14mono/donor2":'5a42e722-c674-47ea-ade5-5bd4ebfa0696',
    "hgmm_12k":'9b97f79c-8582-48d2-87b3-78936b4e45db',
    "hgmm_1k":'f34bc386-5edf-4c63-b15f-ca19c3ca2a64',
    "hgmm_6k":'cb6c3743-6640-443f-8981-db84430757f7',
    "pbmc4k":'12a865e4-3632-4b82-99c7-84dbbd73c39b',
    "pbmc8k":'32c84fc4-48dc-4aaa-943b-6a96aa7ff106',
    "rs_pbmc_barcoded/adt":'658e6717-65fb-42d5-a640-7a99e5521c1d',
    "rs_pbmc_barcoded/hto":'aa8d8ca9-14b1-42b0-9bc5-a28bd13f0839',
    "rs_pbmc_barcoded/rna":'b6d0d002-c597-4bb3-aff4-587673f284db',
    "t_3k":'92b813f7-748d-4727-a579-b7f737d01b8d',
    "t_4k":'8f17344c-6ef2-48c0-a6d3-9b7f39fa6f89',
    "kidney/human-organoid":'1c988f53-9133-4e10-80b5-d9a5f602884b',
    "kidney/mouse-e18":'ce9dda49-2b6e-462e-a323-78badc605ad3'
}

In [59]:
# sanity check: list what sits under the rs_pbmc_barcoded prefix in the bucket
!gsutil ls gs://hca-dcp-mint-test-data/10x/rs_pbmc_barcoded/

gs://hca-dcp-mint-test-data/10x/rs_pbmc_barcoded/adt/
gs://hca-dcp-mint-test-data/10x/rs_pbmc_barcoded/hto/
gs://hca-dcp-mint-test-data/10x/rs_pbmc_barcoded/rna/


In [51]:
outputs_to_produce = [
    'Optimus.matrix_summary', 
    'Optimus.matrix',
    'Optimus.bam',
    'Optimus.picard_metrics'
]
for name, wf in workflows.items():
    outputs = cromwell_server.outputs(wf).json()['outputs']
    for k in outputs_to_produce:
        output_location = outputs[k]
        !gsutil cp -m output_location gs://hca-dcp-mint-test-data/10x/${name}/