In [1]:
# for workflow management
import json
import os
from google.cloud import storage
import cromwell_manager as cwm

# Build the Cromwell client from the JSON config stored under ~/.ssh; the
# file's top-level keys are forwarded as keyword arguments to cwm.Cromwell.
cromwell_config_path = os.path.expanduser('~/.ssh/mint_cromwell_config.json')
with open(cromwell_config_path) as config_file:
    cromwell_kwargs = json.load(config_file)
cromwell_server = cwm.Cromwell(**cromwell_kwargs)

# Google Cloud Storage client bound to the dev project.
storage_client = storage.Client(project='broad-dsde-mint-dev')

In [2]:
# Temporary manual step: this is expected to be set outside the notebook
# later. Until then, expose a local wdltool jar through the `wdltool`
# environment variable.
wdltool_jar = os.path.expanduser('~/google_drive/software/wdltool-0.14.jar')
os.environ['wdltool'] = wdltool_jar

## Create a UMI file for testing

Mock up a UMI file with randomers; kallisto expects a newline-separated file with only one UMI on each line. Upload this to the bucket.

In [None]:
import os
import random

import sctools

# Resolve everything under the home directory instead of hard-coding one
# user's absolute path; this matches the `~/local_test_data/umis.txt` path
# used by the gsutil upload cell.
local_test_data = os.path.expanduser('~/local_test_data')
fastq_path = os.path.join(local_test_data, 'SRR1295257_1.fastq.gz')
umi_path = os.path.join(local_test_data, 'umis.txt')

# One mock UMI line is written per unit counted by len(rd).
# NOTE(review): assumes len(rd) is the number of fastq records -- confirm
# against the sctools Reader implementation.
rd = sctools.fastq.Reader(fastq_path, 'r')
file_length = len(rd)

alphabet = 'ACGT'
umi_length = 4  # length of each mock UMI, in bases
# A private, seeded generator makes the mock file reproducible across runs
# without touching the global `random` state.
umi_rng = random.Random(0)
with open(umi_path, 'w') as f:
    for _ in range(file_length):
        f.write(''.join(umi_rng.choices(alphabet, k=umi_length)) + '\n')

In [20]:
# Upload the mock UMI file to the test bucket; the destination is the same
# gs:// path that is passed to the workflow as "test_kallisto.umi".
!gsutil cp ~/local_test_data/umis.txt gs://broad-dsde-mint-dev-teststorage/patel_ap/SRR1295257_mock_umis.txt

Copying file:///Users/carra1/local_test_data/umis.txt [Content-Type=text/plain]...
\ [1 files][  4.4 MiB/  4.4 MiB]                                                
Operation completed over 1 objects/4.4 MiB.                                      


## Run the test wdl

In [10]:
# Workflow inputs as (key, value) pairs; keys follow "<workflow>.<input name>".
workflow_inputs = (
    ("test_kallisto.transcriptome_fasta", "gs://broad-dsde-mint-dev-teststorage/reference/GRCh38_Gencode/gencode.v27.transcripts.fa.gz"),
    ("test_kallisto.r1", "gs://broad-dsde-mint-dev-teststorage/patel_ap/SRR1295257_1.fastq.gz"),
    ("test_kallisto.r2", "gs://broad-dsde-mint-dev-teststorage/patel_ap/SRR1295257_2.fastq.gz"),
    ("test_kallisto.umi", "gs://broad-dsde-mint-dev-teststorage/patel_ap/SRR1295257_mock_umis.txt"),
    ("test_kallisto.k", 15),
)
inputs_json = dict(workflow_inputs)

# The test WDL to run.
wdl = 'test_kallisto.wdl'

# WDL files the test workflow depends on: name -> path relative to this notebook.
dependencies = dict([('Kallisto.wdl', '../pipelines/tasks/Kallisto.wdl')])

In [15]:
# Validate the WDL, its inputs and its dependencies before submitting anything.
cwm.Workflow.validate(
    wdl=wdl,
    inputs_json=inputs_json,
    cromwell_server=cromwell_server,
    storage_client=storage_client,
    workflow_dependencies=dependencies,
)

CWM:2017-11-13 23:22:55.023595:creating temporary directory
CWM:2017-11-13 23:22:55.024036:writing dependencies
CWM:2017-11-13 23:22:55.035982:writing wdl
CWM:2017-11-13 23:22:55.036902:running wdltool validate
CWM:2017-11-13 23:22:56.415079:validation successful
CWM:2017-11-13 23:22:56.897222:checking docker image humancellatlas/kallisto:0.43.1... OK.


In [16]:
# Submit the workflow and keep the returned object; later cells read its
# `status` and `outputs` attributes.
test_kallisto = cwm.Workflow.from_submission(
    wdl=wdl,
    inputs_json=inputs_json,
    cromwell_server=cromwell_server,
    storage_client=storage_client,
    workflow_dependencies=dependencies,
)

In [17]:
# Show the submitted workflow's id and current status as the cell output.
test_kallisto.status

{'id': '2d57ff39-b10d-4a6d-8ed3-acdd478eff48', 'status': 'Succeeded'}

## Verify the outputs

In [19]:
# Show the workflow's output files: gs:// paths keyed by
# "<workflow>.<task>.<output name>".
test_kallisto.outputs

{'id': '2d57ff39-b10d-4a6d-8ed3-acdd478eff48',
 'outputs': {'test_kallisto.Mkref.index': 'gs://broad-dsde-mint-dev-cromwell-execution/cromwell-executions/test_kallisto/2d57ff39-b10d-4a6d-8ed3-acdd478eff48/call-Mkref/kallisto.idx',
  'test_kallisto.PseudoSingleEndUMI.pseudo_cells': 'gs://broad-dsde-mint-dev-cromwell-execution/cromwell-executions/test_kallisto/2d57ff39-b10d-4a6d-8ed3-acdd478eff48/call-PseudoSingleEndUMI/matrix.cells',
  'test_kallisto.PseudoSingleEndUMI.pseudo_ec': 'gs://broad-dsde-mint-dev-cromwell-execution/cromwell-executions/test_kallisto/2d57ff39-b10d-4a6d-8ed3-acdd478eff48/call-PseudoSingleEndUMI/matrix.ec',
  'test_kallisto.PseudoSingleEndUMI.pseudo_tsv': 'gs://broad-dsde-mint-dev-cromwell-execution/cromwell-executions/test_kallisto/2d57ff39-b10d-4a6d-8ed3-acdd478eff48/call-PseudoSingleEndUMI/matrix.tsv',
  'test_kallisto.PseudoSingleEndUMI.run_log': 'gs://broad-dsde-mint-dev-cromwell-execution/cromwell-executions/test_kallisto/2d57ff39-b10d-4a6d-8ed3-acdd478eff48

### Verify PseudoSingleEndUMI
Pseudo single-end UMI should produce pseudo results only; given that I mocked up a random UMI file, I don't expect to see much overlap between multiple transcripts.

In [23]:
%%bash
# Print run_info.json from the PseudoSingleEndUMI call: kallisto's run
# metadata (version, number of reads processed, and the command line used).
gsutil cat gs://broad-dsde-mint-dev-cromwell-execution/cromwell-executions/test_kallisto/2d57ff39-b10d-4a6d-8ed3-acdd478eff48/call-PseudoSingleEndUMI/run_info.json

{
	"n_targets": 200401,
	"n_bootstraps": 0,
	"n_processed": 913447,
	"kallisto_version": "0.43.1",
	"index_version": 10,
	"start_time": "Tue Nov 14 04:51:01 2017",
	"call": "kallisto pseudo --index /cromwell_root/broad-dsde-mint-dev-cromwell-execution/cromwell-executions/test_kallisto/2d57ff39-b10d-4a6d-8ed3-acdd478eff48/call-Mkref/kallisto.idx --output-dir . --single --umi --threads 4 --batch batch.txt"
}


Visualize the first 10 lines of the .ec (equivalence class) file. This is a two-column file of read_index and ec_index. We expect this to mostly contain matched indices at the beginning of the file.

In [24]:
# Show the first 10 lines of matrix.ec from the PseudoSingleEndUMI call.
# NOTE(review): kallisto's matrix.ec is presumably "EC index <tab> comma-separated
# transcript indices" rather than read/EC indices -- confirm against the
# kallisto `pseudo` documentation.
!gsutil cat gs://broad-dsde-mint-dev-cromwell-execution/cromwell-executions/test_kallisto/2d57ff39-b10d-4a6d-8ed3-acdd478eff48/call-PseudoSingleEndUMI/matrix.ec | head -n 10

0	0
1	1
2	2
3	3
4	4
5	5
6	6
7	7
8	8
9	9


In [26]:
# Show the first 10 rows of matrix.tsv (EC counts). The file has no header row.
# Column 0 is the EC index and column 2 is the number of UMIs mapped to that EC.
# NOTE(review): the meaning of column 1 is not yet confirmed; presumably it is
# the cell index (0 throughout because only one cell is in the batch) -- check
# the kallisto `pseudo` documentation.
!gsutil cat gs://broad-dsde-mint-dev-cromwell-execution/cromwell-executions/test_kallisto/2d57ff39-b10d-4a6d-8ed3-acdd478eff48/call-PseudoSingleEndUMI/matrix.tsv | head -n 10

90	0	3
96	0	65
126	0	7
129	0	2
140	0	1
261	0	2
271	0	5
303	0	1
330	0	2
433	0	1


### Verify QuantSingleEnd & QuantPairedEnd
Here, we just want to see the pseudobam outputs. The samtools view error is expected: it is the result of piping to head.

In [28]:
# View the first 10 pseudobam records from the QuantSingleEnd call.
# samtools reports an error because the output is piped to `head`; that is expected.
!gsutil cat gs://broad-dsde-mint-dev-cromwell-execution/cromwell-executions/test_kallisto/2d57ff39-b10d-4a6d-8ed3-acdd478eff48/call-QuantSingleEnd/pseudo.bam | samtools view | head -n 10

GLPB22-B5C:585:h0tpladxx:1:1101:3628:2241	4	*	0	0	*	*	0	0	TCCTGGAAGCAGGGCATTTGTTGCT	@BCFFFFFHHHGHJJJJJJIJJJJI
GLPB22-B5C:585:h0tpladxx:1:1101:6746:2219	4	*	0	0	*	*	0	0	TTTCAGTACAGAGTAGATACAGAAT	?@@DDDDDHHHHGCFH@GEHIIIIH
GLPB22-B5C:585:h0tpladxx:1:1101:8068:2250	4	*	0	0	*	*	0	0	TTGCTGGCGCGCGTGCGTTCATTGC	@BBDDFFFHFHHHEFGHGHIIJJJE
GLPB22-B5C:585:h0tpladxx:1:1101:11075:2200	16	ENST00000581862.5|ENSG00000263711.5|OTTHUMG00000178984.2|OTTHUMT00000444214.2|AC079062.1-203|AC079062.1|3610|lincRNA|	661	255	25M	*	0	0	ATACCACTGCTTATCCCATGTACTC	JFAIGHHJJIGJHHHHHFFFDFCCB	NH:i:1
GLPB22-B5C:585:h0tpladxx:1:1101:15295:2242	16	ENST00000338432.11|ENSG00000076555.15|OTTHUMG00000169250.5|OTTHUMT00000403077.1|ACACB-201|ACACB|9360|protein_coding|	1033	255	25M	*	0	0	GTATCAACGCAGAGTACATAAGCAG	GIGCJJIJIGHJHGHHHFFFFFCC@	NH:i:4
GLPB22-B5C:585:h0tpladxx:1:1101:15295:2242	272	ENST00000377854.9|ENSG00000076555.15|OTTHUMG00000169250.5|-|ACACB-203|ACACB|6588|protein_coding|	923	255	25M	*	0	0	GTATCAACGCAGAGTACATAA

In [27]:
# View the first 10 pseudobam records from the QuantPairedEnd call.
# samtools reports an error because the output is piped to `head`; that is expected.
!gsutil cat gs://broad-dsde-mint-dev-cromwell-execution/cromwell-executions/test_kallisto/2d57ff39-b10d-4a6d-8ed3-acdd478eff48/call-QuantPairedEnd/pseudo.bam | samtools view | head -n 10

GLPB22-B5C:585:h0tpladxx:1:1101:3628:2241	77	*	0	0	*	*	0	0	TCCTGGAAGCAGGGCATTTGTTGCT	@BCFFFFFHHHGHJJJJJJIJJJJI
GLPB22-B5C:585:h0tpladxx:1:1101:3628:2241	141	*	0	0	*	*	0	0	GTGNTACNNNNNNNNNNNNNNNNNN	<;<#2=@##################
GLPB22-B5C:585:h0tpladxx:1:1101:6746:2219	77	*	0	0	*	*	0	0	TTTCAGTACAGAGTAGATACAGAAT	?@@DDDDDHHHHGCFH@GEHIIIIH
GLPB22-B5C:585:h0tpladxx:1:1101:6746:2219	141	*	0	0	*	*	0	0	NNNNNNNNNNNNNNNNNNNNNNNNN	#########################
GLPB22-B5C:585:h0tpladxx:1:1101:8068:2250	77	*	0	0	*	*	0	0	TTGCTGGCGCGCGTGCGTTCATTGC	@BBDDFFFHFHHHEFGHGHIIJJJE
GLPB22-B5C:585:h0tpladxx:1:1101:8068:2250	141	*	0	0	*	*	0	0	GTTCTGCATGAAAATTTCCAGNNNN	<;<?@@@@@@??@@@???>?@####
GLPB22-B5C:585:h0tpladxx:1:1101:11075:2200	89	ENST00000581862.5|ENSG00000263711.5|OTTHUMG00000178984.2|OTTHUMT00000444214.2|AC079062.1-203|AC079062.1|3610|lincRNA|	661	255	25M	=	661	0	ATACCACTGCTTATCCCATGTACTC	JFAIGHHJJIGJHHHHHFFFDFCCB	NH:i:1
GLPB22-B5C:585:h0tpladxx:1:1101:11075:2200	165	ENST00000581862.5|ENSG00000263711.