In [1]:
import firecloud.api as fapi
import pandas as pd
import csv
import os

# --- Cloud storage layout -------------------------------------------------
# Every gs:// location used by this notebook lives under one bucket.
bucket = "fc-secure-d99fbd65-eb27-4989-95b4-4cf559aa7d36"
bcl_root = f"gs://{bucket}/bcls"
samplesheet_root = f"gs://{bucket}/samplesheets"
fastqs_root = f"gs://{bucket}/fastqs"

# --- Workspace coordinates passed to the firecloud.api (fapi) calls --------
# cnamespace is the third positional argument of the fapi config calls
# (presumably the method-configuration namespace; confirm against FISS docs).
namespace = "testmybroad"
workspace = "Macosko-Pipelines"
cnamespace = "macosko-pipelines"

# --- Index reference tables -------------------------------------------------
# Fetched over HTTP when this cell runs. dict2csv() matches ND on its
# "I7_Index_ID" column and NN / NT / TT on their "index_name" column.
indexes_root = "https://raw.githubusercontent.com/MacoskoLab/Macosko-Pipelines/main/bcl2fastq/indexes/"
NN, NT, TT, ND = (
    pd.read_csv(indexes_root + filename)
    for filename in ("SI-NN.csv", "SI-NT.csv", "SI-TT.csv", "ND7.csv")
)

In [2]:
def _lookup_index_sequences(index):
    """Return the sequence(s) registered for ``index`` in the reference tables.

    The module-level tables are searched in a fixed order: ND (matched on
    ``I7_Index_ID``), then TT, NT and NN (matched on ``index_name``). The
    first table containing the name wins.

    Returns:
        ``(seq,)`` for an ND hit, or ``(i7, i5)`` taken from the
        ``index(i7)`` and ``index2_workflow_b(i5)`` columns for a TT/NT/NN hit.

    Raises:
        IndexError: if ``index`` is present in none of the tables.
    """
    nd_hits = ND.loc[ND["I7_Index_ID"] == index, "index"]
    if len(nd_hits) > 0:
        return (nd_hits.values[0],)

    for table in (TT, NT, NN):
        hits = table.loc[table["index_name"] == index]
        if len(hits) > 0:
            return (hits["index(i7)"].values[0], hits["index2_workflow_b(i5)"].values[0])

    raise IndexError(f"ERROR: index {index} not found")


def dict2csv(indexes, s=8, d=10):
    """Write ``single.csv`` and ``dual.csv`` samplesheets for the given indexes.

    Args:
        indexes: mapping of lane -> iterable of index names. Lanes are
            processed in sorted order.
        s: number of leading characters kept from a single-index (ND) sequence.
        d: number of leading characters kept from each dual-index sequence.

    Both files are always (re)written in the current working directory, each
    starting with a ``[Data]`` line and a column-header line. ND indexes go to
    ``single.csv``; TT/NT/NN indexes go to ``dual.csv``.

    Raises:
        IndexError: if an index name is unknown. Nothing is written in that
            case, so existing sheets are left untouched.
        AssertionError: if either written sheet contains duplicate rows.
    """
    # Resolve every index before opening the files: an unknown name must not
    # leave half-written sheets behind that could later be uploaded.
    single_rows, dual_rows = [], []
    for lane in sorted(indexes.keys()):
        for index in indexes[lane]:
            seqs = _lookup_index_sequences(index)
            if len(seqs) == 1:
                single_rows.append([index, seqs[0][:s], lane])
            else:
                i7, i5 = seqs
                dual_rows.append([index, i7[:d], i5[:d], lane])

    with open('single.csv', 'w', newline='') as f1, open('dual.csv', 'w', newline='') as f2:
        single = csv.writer(f1)
        single.writerow(["[Data]"])
        single.writerow(["Sample_ID", "index", "Lane"])
        single.writerows(single_rows)

        dual = csv.writer(f2)
        dual.writerow(["[Data]"])
        dual.writerow(["Sample_ID", "index", "index2", "Lane"])
        dual.writerows(dual_rows)

    # Read the sheets back (skipping the "[Data]" line) and reject repeated rows.
    for path in ("single.csv", "dual.csv"):
        df = pd.read_csv(path, skiprows=1)
        repeated = df[df.duplicated()]
        assert len(repeated) == 0, f"ERROR: duplicate rows in {path}:\n{repeated}"
                    
def upload(bcl, samplesheet_root=samplesheet_root):
    if sum(1 for _ in open("single.csv")) > 2:
        !gsutil cp single.csv {samplesheet_root}/{bcl}/single.csv
    if sum(1 for _ in open("dual.csv")) > 2:
        !gsutil cp dual.csv {samplesheet_root}/{bcl}/dual.csv

def run(bcl, sheet, params = ""):
    """Point the "bcl2fastq" method configuration at one samplesheet and submit it.

    Args:
        bcl: run folder name; the BCL input becomes ``{bcl_root}/{bcl}``.
        sheet: samplesheet file name under ``{samplesheet_root}/{bcl}/``.
        params: string stored in the ``bcl2fastq.params`` input.

    Every input value is stored wrapped in double quotes.

    Raises:
        AssertionError: if validation reports extra, invalid or missing
            inputs, or invalid outputs.
        Exception: whatever ``raise_for_status()`` raises when a request fails
            (requests.HTTPError, assuming fapi returns requests.Response
            objects - confirm against the FISS documentation).
    """
    config = "bcl2fastq"

    # Update the configuration
    response = fapi.get_workspace_config(namespace, workspace, cnamespace, config)
    response.raise_for_status()
    body = response.json()
    body["inputs"]["bcl2fastq.technique"] = '"bcl2fastq"'
    body["inputs"]["bcl2fastq.bcl"] = f'"{bcl_root}/{bcl}"'
    body["inputs"]["bcl2fastq.samplesheet"] = f'"{samplesheet_root}/{bcl}/{sheet}"'
    body["inputs"]["bcl2fastq.params"] = f'"{params}"'
    response = fapi.update_workspace_config(namespace, workspace, cnamespace, config, body)
    response.raise_for_status()

    # Validate the configuration (same workspace constants as the calls above,
    # rather than repeating their values as literals)
    response = fapi.validate_config(namespace, workspace, cnamespace, config)
    response.raise_for_status()
    res = response.json()
    assert res["extraInputs"] == [], f"ERROR: extra input: \n{res['extraInputs']}"
    assert res["invalidInputs"] == {}, f"ERROR: invalid input: \n{res['invalidInputs']}"
    assert res["invalidOutputs"] == {}, f"ERROR: invalid output: \n{res['invalidOutputs']}"
    assert res["missingInputs"] == [], f"ERROR: missing input: \n{res['missingInputs']}"

    # Submit the job; only report success once the request has been accepted
    response = fapi.create_submission(namespace, workspace, cnamespace, config, user_comment=f"{sheet} {bcl}")
    response.raise_for_status()

    print(f"Submitted {sheet} for {bcl}")

In [10]:
# Upload the samplesheet
# Print both local sheets for a visual check, then hand them to upload(),
# which copies a sheet only if it has rows beyond its two header lines.
# NOTE(review): `bcl` is not assigned in any cell visible here, and no visible
# cell calls dict2csv() to produce the sheets - confirm both happen before
# this cell when running on a fresh kernel.
!cat single.csv
!cat dual.csv
upload(bcl)

In [4]:
# Submit jobs
# upload() copies a sheet to the bucket only when it has sample rows beyond
# its two header lines, so apply the same test here: submitting a sheet that
# was never uploaded would run against a missing or stale samplesheet.
for sheet in ("single.csv", "dual.csv"):
    with open(sheet) as f:
        has_samples = sum(1 for _ in f) > 2
    if has_samples:
        run(bcl, sheet)
    else:
        print(f"Skipping {sheet}: no samples")

In [6]:
# Check the FASTQ sizes
print(bcl)
sizes = !gsutil du -h {fastqs_root}/{bcl}
sizes = [size.split() for size in sizes]
sizes = [(size[0]+" "+size[1], os.path.basename(size[2])) for size in sizes if size[2][-9:] == ".fastq.gz"]
for lane in sorted(indexes.keys()):
    for index in indexes[lane]:
        res = [size[0] for size in sizes if f"{index}_S" in size[1] and f"_L00{lane}_" in size[1]]
        warning = "WARNING!!" if any(" B" in s or " KiB" in s or "MiB" in s for s in res) or len(res) == 0 else ""
        print(f"{lane} {index} {res} {warning}")

In [None]:
# List active bcl2fastq submissions (status is neither "Done" nor "Aborted")
# Uses the workspace constants from the configuration cell rather than
# repeating their values as literals.
subs = fapi.list_submissions(namespace, workspace).json()
subs = [
    sub for sub in subs
    if sub["status"] not in ["Done", "Aborted"]
    and sub["methodConfigurationName"].split("_")[0] == "bcl2fastq"
]
# Plain loop: a list comprehension of print() calls would echo [None, None, ...]
for sub in subs:
    print(sub)

In [None]:
# Abort bcl2fastq submissions
# Aborts every submission currently held in `subs`, which is built by the
# listing cell; run that cell first and review its output.
ids = [sub["submissionId"] for sub in subs]
for submission_id in ids:
    response = fapi.abort_submission(namespace, workspace, submission_id)
    # Show the outcome per submission so a failed abort is visible
    print(submission_id, response.status_code)