In [37]:
import pandas as pd
import glob
import csv
import os
from os.path import basename

In [38]:
# Index lookup tables, fetched from the Macosko-Pipelines repository.
indexes_root = "https://raw.githubusercontent.com/MacoskoLab/Macosko-Pipelines/main/bcl2fastq/indexes/"
# Read order matches the unpacking order: NN, NT, TT, ND.
NN, NT, TT, ND = (
    pd.read_csv(f"{indexes_root}{table_name}.csv")
    for table_name in ("SI-NN", "SI-NT", "SI-TT", "ND7")
)

In [39]:
def _first_match(table, name_col, name):
    """Return the first row of `table` whose `name_col` equals `name`, or None if absent."""
    hits = table.loc[table[name_col] == name]
    return None if hits.empty else hits.iloc[0]


def dict2df(indexes, nd=None, dual_tables=None):
    """Build the sample-sheet rows for a flowcell from a {lane: [index names]} dict.

    Parameters
    ----------
    indexes : dict
        Maps a lane number (1-8) to a list of unique index names for that lane.
    nd : pandas.DataFrame, optional
        Single-index table with columns "I7_Index_ID" and "index".
        Defaults to the notebook-level ``ND`` table.
    dual_tables : sequence of pandas.DataFrame, optional
        Dual-index tables with columns "index_name", "index(i7)" and
        "index2_workflow_a(i5)", searched in order.
        Defaults to the notebook-level ``(TT, NT, NN)`` tables.

    Returns
    -------
    pandas.DataFrame
        Columns ``Lane, Sample_ID, index, index2``; rows are ordered by lane,
        then by position in that lane's list.

    Raises
    ------
    AssertionError
        If `indexes` is not a dict, a lane is outside 1-8, or a lane's value is
        not a duplicate-free list.
    IndexError
        If an index name is found in none of the tables.
    """
    # Resolve the defaults at call time so the notebook globals are only needed
    # when the caller does not pass tables explicitly.
    if nd is None:
        nd = ND
    if dual_tables is None:
        dual_tables = (TT, NT, NN)

    # ND rows get "AT" appended to the i7 sequence and a fixed i5 sequence; this
    # is the "buffer padding" that is stripped again below for all-ND flowcells.
    nd_i7_pad = "AT"
    nd_i5_pad = "ACCGAGATCT"

    assert type(indexes) == dict, "indexes must be a dict of {lane: [index names]}"
    rows = []
    for lane in sorted(indexes.keys()):
        assert lane in [1,2,3,4,5,6,7,8], f"lane {lane!r} is not in 1-8"
        assert type(indexes[lane])==list and len(indexes[lane])==len(set(indexes[lane])), \
            f"lane {lane}: expected a list of unique index names"
        for index in indexes[lane]:
            # The ND table takes precedence over the dual-index tables.
            hit = _first_match(nd, "I7_Index_ID", index)
            if hit is not None:
                rows.append([lane, index, hit["index"] + nd_i7_pad, nd_i5_pad])
                continue
            for table in dual_tables:
                hit = _first_match(table, "index_name", index)
                if hit is not None:
                    rows.append([lane, index, hit["index(i7)"], hit["index2_workflow_a(i5)"]])
                    break
            else:
                raise IndexError(f"ERROR: index {index} not found")

    df = pd.DataFrame(rows, columns=['Lane', 'Sample_ID', 'index', 'index2'])

    # If whole flowcell is 8bp, remove the buffer padding
    if (df['index2'] == nd_i5_pad).all():
        df['index'] = df['index'].str[:8]
        df['index2'] = df['index2'].str[:8]

    return df

### Choose a run: set `bcl`, `indexes`, `cycles` (run one cell)

In [40]:
# Naeem sample: a single index (SI-TT-C8) on lane 6
bcl = "/broad/macosko_storage/macosko_lab_GP_depo/240808_SL-EXC_0341_A22CTCMLT4"
indexes = {
    6: ["SI-TT-C8"],
}
cycles = [150, 10, 10, 150]
samplesheet = dict2df(indexes)
# FASTQ DONE

In [45]:
# 1.2cm puck batch (lanes 3-5)
# https://docs.google.com/spreadsheets/d/1xD1IvMF-YEp7qE-9jpTyg_sjpNdURSBE1msZ8iLChq0
bcl = "/broad/macosko_storage/macosko_lab_GP_depo/240923_SL-EXH_0185_B22G7HYLT4"
indexes = {
    3: ["D701", "D702", "D703"],
    4: ["D704", "D705", "D706", "D707"],
    5: ["D708", "D709", "D710", "D701"],
}
cycles = [160, 10, 10, 158]
samplesheet = dict2df(indexes)
# FASTQ DONE

In [50]:
# White matter: the same four indexes on every lane (1-8)
# https://docs.google.com/spreadsheets/d/1tFFn7R8b0z9EEUnoYaG8hk-vQK6OlG9YFg3Qmq2pey8?gid=906070385
bcl = "/broad/gpbican/mccarroll_bican_bcls/241002_SL-EXH_0192_A22NLKLLT3"
indexes = {
    lane: ["SI-TT-B9", "SI-TT-B10", "SI-TT-G3", "SI-TT-G4"]
    for lane in range(1, 9)
}
cycles = [28, 10, 10, 90]
samplesheet = dict2df(indexes)
# FASTQ DONE

In [55]:
# 2cm puck batch: the same four indexes on every lane (1-8)
# https://docs.google.com/spreadsheets/d/1tFFn7R8b0z9EEUnoYaG8hk-vQK6OlG9YFg3Qmq2pey8?gid=978890763
bcl = "/broad/gpbican/mccarroll_bican_bcls/241018_SL-EXA_0332_B22VFMLLT3"
indexes = {
    lane: ["D703", "D704", "D705", "D706"]
    for lane in range(1, 9)
}
cycles = [50, 10, 10, 90]
samplesheet = dict2df(indexes)
# FASTQ DONE

In [60]:
# Calico LC: the same eight indexes on every lane (1-8)
# Macosko Sequencing Experiments - rows 2176-2183 - 22VFK7LT3
bcl = "/broad/macosko_storage/macosko_lab_GP_depo/241018_SL-EXA_0333_A22VFK7LT3"
indexes = {
    lane: [
        "SI-TT-C9", "SI-TT-C10",
        "SI-TT-E1", "SI-TT-E2", "SI-TT-E3", "SI-TT-E4", "SI-TT-E5", "SI-TT-E6",
    ]
    for lane in range(1, 9)
}
cycles = [28, 10, 10, 90]
samplesheet = dict2df(indexes)
# FASTQ DONE

### Write the SampleSheet

In [32]:
# No two rows of the sample table may be identical
assert not samplesheet.duplicated().any()
# The sheet is stored under a directory named after the BCL run folder
sheet_path = os.path.join(
    "/", "discopipeline", "samplesheets",
    basename(bcl),
    "SampleSheet.csv",
)
print(sheet_path)

/discopipeline/samplesheets/241018_SL-EXA_0333_A22VFK7LT3/SampleSheet.csv


In [33]:
# Write the file
# All validation happens BEFORE the file is opened: opening with 'w' truncates
# any existing sheet, so a failed check must not leave a partial file behind.
assert len(samplesheet) > 0, "samplesheet is empty"
assert type(cycles) == list and all(type(c) == int for c in cycles) and len(cycles) == 4

# A single OverrideCycles setting is written for the whole sheet, so every row
# must share the same index lengths (taken from the first row by position).
i1len = len(samplesheet["index"].iloc[0])
i2len = len(samplesheet["index2"].iloc[0])
assert (samplesheet["index"].str.len() == i1len).all(), "index sequences differ in length"
assert (samplesheet["index2"].str.len() == i2len).all(), "index2 sequences differ in length"
assert cycles[0] > 0 and cycles[1] >= i1len and cycles[2] >= i2len and cycles[3] > 0

# cycles = [read 1, index 1, index 2, read 2]; index cycles beyond the index
# length are emitted as an N segment after the I segment.
R1 = f"Y{cycles[0]}"
R2 = f"I{i1len}" + ("" if cycles[1]==i1len else f"N{cycles[1]-i1len}")
R3 = f"I{i2len}" + ("" if cycles[2]==i2len else f"N{cycles[2]-i2len}")
R4 = f"Y{cycles[3]}"

os.makedirs(os.path.dirname(sheet_path), exist_ok=True)
with open(sheet_path, 'w') as f:
    f.write("[Settings]\n")
    f.write("CreateFastqForIndexReads,0\n") # default: 0
    f.write("NoLaneSplitting,false\n") # default: false
    f.write(f"OverrideCycles,{R1};{R2};{R3};{R4}\n")
    f.write("\n")

    f.write("[Data]\n")
# The header file is closed at this point; append the sample table below [Data]
samplesheet.to_csv(sheet_path, mode='a', index=False)

In [65]:
# Make sure it looks good: print the contents of the file at sheet_path
!cat {sheet_path}

### Run bcl-convert

In [35]:
# Show the BCL run directory that is currently selected
print(bcl)

/broad/macosko_storage/macosko_lab_GP_depo/241018_SL-EXA_0333_A22VFK7LT3


In [66]:
# Assemble (but do not run) the srun + bcl-convert command line for this run
root = "/broad/macosko/data/discopipeline"
binary = f"{root}/software/bcl-convert-4.3.6-2.el8.x86_64/bin/bcl-convert"
srun = " ".join([
    "srun",
    "-C RedHat8",
    f"-J bcl-convert-{basename(bcl)[-10:]}",  # job name uses the last 10 characters of the run folder
    "-c 32",
    "--mem 96G",
    "--time 96:00:00",
])
params = " ".join([
    f"--bcl-input-directory={bcl}",
    f"--output-directory={root}/fastqs/{basename(bcl)}",
    f"--sample-sheet={root}/samplesheets/{basename(bcl)}/SampleSheet.csv",
])  # --strict-mode=true
print(srun, binary, params)

### Validate FASTQ sizes

In [61]:
# FASTQ output directory, named after the BCL run folder
fastq_path = os.path.join("/", "discopipeline", "fastqs", basename(bcl))
print(fastq_path)

/discopipeline/fastqs/241018_SL-EXA_0333_A22VFK7LT3


In [62]:
print(bcl)
sizes = !du -sh {fastq_path}/*
sizes = [size.split() for size in sizes]
sizes = [(size[0], basename(size[1])) for size in sizes if size[1][-9:] == ".fastq.gz"]
# print(sizes)

for lane in sorted(indexes.keys()):
    for index in indexes[lane]:
        res = [size[0] for size in sizes if f"{index}_S" in size[1] and f"_L00{lane}_" in size[1]]
        warning = "WARNING!!" if any("B" in s or "K" in s or "M" in s for s in res) or len(res) != 2 else ""
        print(f"{lane} {index} {res} {warning}")

/broad/macosko_storage/macosko_lab_GP_depo/241018_SL-EXA_0333_A22VFK7LT3
1 SI-TT-C9 ['2.4G', '5.1G'] 
1 SI-TT-C10 ['1.4G', '3.0G'] 
1 SI-TT-E1 ['4.6G', '9.9G'] 
1 SI-TT-E2 ['5.6G', '13G'] 
1 SI-TT-E3 ['2.3G', '4.8G'] 
1 SI-TT-E4 ['5.4G', '12G'] 
1 SI-TT-E5 ['9.0G', '20G'] 
1 SI-TT-E6 ['4.8G', '11G'] 
2 SI-TT-C9 ['2.4G', '5.1G'] 
2 SI-TT-C10 ['1.5G', '3.0G'] 
2 SI-TT-E1 ['4.6G', '9.9G'] 
2 SI-TT-E2 ['5.6G', '12G'] 
2 SI-TT-E3 ['2.3G', '4.8G'] 
2 SI-TT-E4 ['5.4G', '12G'] 
2 SI-TT-E5 ['9.0G', '20G'] 
2 SI-TT-E6 ['4.8G', '11G'] 
3 SI-TT-C9 ['2.4G', '5.0G'] 
3 SI-TT-C10 ['1.4G', '3.0G'] 
3 SI-TT-E1 ['4.6G', '9.8G'] 
3 SI-TT-E2 ['5.6G', '12G'] 
3 SI-TT-E3 ['2.3G', '4.8G'] 
3 SI-TT-E4 ['5.4G', '12G'] 
3 SI-TT-E5 ['8.9G', '20G'] 
3 SI-TT-E6 ['4.8G', '11G'] 
4 SI-TT-C9 ['2.4G', '5.0G'] 
4 SI-TT-C10 ['1.5G', '3.0G'] 
4 SI-TT-E1 ['4.6G', '9.9G'] 
4 SI-TT-E2 ['5.6G', '13G'] 
4 SI-TT-E3 ['2.3G', '4.8G'] 
4 SI-TT-E4 ['5.4G', '12G'] 
4 SI-TT-E5 ['9.0G', '20G'] 
4 SI-TT-E6 ['4.8G', '11G'] 
5 SI-TT-C9 

### Delete Undetermined FASTQs

In [67]:
# List the files
undetermined_pattern = os.path.join(fastq_path, 'Undetermined_S0_*_001.fastq.gz')
files_to_delete = glob.glob(undetermined_pattern)
for path in files_to_delete:
    print(path)

In [64]:
# Delete each file
for path in files_to_delete:
    os.remove(path)

### Documentation

https://support-docs.illumina.com/APP/AppBCLConvert_v2_0/Content/APP/SampleSheets_swBCL_swBS_appBCL.htm
https://knowledge.illumina.com/software/general/software-general-reference_material-list/000008935
https://knowledge.illumina.com/software/general/software-general-reference_material-list/000003710

### Other commands

In [111]:
# Print a ready-to-paste command for viewing the run's RunInfo.xml
runinfo_path = os.path.join(bcl, "RunInfo.xml")
print("less", runinfo_path)

less /broad/macosko_storage/macosko_lab_GP_depo/241018_SL-EXA_0333_A22VFK7LT3/RunInfo.xml
