## Converter from the Excel table to a valid sample sheet for 10x Cell Ranger

In [1]:
# main imports
import os
import pandas as pd

# Display settings: show all columns, up to 200 rows and long cell contents
from IPython.display import display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
from IPython.core.interactiveshell import InteractiveShell
# "all" makes a cell display every expression statement, not only the last one
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('max_colwidth', 200)
# switches off pandas' chained-assignment (SettingWithCopy) warning
pd.options.mode.chained_assignment = None

# sys is needed to extend the import path for the project code below
import sys

### Base directories are taken from the environment for more flexibility.
# Both HOME and WORK have to be set, otherwise a KeyError is raised here.
home = os.environ['HOME']
work = os.environ['WORK']

# The project code folder has to be appended to sys.path before
# script_utils can be imported, so the order of these lines matters.
code_path = os.path.join(home, "Sites/sceleton")
sys.path.append(os.path.join(code_path, "code/py"))
from script_utils import load_config, show_output, get_path, full_path
# relative path to the folder holding the config files
config_path = "../configs"
# config is accessed like a nested dict in the cells below (e.g. config['cellranger'])
config = load_config(os.path.join(config_path, "test_config.yml"))


[1;36;1mconfig file ../configs/test_config.yml successfully loaded[0m
[1;30;1mAdded /Users/martinszyska/Sites/Bio/cellrangersnake/scripts/py to python path for imports[0m
[1;30;1mLoading additional config CR_config from configs/cellranger_config.yml[0m


### get the sample_df

In [10]:
# NOTE(review): this cell carries a higher execution count than the cells below,
# so the outputs shown may not match a fresh top-to-bottom run — re-run all cells to confirm.
# The sample sheet is read as tab-separated despite its .csv extension.
sample_sheet = os.path.join(home, "Sites/Bio/cellrangersnake/sheets/lena_samples.csv")
# Indexing by 'Sample' allows selecting all libraries of one sample via .loc[sample]
sample_df = pd.read_csv(sample_sheet, sep="\t").set_index('Sample')
sample_df

Unnamed: 0_level_0,sample,fastqs,Run,library_type
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CAS9-Stim1,Cas9_Stim_1_GEX,/fast/groups/ag_schmueck/work/NGSData/Illumina/230114_LenaCas9/fastq,230114,Gene Expression
CAS9-Stim1,Cas9_Stim_1_hTCR,/fast/groups/ag_schmueck/work/NGSData/Illumina/230114_LenaCas9/fastq,230114,TCR
CAS9-Stim2,Cas9_Stim_2_GEX,/fast/groups/ag_schmueck/work/NGSData/Illumina/230114_LenaCas9/fastq,230114,Gene Expression
CAS9-Stim2,Cas9_Stim_2_hTCR,/fast/groups/ag_schmueck/work/NGSData/Illumina/230114_LenaCas9/fastq,230114,TCR
CAS9-Unstim,Cas9_Unstim_GEX,/fast/groups/ag_schmueck/work/NGSData/Illumina/230114_LenaCas9/fastq,230114,Gene Expression
CAS9-Unstim,Cas9_Unstim_hTCR,/fast/groups/ag_schmueck/work/NGSData/Illumina/230114_LenaCas9/fastq,230114,TCR


In [8]:
# NOTE(review): `r` is not referenced by any cell visible here — looks like a leftover
# from inspecting the config; confirm it is unused before removing it.
r = config['cellranger']['Reads']['GEX']
def get_ADT_path(run):
    '''
    builds the relative path of the ADT csv file for the given run
    (the file naming follows the output of make_ADT_files)
    '''
    adt_file = f"ADT_files/ADT_{run}.csv"
    return adt_file


def get_read(_type, config):
    '''
    collects one newline-terminated "<read>-length,<length>" line per read
    that is configured for the library type under config['cellranger']['Reads']
    '''
    read_lengths = config['cellranger']['Reads'][_type]
    read_lines = []
    for read_name, read_length in read_lengths.items():
        # the read name is written in lower case
        read_lines.append(f"{read_name.lower()}-length,{read_length}\n")
    return read_lines

def get_CRargs(config):
    '''
    converts the additional cellranger arguments from the config into sample sheet lines

    Every entry of config['cellranger']['args'] (e.g. "--expect-cells=1000" or
    "--expect-cells 1000") becomes one "key,value" line: "--" is removed and
    "=" as well as blanks are replaced by commas.

    Returns a list of newline-terminated strings. The list is empty if the
    'args' key is missing or holds no entries (e.g. an empty key in the config file).
    '''
    # a missing or empty 'args' entry means "no extra arguments" instead of a crash
    CRargs = config['cellranger'].get('args') or []
    # a single argument given as plain string would otherwise be iterated char by char
    if isinstance(CRargs, str):
        CRargs = [CRargs]
    return [arg.replace("--", "").replace("=", ",").replace(" ", ",") + "\n" for arg in CRargs]


########### IS NOT NEEDED IN PIPELINE ############################
def static_path(file, config):
    '''
    resolves a file given relative to the static folder
    by joining it onto config['paths']['static']
    '''
    static_root = config['paths']['static']
    return os.path.join(static_root, file)


def make_multi_lib_file(sc_df, sample="", filepath="", run_config={}):
    '''
    writes the cellranger multi config csv for one sample and returns its library table

    The file contains a [gene-expression], [feature] and/or [vdj] section
    (depending on the library types found for the sample) followed by the
    [libraries] table.

    Args:
        sc_df: sample DataFrame indexed by sample name, with the columns
            'sample', 'fastqs', 'Run' and 'library_type'
        sample: index label of the sample whose libraries are written
        filepath: path of the csv file to write (an existing file is overwritten)
        run_config: config dict; its 'cellranger' entry is read here and the
            whole dict is handed to static_path, get_read and get_CRargs

    Returns:
        DataFrame with the columns 'fastq_id', 'fastqs' and 'feature_types'
        as written to the [libraries] section

    Raises:
        KeyError: if sample is not in the index of sc_df or a required
            config key is missing
    '''

    # selecting with a list keeps a DataFrame even if the sample has only one library
    libs_df = sc_df.loc[[sample]]
    library_types = libs_df['library_type'].values

    c = run_config['cellranger']
    # write to file
    with open(filepath, 'w') as f:
        if 'Gene Expression' in library_types:
            # write the GEX config
            # (helpers receive run_config, not the notebook-global config)
            f.write('[gene-expression]\n')
            f.write(f"reference,{static_path(c['transcriptome_path'], run_config)}\n")
            f.writelines(get_read('GEX', run_config))
            f.writelines(get_CRargs(run_config))

        if "Antibody Capture" in library_types:
            # write the Feature config
            f.write('\n[feature]\n')
            # get the ADT run from libs_df; .iloc[0] takes the first match by position,
            # because the index holds sample names and not integers
            run = libs_df.loc[libs_df['library_type'] == "Antibody Capture", 'Run'].iloc[0]
            f.write(f"reference,{get_ADT_path(run)}\n")
            f.writelines(get_read("FeatureBarcode", run_config))

        if 'TCR' in library_types:
            # write the VDJ config
            f.write('\n[vdj]\n')
            f.write(f"reference,{static_path(c['VDJ_ref'], run_config)}\n")
            f.writelines(get_read("TCR", run_config))

        # rename the columns to the names used in the [libraries] section
        libs_df = libs_df.rename({
            'sample':'fastq_id',
            'library_type':'feature_types'
        }, axis=1).loc[:, ['fastq_id', 'fastqs', 'feature_types']]
        # write the libraries
        f.write('\n[libraries]\n')
        libs_df.to_csv(f, index=False)
    show_output(f"Sample sheet written to {filepath}", color="success")
    return libs_df

### Test the CRmulti sample sheet writer

In [9]:
# Write the sample sheet for one sample into the test folder ...
test_output = os.path.join(home, "Sites/Bio/cellrangersnake/test/multi_lena.csv")
libs_df = make_multi_lib_file(sample_df, sample="CAS9-Stim2", run_config=config, filepath=test_output)
# ... and show the returned library table
libs_df

                          sample  \
Sample                             
CAS9-Stim2       Cas9_Stim_2_GEX   
CAS9-Stim2      Cas9_Stim_2_hTCR   
CAS9-Stim2  Cas9_Stim_2_Antibody   

                                                                          fastqs  \
Sample                                                                             
CAS9-Stim2  /fast/groups/ag_schmueck/work/NGSData/Illumina/230114_LenaCas9/fastq   
CAS9-Stim2  /fast/groups/ag_schmueck/work/NGSData/Illumina/230114_LenaCas9/fastq   
CAS9-Stim2  /fast/groups/ag_schmueck/work/NGSData/Illumina/230114_LenaCas9/fastq   

               Run      library_type  
Sample                                
CAS9-Stim2  230114   Gene Expression  
CAS9-Stim2  230114               TCR  
CAS9-Stim2  221018  Antibody Capture  
[1;36;1mSample sheet written to /Users/martinszyska/Sites/Bio/cellrangersnake/test/multi_lena.csv[0m


Unnamed: 0_level_0,fastq_id,fastqs,feature_types
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CAS9-Stim2,Cas9_Stim_2_GEX,/fast/groups/ag_schmueck/work/NGSData/Illumina/230114_LenaCas9/fastq,Gene Expression
CAS9-Stim2,Cas9_Stim_2_hTCR,/fast/groups/ag_schmueck/work/NGSData/Illumina/230114_LenaCas9/fastq,TCR
CAS9-Stim2,Cas9_Stim_2_Antibody,/fast/groups/ag_schmueck/work/NGSData/Illumina/230114_LenaCas9/fastq,Antibody Capture
