## 1 Functions and module

In [2]:
import pandas as pd
import gzip
import glob
import os
import subprocess

In [5]:
def get_ngs_dir(config_path,variable):
    """
    Returns the value of NGS_DIR defined in the given config.sh file,
    or None if it's not found or there's an error.
    """
    # Run a subshell that sources config.sh and echoes $NGS_DIR
    command = f"bash -c 'source {config_path} && echo ${variable}'"
    try:
        result = subprocess.run(
            command, shell=True, check=True, capture_output=True, text=True
        )
        ngs_dir = result.stdout.strip()
        # If NGS_DIR is not set or empty, return None
        return ngs_dir if ngs_dir else None
    except subprocess.CalledProcessError:
        return None


## 2 Input and output address

In [6]:
parental_address = '/labs/mwinslow/Haiqing/UltraSeq_Projects/project1'

## 3 Load config.sh

In [7]:
temp = 'NGS_DIR'
ngs_dir = get_ngs_dir(f"{parental_address}/config.sh",temp)
print(f"{temp}= {ngs_dir}")


NGS_DIR= /labs/mwinslow/Haiqing/NGS_Raw_data/LA92-250304


In [8]:
temp = 'PROJECT_DIR'
project_dir = get_ngs_dir(f"{parental_address}/config.sh",temp)
print(f"{temp}= {project_dir}")


PROJECT_DIR= /labs/mwinslow/Haiqing/UltraSeq_Projects/project1


In [9]:
fastq_dir = ngs_dir+"/01.RawData"
print(fastq_dir)

output_prefix1 = project_dir+"/01_data_collection/data"
print(output_prefix1)

/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-250304/01.RawData
/labs/mwinslow/Haiqing/UltraSeq_Projects/project1/01_data_collection/data


## 4 Generate file with necessary raw data address

In [10]:
# find all the fastq.gz files 
temp_pattern = '/**/*fq.gz' # When recursive is set, ** followed by a path separator matches 0 or more subdirectories.
fastq_address_list = glob.glob(fastq_dir+temp_pattern, recursive=True)

In [11]:
fastq_address_list

['/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-250304/01.RawData/LA92_10/LA92_10_CKDL250005148-1A_22MKWKLT4_L5_1.fq.gz',
 '/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-250304/01.RawData/LA92_10/LA92_10_CKDL250005148-1A_22MKWKLT4_L5_2.fq.gz',
 '/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-250304/01.RawData/LA92_26/LA92_26_CKDL250005148-1A_22MKWKLT4_L5_2.fq.gz',
 '/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-250304/01.RawData/LA92_26/LA92_26_CKDL250005148-1A_22MKWKLT4_L5_1.fq.gz',
 '/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-250304/01.RawData/LA92_11/LA92_11_CKDL250005148-1A_22MKWKLT4_L5_1.fq.gz',
 '/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-250304/01.RawData/LA92_11/LA92_11_CKDL250005148-1A_22MKWKLT4_L5_2.fq.gz',
 '/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-250304/01.RawData/LA92_25/LA92_25_CKDL250005148-1A_22MKWKLT4_L5_2.fq.gz',
 '/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-250304/01.RawData/LA92_25/LA92_25_CKDL250005148-1A_22MKWKLT4_L5_1.fq.gz',
 '/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-250304/01.RawData/LA9

In [12]:
read1_address = [x for x in fastq_address_list if '_1.fq' in x]
read2_address = [x for x in fastq_address_list if '_2.fq' in x]

In [13]:
temp_dic1 = {}
for x in read1_address:
    temp = x.split('/')[-2]
    if temp in temp_dic1.keys():
        temp_dic1[temp].append(x)
    else:
        temp_dic1[temp] = [x]

In [14]:
temp_dic2 = {}
for x in read2_address:
    temp = x.split('/')[-2]
    if temp in temp_dic2.keys():
        temp_dic2[temp].append(x)
    else:
        temp_dic2[temp] = [x]

In [15]:
read1_df = pd.DataFrame({'Sample_ID':temp_dic1.keys()})
read2_df = pd.DataFrame({'Sample_ID':temp_dic2.keys()})

In [16]:
read1_df['Address'] = [' '.join(sorted(x)) for x in temp_dic1.values()]
read2_df['Address'] = [' '.join(sorted(x)) for x in temp_dic2.values()]

In [17]:
read_df = pd.merge(read1_df,read2_df, on ='Sample_ID',
                   how ='outer', suffixes = ('_r1', '_r2'))

In [18]:
read_df.shape

(30, 3)

In [19]:
read_df.head()

Unnamed: 0,Sample_ID,Address_r1,Address_r2
0,LA92_10,/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-25030...,/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-25030...
1,LA92_26,/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-25030...,/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-25030...
2,LA92_11,/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-25030...,/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-25030...
3,LA92_25,/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-25030...,/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-25030...
4,LA92_17,/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-25030...,/labs/mwinslow/Haiqing/NGS_Raw_data/LA92-25030...


In [20]:
# filter data
read_df = read_df[read_df.Sample_ID.str.contains('LA92')]

In [21]:
print(f'There are {read_df.shape[0]} samples for the Kat7 dual-guide project')
# note from the library pooling form, samples LA92_09 and LA92_13 are not included.

There are 27 samples for the Kat7 dual-guide project


## 5 Output data

In [22]:
temp_o = output_prefix1 +"/NGS_address"

In [23]:
file_a = open(temp_o, 'w')
for index, row in read_df.iterrows():
    t1 = row['Address_r1']
    t2 = row['Address_r2']
    t3 = row['Sample_ID']
    temp_s = ','.join([t1,t2,t3])+'\n'
    file_a.write(temp_s)
file_a.close()