## 1 Modules and functions

In [1]:
import pandas as pd
import gzip
import glob
import os
import subprocess

In [2]:
def get_ngs_dir(config_path,variable):
    """
    Return the value of a shell variable defined in a config.sh-style file.

    Despite the function name, any variable can be queried (NGS_DIR,
    PROJECT_DIR, ...): the file is sourced in a bash subprocess and the
    value of ``variable`` after sourcing is reported.

    Parameters
    ----------
    config_path : str or path-like
        Path of the shell file to source. It is handed to bash as a
        positional argument, so spaces and shell metacharacters in the path
        are safe.
    variable : str
        Name of the shell variable to read, e.g. ``"NGS_DIR"``.

    Returns
    -------
    str or None
        The value with surrounding whitespace stripped. None is returned if
        the variable is unset or empty, if ``variable`` is not a plain shell
        identifier, or if bash cannot be started or exits with an error
        (for example because the file does not exist).
    """
    # Only plain ASCII identifiers are valid shell variable names; rejecting
    # everything else up front keeps arbitrary text out of the expansion.
    if not (isinstance(variable, str) and variable.isascii() and variable.isidentifier()):
        return None

    # The script text is constant. The path arrives as $1 and the variable
    # name as $2; "${!2}" is bash indirect expansion (the value of the
    # variable whose name is stored in $2). Quoting the expansion keeps the
    # value intact instead of word-splitting or glob-expanding it.
    script = 'source "$1" && printf "%s\\n" "${!2}"'
    try:
        result = subprocess.run(
            ["bash", "-c", script, "bash", str(config_path), variable],
            check=True, capture_output=True, text=True
        )
    except (subprocess.CalledProcessError, OSError):
        # Non-zero exit (e.g. missing file) or bash not available.
        return None

    value = result.stdout.strip()
    # An unset or empty variable is reported as None
    return value if value else None


## 2 Input and output address

In [22]:
# Project directory; config.sh is read from directly inside it.
# NOTE(review): absolute, machine-specific path — edit before running elsewhere.
parental_address = '/labs/mwinslow/Haiqing/Raw_data_processing/Cas12a_3guide_example'

## 3 Load config.sh

In [27]:
# Read NGS_DIR from the project's config.sh.
temp = 'NGS_DIR'
ngs_dir = get_ngs_dir(f"{parental_address}/config.sh",temp)
# get_ngs_dir returns None when the lookup fails. Stop here with a clear
# message rather than letting a later string concatenation raise TypeError.
if ngs_dir is None:
    raise RuntimeError(
        f"Could not read {temp} from {parental_address}/config.sh "
        "(file missing, sourcing failed, or variable unset/empty)"
    )
print(f"{temp}= {ngs_dir}")


NGS_DIR= /labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas12a_241121


In [28]:
# Read PROJECT_DIR from the project's config.sh.
temp = 'PROJECT_DIR'
project_dir = get_ngs_dir(f"{parental_address}/config.sh",temp)
# get_ngs_dir returns None when the lookup fails. Stop here with a clear
# message rather than letting a later string concatenation raise TypeError.
if project_dir is None:
    raise RuntimeError(
        f"Could not read {temp} from {parental_address}/config.sh "
        "(file missing, sourcing failed, or variable unset/empty)"
    )
print(f"{temp}= {project_dir}")


PROJECT_DIR= /labs/mwinslow/Haiqing/Raw_data_processing/Cas12a_3guide_example


In [29]:
# Folder that is searched for FASTQ files: <NGS_DIR>/01.RawData
fastq_dir = "/".join([ngs_dir, "01.RawData"])
print(fastq_dir)

# Folder that receives this notebook's output: <PROJECT_DIR>/data
output_prefix1 = "/".join([project_dir, "data"])
print(output_prefix1)

/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas12a_241121/01.RawData
/labs/mwinslow/Haiqing/Raw_data_processing/Cas12a_3guide_example/data


## 4 Generate a file listing the raw data addresses

In [9]:
# Collect every file below fastq_dir whose name ends in "fq.gz".
# With recursive=True, "**" followed by a path separator matches zero or
# more levels of subdirectories.
temp_pattern = '/**/*fq.gz'
fastq_address_list = list(
    glob.iglob("".join([fastq_dir, temp_pattern]), recursive=True)
)

In [10]:
# Display the discovered FASTQ paths (order is whatever glob returned).
fastq_address_list

['/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas12a_241121/01.RawData/LA87_18/LA87_18_CKDL240038314-1A_22HVCLLT4_L2_2.fq.gz',
 '/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas12a_241121/01.RawData/LA87_18/LA87_18_CKDL240038314-1A_22HVCLLT4_L2_1.fq.gz',
 '/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas12a_241121/01.RawData/LA87_14/LA87_14_CKDL240038314-1A_22HVCLLT4_L2_1.fq.gz',
 '/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas12a_241121/01.RawData/LA87_14/LA87_14_CKDL240038314-1A_22HVCLLT4_L2_2.fq.gz',
 '/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas12a_241121/01.RawData/LA85_07/LA85_07_CKDL240038313-1A_22HVCLLT4_L1_2.fq.gz',
 '/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas12a_241121/01.RawData/LA85_07/LA85_07_CKDL240038313-1A_22HVCLLT4_L1_1.fq.gz',
 '/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas12a_241121/01.RawData/LA87_19/LA87_19_CKDL240038314-1A_22HVCLLT4_L2_2.fq.gz',
 '/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas12a_241121/01.RawData/LA87_19/LA87_19_CKDL240038314-1A_22HVCLLT4_L2_1.fq.gz',


In [11]:
# Split the FASTQ paths into mate 1 / mate 2 by file-name suffix.
# Matching the end of the name (rather than a substring of the whole path)
# keeps a directory or file-name fragment that happens to contain "_1.fq" or
# "_2.fq" from putting a file into the wrong list.
read1_address = [x for x in fastq_address_list if x.endswith('_1.fq.gz')]
read2_address = [x for x in fastq_address_list if x.endswith('_2.fq.gz')]

In [12]:
# Group the read-1 files by sample. The sample ID is taken from the name of
# the folder that directly contains each file.
temp_dic1 = {}
for r1_path in read1_address:
    sample_id = r1_path.split('/')[-2]
    temp_dic1.setdefault(sample_id, []).append(r1_path)

In [13]:
# Group the read-2 files by sample. The sample ID is taken from the name of
# the folder that directly contains each file.
temp_dic2 = {}
for r2_path in read2_address:
    sample_id = r2_path.split('/')[-2]
    temp_dic2.setdefault(sample_id, []).append(r2_path)

In [14]:
# One row per sample, in the order the sample IDs were inserted into each dict.
read1_df = pd.DataFrame({'Sample_ID': list(temp_dic1)})
read2_df = pd.DataFrame({'Sample_ID': list(temp_dic2)})

In [15]:
# For every sample, join its file paths (sorted) into one space-separated
# string. Iterating the dict yields the same order as the Sample_ID column.
read1_df['Address'] = [' '.join(sorted(temp_dic1[sid])) for sid in temp_dic1]
read2_df['Address'] = [' '.join(sorted(temp_dic2[sid])) for sid in temp_dic2]

In [16]:
# Combine both mates per sample. The outer join keeps samples that appear in
# only one of the two tables; the Address columns become Address_r1 and
# Address_r2.
read_df = read1_df.merge(
    read2_df,
    how='outer',
    on='Sample_ID',
    suffixes=('_r1', '_r2'),
)

In [17]:
# Preview the first rows of the merged read-1 / read-2 table.
read_df.head()

Unnamed: 0,Sample_ID,Address_r1,Address_r2
0,LA87_18,/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas1...,/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas1...
1,LA87_14,/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas1...,/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas1...
2,LA85_07,/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas1...,/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas1...
3,LA87_19,/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas1...,/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas1...
4,LA85_21,/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas1...,/labs/mwinslow/Haiqing/NGS_Raw_data/JH_HX_Cas1...


In [34]:
# filter data
# Keep only the samples whose ID contains 'LA87'.
is_la87 = read_df['Sample_ID'].str.contains('LA87')
read_df = read_df[is_la87]

## 5 Output data

In [31]:
# Path of the output file, located inside the output folder.
temp_o = output_prefix1 +"/NGS_address"

In [33]:
# Write one line per sample: "<read-1 paths>,<read-2 paths>,<sample ID>".
#
# The table comes from an outer merge, so a sample that lacks one mate has
# NaN in that column. Check before opening the file so that a bad sample
# list fails with a clear message instead of a TypeError from ','.join
# part-way through, which would leave a truncated output file.
unpaired = read_df.loc[
    read_df[['Address_r1', 'Address_r2']].isna().any(axis=1), 'Sample_ID'
].tolist()
if unpaired:
    raise ValueError(f"Samples missing a read-1 or read-2 file: {unpaired}")

# The context manager closes the file even if a write fails.
with open(temp_o, 'w') as file_a:
    for t1, t2, t3 in zip(read_df['Address_r1'],
                          read_df['Address_r2'],
                          read_df['Sample_ID']):
        file_a.write(','.join([t1, t2, t3]) + '\n')