# In [None]:
import glob
import os
import subprocess
import shlex

# Directory layout: every output location lives under the raw-data folder.
input_folder = "/home/masterco20/rna"
output_folder = input_folder + "/Trim_Reads"        # trimmed FASTQ files
logs_folder = input_folder + "/Trim_Logs"           # per-sample logs and fastp reports
multiqc_output = input_folder + "/MultiQC_Report"   # aggregated MultiQC report

# Path reserved for a script-wide log.
# NOTE(review): nothing in the code visible here writes to it.
script_log = logs_folder + "/script_execution.log"

# Make sure each output directory exists; directories already present are
# left as they are.
for _directory in (output_folder, logs_folder, multiqc_output):
    os.makedirs(_directory, exist_ok=True)

# Trim every paired-end sample found in the input folder with fastp.
#
# Only the read-1 files (*_1.fastq.gz) are globbed: each one stands for a
# whole pair. Globbing every *.fastq.gz would also pick up the read-2 files,
# and for those the mate lookup would resolve to the file itself, so fastp
# would be given the same file as both -i and -I.
r1_suffix = "_1.fastq.gz"
r2_suffix = "_2.fastq.gz"

for read1 in sorted(glob.glob(f"{input_folder}/*{r1_suffix}")):
    # Swap only the trailing suffix so that a "_1.fastq.gz" occurring earlier
    # in the path (e.g. in a directory name) is left untouched.
    read2 = read1[: -len(r1_suffix)] + r2_suffix

    # Sample name is the file name minus the complete "_1.fastq.gz" suffix
    # (os.path.splitext would only drop ".gz").
    sample_name = os.path.basename(read1)[: -len(r1_suffix)]

    # Log file for this sample; receives fastp's stdout and stderr.
    sample_log = f"{logs_folder}/{sample_name}_processing.log"

    # Argument list is passed straight to the process (no shell, no string
    # splitting), so paths containing spaces stay intact.
    cmd = [
        "fastp",
        "-i", read1,
        "-I", read2,
        "-o", f"{output_folder}/{sample_name}_trim_R1.fastq.gz",
        "-O", f"{output_folder}/{sample_name}_trim_R2.fastq.gz",
        "-w", "4",
        "--detect_adapter_for_pe",
        "-j", f"{logs_folder}/{sample_name}_fastp.json",
        "-h", f"{logs_folder}/{sample_name}_fastp_report.html",
    ]

    with open(sample_log, "w") as f:
        if not os.path.isfile(read2):
            # Without its mate the sample cannot be trimmed as a pair; record
            # why it was skipped and move on to the next sample.
            f.write(f"Skipped {sample_name}: mate file not found: {read2}\n")
            print(f"Skipping {sample_name}: mate file not found: {read2}")
            continue
        result = subprocess.run(cmd, stdout=f, stderr=subprocess.STDOUT)

    # Report failures instead of ignoring them, but keep processing the
    # remaining samples.
    if result.returncode != 0:
        print(
            f"fastp failed for {sample_name} "
            f"(exit code {result.returncode}); see {sample_log}"
        )

# Run MultiQC restricted to the fastp module (-m fastp) over the logs folder
# and write the combined report into multiqc_output.
# The command is given as an argument list so the folder paths are passed
# through verbatim, whatever characters they contain.
cmd_multiqc = ["multiqc", "-m", "fastp", "-o", multiqc_output, logs_folder]
multiqc_log = f"{logs_folder}/multiqc_execution.log"

with open(multiqc_log, "w") as f:
    multiqc_result = subprocess.run(cmd_multiqc, stdout=f, stderr=subprocess.STDOUT)

# Report a failed run instead of ending silently without a report.
if multiqc_result.returncode != 0:
    print(
        f"multiqc failed (exit code {multiqc_result.returncode}); "
        f"see {multiqc_log}"
    )