# SCRIPT TO PERFORM QUALITY CONTROL ON WGS DATA

## This script should only be run once

#### Initialization
##### Load packages

In [None]:
import glob
import json
import subprocess
import os
from math import ceil

In [None]:
# Task: bgen_qc
def bgen_qc(geno_bgen_file, geno_sample_file, ref_first=True, keep_file=None, extract_file=None, plink2_options="", file_prefix="WGS_test"):
    """Submit a plink2 QC command for one BGEN file via ``dx run swiss-army-knife``.

    The plink2 command applies fixed filters (``--mac 10 --maf 0.0001
    --hwe 1e-15 --mind 0.1 --geno 0.1``) and asks for the passing variant and
    sample lists.  The command is joined into a single string and handed to
    the ``swiss-army-knife`` app as its ``cmd`` input.  Everything ``dx``
    prints (stdout and stderr) is captured in ``{file_prefix}_dx_run.log`` in
    the current working directory.

    Args:
        geno_bgen_file: Path of the BGEN file, inserted verbatim into the
            plink2 command string.  Presumably this string is interpreted by
            a shell on the worker, so spaces and other special characters
            must already be escaped by the caller -- TODO confirm.
        geno_sample_file: Path of the matching ``.sample`` file (same
            escaping rules as ``geno_bgen_file``).
        ref_first: If true, pass ``ref-first`` as the ``--bgen`` REF/ALT
            mode; otherwise pass ``ref-last``.
        keep_file: Optional file passed to ``--keep``.
        extract_file: Optional file passed to ``--extract``.
        plink2_options: Extra plink2 options as one whitespace-separated
            string; appended after all other options.
        file_prefix: Prefix for the plink2 output (``{file_prefix}_qc_pass``),
            the log file and the returned paths.

    Returns:
        dict with the keys ``qc_pass_snplist``, ``qc_pass_id``, ``mindrem_id``
        and ``qc_pass_log``, each a file name built from ``file_prefix``.
        NOTE(review): these are plain relative names; only the ``mindrem_id``
        placeholder is created locally by this function.

    Raises:
        subprocess.CalledProcessError: If ``dx`` exits with a non-zero
            status.  The captured log is printed before re-raising.
        FileNotFoundError: If the ``dx`` executable is not on ``PATH``.
    """
    # Derived from file_prefix so that the names plink2 writes match the
    # paths returned below, and so that runs for different input files do
    # not all write to the same output name.
    out_prefix = f"{file_prefix}_qc_pass"

    # Prepare the plink2 command
    plink_cmd = [
        "plink2",
        "--bgen", geno_bgen_file,
        "ref-first" if ref_first else "ref-last",
        "--sample", geno_sample_file,
        "--out", out_prefix,
        "--mac", "10",
        "--maf", "0.0001",
        "--hwe", "1e-15",
        "--mind", "0.1",
        "--geno", "0.1",
        "--write-snplist",
        "--write-samples",
        "--no-id-header",
    ]

    if keep_file:
        plink_cmd.extend(["--keep", keep_file])
    if extract_file:
        plink_cmd.extend(["--extract", extract_file])
    if plink2_options:
        plink_cmd.extend(plink2_options.split())

    # The app takes the whole command as one string input.
    run_plink = ' '.join(plink_cmd)

    # Invoke dx with an argument list (no local shell), so characters such as
    # quotes, `$` or backticks inside the plink2 command reach the app
    # unchanged instead of being interpreted locally.
    log_file = f"{file_prefix}_dx_run.log"
    dx_argv = [
        "dx", "run", "swiss-army-knife",
        f"-icmd={run_plink}",
        "--yes",
        "--instance-type", "mem1_ssd1_v2_x16",
    ]

    # Execute the dx run command, sending stdout and stderr to the log file
    try:
        with open(log_file, 'w') as log:
            subprocess.run(dx_argv, stdout=log, stderr=subprocess.STDOUT, check=True)
    except subprocess.CalledProcessError:
        with open(log_file, 'r') as log:
            print(log.read())  # Print the log to see what went wrong
        raise  # Re-raise the original error after logging

    # Generate an (empty) placeholder for the mindrem id file so that the
    # returned path always exists locally.
    with open(f"{out_prefix}.mindrem.id", 'w'):
        pass

    return {
        "qc_pass_snplist": f"{out_prefix}.snplist",
        "qc_pass_id": f"{out_prefix}.id",
        "mindrem_id": f"{out_prefix}.mindrem.id",
        "qc_pass_log": f"{out_prefix}.log"
    }


In [None]:
# Task: concat_qc_pass_files
def concat_qc_pass_files(qc_pass_snplists, qc_mindrem_ids, output_prefix="concatenated_qc_pass"):
    """Merge per-file QC outputs into one sorted SNP list and one sorted ID list.

    The lines of all ``qc_pass_snplists`` are combined and sorted with
    ``sort -k1,1n`` into ``{output_prefix}.snplist``.  The lines of all
    ``qc_mindrem_ids`` are combined and sorted with plain ``sort`` into
    ``{output_prefix}.mindrem.id``.

    Args:
        qc_pass_snplists: Paths of the SNP list files to merge.
        qc_mindrem_ids: Paths of the removed-sample ID files to merge.
        output_prefix: Prefix of the two output files.

    Returns:
        dict with the keys ``concatenated_qc_pass_snplist`` and
        ``concatenated_mindrem_id`` holding the two output paths.

    Raises:
        subprocess.CalledProcessError: If ``sort`` fails, e.g. because an
            input file does not exist.  Failing here avoids silently writing
            a truncated merged list.
    """
    snplist_path = f"{output_prefix}.snplist"
    mindrem_path = f"{output_prefix}.mindrem.id"

    merge_jobs = (
        (snplist_path, qc_pass_snplists, ["-k1,1n"]),
        (mindrem_path, qc_mindrem_ids, []),
    )
    for merged_path, input_paths, sort_options in merge_jobs:
        input_paths = list(input_paths)
        with open(merged_path, 'w') as outfile:
            # With no inputs `sort` would block reading stdin; the freshly
            # opened (empty) output file is already the correct result.
            if not input_paths:
                continue
            # `sort` reads all the named files itself, so no `cat` pipeline
            # or shell is needed and file names with spaces are handled.
            # `--` keeps names starting with "-" from being read as options.
            subprocess.run(
                ["sort", *sort_options, "--", *input_paths],
                stdout=outfile,
                check=True,
            )

    return {
        "concatenated_qc_pass_snplist": snplist_path,
        "concatenated_mindrem_id": mindrem_path
    }

In [None]:
# Workflow: bgens_qc
def bgens_qc(geno_bgen_files, geno_sample_files, ref_first=True, keep_file=None, extract_files=None, plink2_options="", output_prefix=""):
    """Run ``bgen_qc`` on every BGEN file and merge the per-file results.

    Each BGEN file is paired with the sample file at the same position.  The
    per-file prefix is the BGEN file's base name without its extension.  The
    resulting SNP lists and removed-sample ID files are then merged with
    ``concat_qc_pass_files``.

    Args:
        geno_bgen_files: Sequence of BGEN file paths.
        geno_sample_files: Sequence of sample file paths, one per BGEN file.
        ref_first: Forwarded to ``bgen_qc``.
        keep_file: Forwarded to ``bgen_qc`` for every file.
        extract_files: Optional sequence of extract files matched to the BGEN
            files by position; BGEN files beyond its length get no extract
            file.  ``None`` means no extract files at all.
        plink2_options: Forwarded to ``bgen_qc``.
        output_prefix: Prefix of the merged output files.

    Returns:
        dict with ``concat_qc_pass_snplist`` and ``concat_qc_mindrem_id``
        (paths of the merged files) and ``qc_pass_logs`` (list of per-file
        plink2 log paths as reported by ``bgen_qc``).

    Raises:
        ValueError: If the numbers of BGEN and sample files differ.  This is
            checked before any job is started.
    """
    if len(geno_bgen_files) != len(geno_sample_files):
        raise ValueError(
            f"geno_bgen_files has {len(geno_bgen_files)} entries but "
            f"geno_sample_files has {len(geno_sample_files)}; they must match one-to-one"
        )
    if extract_files is None:
        extract_files = []

    # Run the bgen_qc task in a scatter fashion
    bgen_qc_outputs = []
    for i, (bgen_path, sample_path) in enumerate(zip(geno_bgen_files, geno_sample_files)):
        file_prefix = os.path.splitext(os.path.basename(bgen_path))[0]
        extract_file_i = extract_files[i] if i < len(extract_files) else None
        bgen_qc_outputs.append(
            bgen_qc(
                geno_bgen_file=bgen_path,
                geno_sample_file=sample_path,
                ref_first=ref_first,
                keep_file=keep_file,
                extract_file=extract_file_i,
                plink2_options=plink2_options,
                file_prefix=file_prefix,
            )
        )

    # Run the concat_qc_pass_files task on the collected per-file outputs
    concat_output = concat_qc_pass_files(
        qc_pass_snplists=[output['qc_pass_snplist'] for output in bgen_qc_outputs],
        qc_mindrem_ids=[output['mindrem_id'] for output in bgen_qc_outputs],
        output_prefix=output_prefix,
    )

    return {
        "concat_qc_pass_snplist": concat_output['concatenated_qc_pass_snplist'],
        "concat_qc_mindrem_id": concat_output['concatenated_mindrem_id'],
        "qc_pass_logs": [output['qc_pass_log'] for output in bgen_qc_outputs]
    }

In [None]:
# Example usage:
# The paths below end up verbatim inside the plink2 command string built by
# bgen_qc, so spaces and parentheses are backslash-escaped. Raw strings keep
# those backslashes literal without relying on invalid escape sequences.
path_to_data = r'/mnt/project/Bulk/Imputation/Imputation\ from\ genotype\ \(GEL\)/'

# One BGEN / sample file pair per autosome (chromosomes 1-22).
geno_bgen_files = [f"{path_to_data}ukb21008_c{i}_b0_v1.bgen" for i in range(1, 23)]
geno_sample_files = [f"/mnt/project/Test_Javier/Data/GEL_imputed_sample_files_fixed/ukb21008_c{i}_b0_v1.sample" for i in range(1, 23)]

#extract_files = ["extract1.txt", "extract2.txt"]  # Optional extract files
output_prefix = "test"  # Output file prefix


In [None]:
# Launch the QC workflow over all configured BGEN / sample file pairs.
# No keep file and no extra plink2 options are supplied for this run.
workflow_outputs = bgens_qc(
    geno_bgen_files,
    geno_sample_files,
    ref_first=True,
    keep_file=None,
    plink2_options="",
    output_prefix=output_prefix,
)

# Show the dictionary of output paths returned by the workflow
print(workflow_outputs)