In [None]:
import os
import shlex
import shutil
import subprocess
import sys

import joblib
import numpy as np
import pandas as pd

In [None]:
def shell_do(command, log=False, return_log=False):
    """Run a command line as a subprocess (without a shell) and optionally report its stdout.

    Parameters
    ----------
    command : str
        Command line to execute. It is tokenised with ``shlex.split`` so that
        quoted arguments (for example paths containing spaces) stay together
        as a single argument.
    log : bool, default False
        If True, print the captured stdout once the command has finished.
    return_log : bool, default False
        If True, return the captured stdout as a string.

    Returns
    -------
    str or None
        The UTF-8 decoded stdout when ``return_log`` is True, otherwise None.

    Raises
    ------
    ValueError
        If ``command`` contains unbalanced quotes (raised by ``shlex.split``).
    """
    # Echo the whitespace-normalised command on stderr so it does not mix with stdout logs.
    print(f'Executing: {(" ").join(command.split())}', file=sys.stderr)

    # shlex.split honours shell-style quoting; a plain str.split would tear quoted arguments apart.
    res = subprocess.run(shlex.split(command), stdout=subprocess.PIPE)

    # Report failures instead of silently dropping the exit status.
    if res.returncode != 0:
        print(f'Command exited with status {res.returncode}', file=sys.stderr)

    # Decode only when the output is actually requested.
    if log or return_log:
        output = res.stdout.decode('utf-8')
        if log:
            print(output)
        if return_log:
            return output

In [None]:
def run_qc(path_dict, bash_path, env, swarm_path='qc_pipeline.swarm', mem_gb=200,
           walltime='10-00:00:00', modules='python/3.7,plink/1.9,GCTA'):
    """Write a swarm file with one QC command per cohort and submit it with ``swarm``.

    Parameters
    ----------
    path_dict : dict
        Maps each cohort name to a dict with a ``'geno'`` entry (passed to the
        bash script as ``-i``) and an ``'out'`` entry (passed as ``-o``).
    bash_path : str
        Path of the QC bash script that each swarm line runs.
    env : str
        Value passed to the bash script as ``-e``.
    swarm_path : str, default 'qc_pipeline.swarm'
        File the swarm commands are written to; it is overwritten if it exists.
    mem_gb : int, default 200
        Value passed to ``swarm -g``.
    walltime : str, default '10-00:00:00'
        Value passed to ``swarm --time``.
    modules : str, default 'python/3.7,plink/1.9,GCTA'
        Comma-separated list passed to ``swarm --module``.

    Returns
    -------
    None
    """
    # One line per cohort; the with-block closes the file, so no explicit close() is needed.
    with open(swarm_path, 'w') as f:
        for cohort, paths in path_dict.items():
            f.write(f"bash {bash_path} -i {paths['geno']} -c {cohort} -o {paths['out']} -e {env}\n")

    # queue swarm job
    shell_do(f'swarm -f {swarm_path} -g {mem_gb} --time {walltime} --module {modules}')

In [None]:
# root directory of the project (replace the placeholder before running)
wd = 'insert_path'

# QC bash script, located inside the project directory
bash_path = wd + '/processing/qc/qc_pipeline.sh'

# conda environment (needs jupyter and GenoTools [https://github.com/dvitale199/GenoTools] installed)
env = 'genotools'

# one (initially empty) entry per cohort
path_dict = {cohort: {} for cohort in ('JG', 'ADNI', 'FTD', 'LBD', 'ALS', 'PD', 'ADSP')}

In [None]:
# For every cohort: *_geno_path is the input handed to the QC script,
# *_out_path is the output prefix ('.fam' / '.bim' are appended when results are read).

# joint genotyping (ROSMAP / Mayo RNAseq / MSBB)
jg_data_dir = wd + '/ROSMAPMayoRNAseqMSBB/joint_genotyping'
jg_geno_path = jg_data_dir + '/jointGenotypingROSMAPMayoRNAseqMSBB_pheno_new_ids'
jg_out_path = jg_data_dir + '/qc/jointGenotypingROSMAPMayoRNAseqMSBB_pheno_qc'

# ADNI
adni_data_dir = wd + '/ADNI/genotypes'
adni_geno_path = adni_data_dir + '/ADNI_all_pheno'
adni_out_path = adni_data_dir + '/qc/ADNI_all_pheno_qc'

# FTD
ftd_data_dir = wd + '/FTD_LBD_ALS/ftd_genotypes'
ftd_geno_path = ftd_data_dir + '/merged_FTD_age_filter'
ftd_out_path = ftd_data_dir + '/qc/merged_FTD_qc'

# LBD
lbd_data_dir = wd + '/FTD_LBD_ALS/lbd_genotypes'
lbd_geno_path = lbd_data_dir + '/merged_LBD_age_filter'
lbd_out_path = lbd_data_dir + '/qc/merged_LBD_qc'

# ALS
als_data_dir = wd + '/FTD_LBD_ALS/als_genotypes'
als_geno_path = als_data_dir + '/merged_ALS_age_filter'
als_out_path = als_data_dir + '/qc/merged_ALS_qc'

# AMP PD (no separate data-dir variable; both paths hang directly off the project root)
pd_geno_path = wd + '/AMP_PD/amp_pd_pheno'
pd_out_path = wd + '/AMP_PD/qc/amp_pd_pheno_qc'

# ADSP
adsp_data_dir = wd + '/ADSP'
adsp_geno_path = adsp_data_dir + '/adsp_formatted_normalized_pheno_new_ids'
adsp_out_path = adsp_data_dir + '/qc/adsp_formatted_normalized_pheno_qc'

In [None]:
# register each cohort's (input, output) paths in path_dict
cohort_paths = {
    'JG': (jg_geno_path, jg_out_path),
    'ADNI': (adni_geno_path, adni_out_path),
    'FTD': (ftd_geno_path, ftd_out_path),
    'LBD': (lbd_geno_path, lbd_out_path),
    'ALS': (als_geno_path, als_out_path),
    'PD': (pd_geno_path, pd_out_path),
    'ADSP': (adsp_geno_path, adsp_out_path),
}

for cohort, (geno_path, out_path) in cohort_paths.items():
    # plain item assignment keeps any other keys already stored for the cohort
    path_dict[cohort]['geno'] = geno_path
    path_dict[cohort]['out'] = out_path

In [None]:
# write the swarm file and queue the QC job for every cohort in path_dict
run_qc(path_dict=path_dict, bash_path=bash_path, env=env)

In [None]:
# see number of cases/variants after QC is complete
for cohort in path_dict:
    print(f'\n{cohort}\n')

    out_prefix = path_dict[cohort]['out']
    fam_path = f'{out_prefix}.fam'
    bim_path = f'{out_prefix}.bim'

    # The QC job is only queued by run_qc, so a cohort's output may not be written yet;
    # report and move on instead of aborting the whole summary on the first missing file.
    if not (os.path.isfile(fam_path) and os.path.isfile(bim_path)):
        print(f'QC output not found for {cohort} (expected {fam_path} and {bim_path}); skipping')
        continue

    # sample table: one row per sample, whitespace-delimited, no header line
    # (raw string: '\s' inside a normal string literal is an invalid escape sequence)
    fam = pd.read_csv(fam_path, sep=r'\s+', header=None)
    fam.columns = ['FID','IID','PAT','MAT','SEX','PHENO']
    print(fam.head())
    print(fam.shape)
    print(fam['PHENO'].value_counts())

    # variant table: one row per variant, same whitespace-delimited layout
    bim = pd.read_csv(bim_path, sep=r'\s+', header=None)
    bim.columns = ['CHR','ID','LOC','BP','ALT','REF']
    print(bim.head())
    print(bim.shape)
    print(bim['CHR'].value_counts())