In [None]:
import pandas as pd
import subprocess
import sys
import numpy as np
import os
import shutil
import joblib

In [None]:
def shell_do(command, log=False, return_log=False):
    """Run a command line without a shell and optionally expose its stdout.

    Parameters
    ----------
    command : str
        Command line to execute. It is tokenised with ``str.split`` and handed
        to ``subprocess.run`` as an argument list, so quoting, wildcards, pipes
        and redirections are NOT interpreted.
    log : bool, default False
        Print the captured stdout when true.
    return_log : bool, default False
        Return the captured stdout (decoded as UTF-8) when true.

    Returns
    -------
    str or None
        The decoded stdout if ``return_log`` is true, otherwise ``None``.
    """
    argv = command.split()
    print(f'Executing: {" ".join(argv)}', file=sys.stderr)

    res = subprocess.run(argv, stdout=subprocess.PIPE)

    # A failing command used to go completely unnoticed. Report it on stderr
    # but keep going, so callers that relied on best-effort execution still work.
    if res.returncode != 0:
        print(f'Warning: command exited with status {res.returncode}: {" ".join(argv)}',
              file=sys.stderr)

    # Decode lazily: the output is only needed when one of the flags asks for it.
    if log or return_log:
        output = res.stdout.decode('utf-8')
        if log:
            print(output)
        if return_log:
            return output
    return None

In [None]:
def vcf_to_pfile(seq_data_files, temp_vcf, out_files):
    """Write and queue a job that concatenates VCFs and converts them to pfiles.

    Two files are created in the current working directory:
    ``vcf_to_pfile.sh`` (the bash script) and ``vcf_to_pfile.swarm`` (a
    one-line file that runs the script). The swarm file is then submitted
    with the ``swarm`` command through ``shell_do``.

    Parameters
    ----------
    seq_data_files : str
        Input VCF path(s); inserted verbatim after ``bcftools concat``.
    temp_vcf : str
        Path of the concatenated VCF; the script removes it at the end.
    out_files : str
        Output prefix passed to ``plink2 --out``.
    """
    # NOTE(review): the script loads plink/2.3-alpha while the swarm command
    # requests plink/2.0_alpha_1_final -- confirm which version is intended.
    script_lines = [
        '#!/usr/bin/env bash\n\n',
        'module load plink/2.3-alpha samtools\n',
        f'bcftools concat {seq_data_files} -o {temp_vcf}\n',
        f'plink2 --vcf {temp_vcf} --allow-extra-chr --double-id --make-pgen --out {out_files}\n',
        f'rm {temp_vcf}',
    ]
    with open('vcf_to_pfile.sh', 'w') as script:
        script.writelines(script_lines)

    # the swarm file holds a single command: run the script written above
    with open('vcf_to_pfile.swarm', 'w') as swarm_file:
        swarm_file.write('bash vcf_to_pfile.sh')

    # submit the job
    shell_do('swarm -f vcf_to_pfile.swarm -g 20 --time 12:00:00 --module plink/2.0_alpha_1_final,samtools')

In [None]:
def pfiles_to_bfiles(pfiles_path, bfiles_path):
    """Convert a plink2 pfile set to plink binary files.

    Runs ``plink2`` on ``pfiles_path`` restricted to ``--chr 1-23`` and writes
    the ``--make-bed`` output under the ``bfiles_path`` prefix.
    """
    plink_parts = (
        f'plink2 --pfile {pfiles_path}',
        '--allow-extra-chr --chr 1-23',
        f'--make-bed --out {bfiles_path}',
    )
    shell_do(' '.join(plink_parts))

In [None]:
def merge_bfiles(bfiles_path, out_path, num_chr):
    """Queue a plink merge of per-chromosome binary filesets.

    Parameters
    ----------
    bfiles_path : str
        Fileset prefix template in which ``*`` stands for the chromosome number.
    out_path : str
        Prefix of the merged output; ``{out_path}.txt`` receives the merge list.
    num_chr : int
        Number of chromosomes; chromosomes ``1..num_chr`` are merged.

    Side effects: rewrites every per-chromosome ``.bim`` through
    ``reformat_bim``, writes ``merge_bfiles.sh`` and ``merge_bfiles.swarm`` in
    the working directory, and submits the swarm file via ``shell_do``.
    """
    # merge list: one fileset prefix per line
    with open(f'{out_path}.txt', 'w') as merge_list:
        for chrom in range(1, num_chr + 1):
            prefix = bfiles_path.replace('*', str(chrom))
            # reformatting bim avoids merge errors
            reformat_bim(f'{prefix}.bim')
            merge_list.write(f'{prefix}\n')

    # bash script that performs the merge
    script_text = ''.join([
        '#!/usr/bin/env bash\n\n',
        'module load plink/1.9\n',
        f'plink --merge-list {out_path}.txt --make-bed --out {out_path}',
    ])
    with open('merge_bfiles.sh', 'w') as script:
        script.write(script_text)

    # swarm file that runs the script
    with open('merge_bfiles.swarm', 'w') as swarm_file:
        swarm_file.write('bash merge_bfiles.sh')

    # submit the job
    shell_do('swarm -f merge_bfiles.swarm -g 1000 --partition largemem -t 64 --time 10-00:00:00 --module plink/1.9')

In [None]:
def read_fam(fam, change_type=True, copy_iid=False):
    """Load a whitespace-delimited, header-less .fam file into a DataFrame.

    Parameters
    ----------
    fam : str or path-like
        Path of the .fam file.
    change_type : bool, default True
        Cast ``FID`` and ``IID`` to ``str`` so they merge easily with other tables.
    copy_iid : bool, default False
        Overwrite ``FID`` with a copy of ``IID``.

    Returns
    -------
    pandas.DataFrame
        Columns ``FID, IID, PAT, MAT, SEX, PHENO``.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence
    # (SyntaxWarning on recent Python versions).
    fam_df = pd.read_csv(fam, sep=r'\s+', header=None)
    fam_df.columns = ['FID', 'IID', 'PAT', 'MAT', 'SEX', 'PHENO']

    # convert to string type for easy merging
    if change_type:
        fam_df['FID'] = fam_df['FID'].astype(str)
        fam_df['IID'] = fam_df['IID'].astype(str)

    # make FID and IID the same
    if copy_iid:
        fam_df['FID'] = fam_df['IID'].copy()

    return fam_df

In [None]:
def reformat_bim(bim_path):
    """Rewrite the variant IDs of a .bim file in place.

    The file is read as six whitespace-separated, header-less columns that
    this code labels ``CHR, ID, LOC, BP, ALT, REF``. The ``ID`` column is
    replaced by ``CHR:BP:REF:ALT`` and the table is written back to the same
    path, tab-separated, without header or index.

    Parameters
    ----------
    bim_path : str or path-like
        Path of the .bim file; it is overwritten.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence
    # (SyntaxWarning on recent Python versions).
    bim = pd.read_csv(bim_path, sep=r'\s+', header=None)
    bim.columns = ['CHR', 'ID', 'LOC', 'BP', 'ALT', 'REF']

    # change to chr:basepair:ref:alt format and write to file
    bim['ID'] = (
        bim['CHR'].astype(str) + ':' + bim['BP'].astype(str)
        + ':' + bim['REF'] + ':' + bim['ALT']
    )
    bim.to_csv(bim_path, sep='\t', index=False, header=False)

In [None]:
def update_ids(geno_path, ids_path):
    """Rename samples of a plink binary fileset with ``plink --update-ids``.

    Writes ``update_ids.sh`` in the working directory and runs it with bash.
    The renamed fileset is written under the ``{geno_path}_new_ids`` prefix.

    Parameters
    ----------
    geno_path : str
        Prefix of the input binary fileset.
    ids_path : str
        File handed to ``--update-ids``.
    """
    script_text = (
        '#!/usr/bin/env bash\n\n'
        'module load plink/1.9\n'
        f'plink --bfile {geno_path} --update-ids {ids_path} --make-bed --out {geno_path}_new_ids'
    )
    with open('update_ids.sh', 'w') as script:
        script.write(script_text)

    # run the script right away (no job queue here)
    shell_do('bash update_ids.sh')

In [None]:
def get_joint_genotyping_phenos(covars_path, geno_path, out_path):
    """Attach sex/phenotype to the joint-genotyping samples and subset the genotypes.

    Steps, in order:
      1. Read the comma-separated covariates; for ``cohort == 'MSBB'`` set
         ``clinAD`` to 1 where ``coglev == 'Dementia'`` and 0 otherwise.
      2. For every cohort except ROSMAP, drop rows without ``wgs_id`` and
         normalise the ID via float -> int -> str. ROSMAP rows are kept as read.
      3. Inner-merge ``{geno_path}.fam`` with the covariates on ``IID`` and
         recode ``female``/``clinAD`` (1 -> 2, anything else -> 1).
      4. Write ``{out_path}_ids.txt``, run ``plink2 --keep`` into ``out_path``,
         then overwrite ``{out_path}.fam`` with the merged table.
      5. Write ``{out_path}_new_ids.txt`` (IDs prefixed with ``JG``) and call
         ``update_ids`` on ``out_path``.

    Parameters
    ----------
    covars_path : str
        CSV with at least ``cohort, coglev, clinAD, wgs_id, female``.
    geno_path : str
        Prefix of the input plink binary fileset.
    out_path : str
        Prefix for all outputs.
    """
    # read covars file
    covars = pd.read_csv(covars_path, sep=',')

    # isolate MSBB to generate phenotype from coglev; .copy() makes the column
    # assignment operate on an independent frame rather than a filtered slice
    is_msbb = covars['cohort'] == 'MSBB'
    msbb_covars = covars[is_msbb].copy()
    msbb_covars['clinAD'] = np.where(msbb_covars['coglev'] == 'Dementia', 1, 0)
    # DataFrame.append no longer exists in pandas >= 2.0; pd.concat is the
    # equivalent and keeps the same row order (MSBB first)
    covars = pd.concat([msbb_covars, covars[~is_msbb]])

    # isolate ROSMAP to change IDs for MSBB and MayoRNA to str
    is_rosmap = covars['cohort'] == 'ROSMAP'
    rosmap_covars = covars[is_rosmap]
    other_covars = covars[~is_rosmap]

    # get rid of NA IDs (non-ROSMAP rows only, as before)
    other_covars = other_covars[other_covars['wgs_id'].notna()].copy()

    # change to str
    other_covars['wgs_id'] = other_covars['wgs_id'].astype(float).astype(int).astype(str)
    covars = pd.concat([other_covars, rosmap_covars])

    # isolate proper covars
    covars_needed = covars.loc[:, ['wgs_id', 'female', 'clinAD']].copy()
    covars_needed.columns = ['IID', 'SEX', 'PHENO']
    covars_needed['PHENO'] = covars_needed['PHENO'].astype(int)

    # read fam file; its SEX/PHENO columns are replaced by the covariate values
    fam = read_fam(f'{geno_path}.fam')
    fam = fam.drop(columns=['SEX', 'PHENO'])

    # merge and switch sex and pheno to plink format
    merged = fam.merge(covars_needed, how='inner', on=['IID'])
    merged['SEX'] = np.where(merged['SEX'] == 1, 2, 1)
    merged['PHENO'] = np.where(merged['PHENO'] == 1, 2, 1)

    # write IDs to txt
    merged[['FID', 'IID']].to_csv(f'{out_path}_ids.txt', sep='\t', index=None, header=None)

    # only keep samples from intersection of covars and IDs
    keep_cmd = f'plink2 --bfile {geno_path} --keep {out_path}_ids.txt --make-bed --out {out_path}'
    shell_do(keep_cmd)

    # write to file (replaces the .fam produced by the plink2 call above)
    merged.to_csv(f'{out_path}.fam', sep='\t', index=None, header=None)

    # new IDs
    merged['FID_new'] = 'JG' + merged['FID'].astype(str)
    merged['IID_new'] = 'JG' + merged['IID'].astype(str)

    # write new IDs to txt
    merged[['FID', 'IID', 'FID_new', 'IID_new']].to_csv(
        f'{out_path}_new_ids.txt', sep='\t', index=None, header=None)

    update_ids(out_path, f'{out_path}_new_ids.txt')

In [None]:
def merge_adni(geno_dir, out_path):
    """Write and queue a two-pass plink merge of the ADNI per-chromosome files.

    The generated ``merge_adni.sh`` does the following:
      1. merges ``{geno_dir}/adni.nov2018.chr{1..22,X}`` into ``out_path``;
      2. re-exports every chromosome into the directory of ``out_path`` while
         excluding the variants listed in ``{out_path}-merge.missnp``
         (presumably produced by the first merge attempt -- confirm);
      3. merges those filtered copies into ``out_path`` with ``--chr 1-23``;
      4. deletes the filtered per-chromosome copies.

    The two merge lists, the script and ``merge_adni.swarm`` are written to the
    working directory; the swarm file is submitted through ``shell_do``.

    Parameters
    ----------
    geno_dir : str
        Directory holding the original per-chromosome filesets.
    out_path : str
        Prefix of the merged output fileset.
    """
    out_dir = os.path.dirname(out_path)

    # chromosomes 1-22 followed by X
    chromosomes = [*range(1, 23), 'X']
    binaries = [f'adni.nov2018.chr{chrom}' for chrom in chromosomes]

    # merge list 1 - original file location
    with open('adni_merge_list1.txt', 'w') as list_original:
        list_original.writelines(f'{geno_dir}/{binary}\n' for binary in binaries)

    # merge list 2 - project file location
    with open('adni_merge_list2.txt', 'w') as list_project:
        list_project.writelines(f'{out_dir}/{binary}\n' for binary in binaries)

    # assemble the bash script line by line
    script_lines = ['#!/usr/bin/env bash\n\n', 'module load plink/1.9\n']

    # first merge from original file location
    script_lines.append(
        f'plink --merge-list adni_merge_list1.txt --allow-extra-chr --make-bed --out {out_path}\n')

    # excluding missnp variants from the individual chromosome files
    script_lines.extend(
        f'plink --bfile {geno_dir}/{binary} --allow-extra-chr --exclude {out_path}-merge.missnp --make-bed --out {out_dir}/{binary}\n'
        for binary in binaries
    )

    # second merge from project file location with excluded snps
    script_lines.append(
        f'plink --merge-list adni_merge_list2.txt --allow-extra-chr --chr 1-23 --make-bed --out {out_path}\n')

    # removing individual chromosome files
    script_lines.append(f'rm {out_dir}/adni.nov2018.chr*')

    with open('merge_adni.sh', 'w') as script:
        script.writelines(script_lines)

    # swarm file that runs the script
    with open('merge_adni.swarm', 'w') as swarm_file:
        swarm_file.write('bash merge_adni.sh')

    # submit the job
    shell_do('swarm -f merge_adni.swarm -g 50 --time 12:00:00 --module plink/1.9')

In [None]:
def get_adni_phenos(covars_path, geno_path, out_path):
    """Assign AD/control phenotypes to ADNI samples and subset the genotypes.

    Covariate columns ``PTID`` and ``DX_bl`` are renamed to ``IID`` and
    ``Diagnosis``. Samples are inner-merged with ``{geno_path}.fam`` on
    ``IID``; only ``CN`` and ``AD`` diagnoses are kept, ``PHENO`` becomes 2 for
    ``AD`` and 1 otherwise, and controls with ``AGE`` below 60 are removed.

    Outputs: ``{out_path}_ids.txt``, the ``plink2 --keep`` fileset at
    ``out_path``, and ``{out_path}.fam`` overwritten with the merged table.

    Parameters
    ----------
    covars_path : str
        CSV containing ``PTID``, ``DX_bl`` and ``AGE``.
    geno_path : str
        Prefix of the input plink binary fileset.
    out_path : str
        Prefix for all outputs.
    """
    # covariates, limited to the columns used below
    covars = pd.read_csv(covars_path, sep=',').rename(
        columns={'PTID': 'IID', 'DX_bl': 'Diagnosis'})
    covars_needed = covars[['IID', 'Diagnosis', 'AGE']]

    fam = read_fam(f'{geno_path}.fam', True)

    # join on IID, then collapse exact duplicate rows
    samples = fam.merge(covars_needed, how='inner', on=['IID'])
    samples = samples.drop_duplicates(ignore_index=True)

    # keep CN / AD only and encode the phenotype the plink way (2 = AD, 1 = CN)
    samples = samples[samples['Diagnosis'].isin(['CN', 'AD'])]
    samples = samples.assign(PHENO=np.where(samples['Diagnosis'] == 'AD', 2, 1))

    # controls younger than 60 are excluded
    young_control = (samples['AGE'] < 60) & (samples['PHENO'] == 1)
    samples = samples[~young_control]

    # back to the six .fam columns
    samples = samples.drop(columns=['Diagnosis', 'AGE'])

    # sample list for plink
    samples[['FID', 'IID']].to_csv(f'{out_path}_ids.txt', sep='\t', index=None, header=None)

    # restrict the genotypes to the retained samples
    shell_do(f'plink2 --bfile {geno_path} --keep {out_path}_ids.txt --make-bed --out {out_path}')

    # replace the .fam written by plink2 with the phenotype-annotated one
    samples.to_csv(f'{out_path}.fam', sep='\t', index=None, header=None)

In [None]:
def get_ftd_lbd_genos(geno_dir, ftd_out_dir, lbd_out_dir):
    """Copy the merged FTD and LBD genotype files into the project tree.

    Writes ``get_ftd_lbd.sh`` in the working directory and runs it with bash.
    The script copies ``{geno_dir}/merged_FTD.*`` into ``ftd_out_dir`` and
    ``{geno_dir}/merged_LBD.*`` into ``lbd_out_dir``.
    """
    copy_lines = [
        '#!/usr/bin/env bash\n\n',
        f'cp {geno_dir}/merged_FTD.* {ftd_out_dir}\n',
        f'cp {geno_dir}/merged_LBD.* {lbd_out_dir}\n',
    ]
    with open('get_ftd_lbd.sh', 'w') as script:
        script.writelines(copy_lines)

    # the wildcards are expanded by bash when the script runs
    shell_do('bash get_ftd_lbd.sh')

In [None]:
def process_ftd_lbd_als(covars_path, ftd_geno_path, lbd_geno_path, ftd_out_path, lbd_out_path, als_out_path):
    """Build age-filtered FTD, ALS and LBD genotype subsets.

    The covariate file supplies ``FID``, ``CONSENSUS_AGE`` (renamed ``AGE``)
    and ``PHENO`` (renamed ``STATUS``). Each .fam is inner-merged with it on
    ``FID``; rows with ``AGE < 60`` whose .fam ``PHENO`` equals 1 are dropped.

    * FTD subset: ``STATUS`` in {CONTROL, FTD}, taken from ``ftd_geno_path``.
    * ALS subset: ``STATUS`` in {CONTROL, ALS}, also taken from ``ftd_geno_path``.
    * LBD subset: every remaining merged sample of ``lbd_geno_path``.

    For each subset ``{out}_ids.txt`` is written and ``plink2 --keep`` creates
    the fileset at the corresponding output prefix.

    Parameters
    ----------
    covars_path : str
        Whitespace-delimited covariate table with a header row.
    ftd_geno_path, lbd_geno_path : str
        Prefixes of the input plink binary filesets.
    ftd_out_path, lbd_out_path, als_out_path : str
        Output prefixes.
    """
    # read covars, isolate needed columns
    # (raw string: '\s' is an invalid escape sequence in a normal literal)
    covars = pd.read_csv(covars_path, sep=r'\s+')
    covars_needed = covars.loc[:, ['FID', 'CONSENSUS_AGE', 'PHENO']]
    covars_needed.columns = ['FID', 'AGE', 'STATUS']

    # FTD/ALS share one genotype fileset
    ftd_als_fam = read_fam(f'{ftd_geno_path}.fam')
    ftd_als_merge = ftd_als_fam.merge(covars_needed, how='inner', on=['FID'])
    ftd_als_merge = _drop_young_controls(ftd_als_merge)

    # isolate controls + FTD cases
    ftd_merge = ftd_als_merge[ftd_als_merge['STATUS'].isin(['CONTROL', 'FTD'])]
    _keep_samples(ftd_merge, ftd_geno_path, ftd_out_path)

    # isolate controls + ALS cases
    als_merge = ftd_als_merge[ftd_als_merge['STATUS'].isin(['CONTROL', 'ALS'])]
    _keep_samples(als_merge, ftd_geno_path, als_out_path)

    # LBD: no STATUS filter, only the age filter
    lbd_fam = read_fam(f'{lbd_geno_path}.fam')
    lbd_merge = lbd_fam.merge(covars_needed, how='inner', on=['FID'])
    lbd_merge = _drop_young_controls(lbd_merge)
    _keep_samples(lbd_merge, lbd_geno_path, lbd_out_path)


def _drop_young_controls(samples):
    """Return ``samples`` without rows where AGE < 60 and the .fam PHENO is 1."""
    # PHENO here is the .fam column: the covariate PHENO was renamed to STATUS.
    # NOTE(review): assumes the .fam codes controls as 1 -- confirm against the data.
    young_control = (samples['AGE'] < 60) & (samples['PHENO'] == 1)
    return samples.drop(samples[young_control].index)


def _keep_samples(samples, geno_path, out_path):
    """Write ``{out_path}_ids.txt`` and subset ``geno_path`` to those samples."""
    samples[['FID', 'IID']].to_csv(f'{out_path}_ids.txt', sep='\t', index=None, header=None)
    shell_do(f'plink2 --bfile {geno_path} --keep {out_path}_ids.txt --make-bed --out {out_path}')

In [None]:
def get_amp_pd_phenos(covars_path, geno_path, out_path):
    """Attach sex and PD phenotype to AMP-PD samples and subset the genotypes.

    ``FID, IID, PAT, MAT`` from ``{geno_path}.fam`` are inner-merged with the
    covariates on ``FID`` and ``IID``. Rows with ``DLB_PHENO == 2`` or
    ``PD_PHENO == -9`` are removed, as are rows with ``AGE_ANALYSIS < 60`` and
    ``PD_PHENO == 1``.

    Outputs: ``{out_path}_ids.txt``, the ``plink2 --keep`` fileset at
    ``out_path``, and ``{out_path}.fam`` overwritten with the merged table
    (``FID, IID, PAT, MAT, SEX, PD_PHENO``).

    Parameters
    ----------
    covars_path : str
        CSV containing ``FID, IID, SEX, PD_PHENO, AGE_ANALYSIS, DLB_PHENO``.
    geno_path : str
        Prefix of the input plink binary fileset.
    out_path : str
        Prefix for all outputs.
    """
    wanted_columns = ['FID', 'IID', 'SEX', 'PD_PHENO', 'AGE_ANALYSIS', 'DLB_PHENO']
    covars_needed = pd.read_csv(covars_path, sep=',')[wanted_columns]

    fam = read_fam(f'{geno_path}.fam')

    # sex and phenotype come from the covariates, not from the .fam
    samples = fam[['FID', 'IID', 'PAT', 'MAT']].merge(
        covars_needed, how='inner', on=['FID', 'IID'])

    # exclude LBD samples and samples without a PD phenotype
    usable = (samples['DLB_PHENO'] != 2) & (samples['PD_PHENO'] != -9)
    samples = samples[usable]

    # exclude controls younger than 60
    young_control = (samples['AGE_ANALYSIS'] < 60) & (samples['PD_PHENO'] == 1)
    samples = samples[~young_control]

    # back to six .fam columns
    samples = samples.drop(columns=['AGE_ANALYSIS', 'DLB_PHENO'])

    # sample list for plink
    samples[['FID', 'IID']].to_csv(f'{out_path}_ids.txt', sep='\t', index=None, header=None)

    # restrict the genotypes to the retained samples
    shell_do(f'plink2 --bfile {geno_path} --keep {out_path}_ids.txt --make-bed --out {out_path}')

    # replace the .fam written by plink2 with the phenotype-annotated one
    samples.to_csv(f'{out_path}.fam', sep='\t', index=None, header=None)

In [None]:
def get_adsp_genos(pfiles, individual_paths, out_path):
    """Convert the per-chromosome ADSP pfiles to bfiles and queue their merge.

    Parameters
    ----------
    pfiles : str
        Input pfile prefix template; ``*`` stands for the chromosome number.
    individual_paths : str
        Output bfile prefix template; ``*`` stands for the chromosome number.
    out_path : str
        Prefix of the merged fileset, forwarded to ``merge_bfiles``.
    """
    num_chr = 22

    # loop through individual chr pfiles and convert to bfiles
    for chrom in range(1, num_chr + 1):
        pfile = pfiles.replace('*', str(chrom))
        individual_out = individual_paths.replace('*', str(chrom))
        shell_do(f'plink2 --pfile {pfile} --make-bed --out {individual_out}')

    # merge individual chr bfiles into the prefix requested by the caller;
    # this previously read the notebook-level variable `adsp_geno_path` and
    # ignored the `out_path` argument
    merge_bfiles(individual_paths, out_path, num_chr)

In [None]:
def get_adsp_phenos(covars_path, geno_path, out_path):
    """Attach sex/AD phenotype to ADSP samples, subset the genotypes, rename IDs.

    The merge key is built from the first three ``-``-separated parts of each
    .fam ``IID`` and matched against the covariate ``SUBJID``. ``SEX`` is 1
    where ``Sex == 0`` and 2 otherwise; ``PHENO`` is 1 where ``AD == '0'`` and
    2 otherwise. Rows with ``Age < 60`` and ``AD == '0'`` are dropped (an age
    of ``'90+'`` counts as 90).

    Outputs: ``{out_path}_ids.txt``, the ``plink2 --keep`` fileset at
    ``out_path``, ``{out_path}.fam`` overwritten with the merged table,
    ``{out_path}_new_ids.txt``, and the renamed fileset from ``update_ids``.

    Parameters
    ----------
    covars_path : str
        Tab-separated table containing ``SUBJID, Sex, Age, AD``.
    geno_path : str
        Prefix of the input plink binary fileset.
    out_path : str
        Prefix for all outputs.
    """
    # read covars, isolate needed columns
    covars = pd.read_csv(covars_path, sep='\t')
    covars_needed = covars.loc[:, ['SUBJID', 'Sex', 'Age', 'AD']]

    # read fam file
    fam = read_fam(f'{geno_path}.fam', True)

    # build merge-ID for merging with covars
    fam_id_split = fam['IID'].str.split(pat='-', expand=True)
    fam['merge-ID'] = fam_id_split[0] + '-' + fam_id_split[1] + '-' + fam_id_split[2]

    # The fam is the LEFT frame on purpose: an inner merge keeps the order of
    # the left keys, and the table is later written over `{out_path}.fam`. With
    # the covariates on the left, the rows followed covariate order instead.
    # NOTE(review): presumably plink2 --keep retains the input sample order,
    # so the rewritten .fam must follow the original .fam -- confirm.
    merged = fam.merge(covars_needed, how='inner', left_on='merge-ID', right_on='SUBJID')

    # adjusting age and changing its type
    merged['Age'] = np.where(merged['Age'] == '90+', '90', merged['Age'])
    merged['Age'] = merged['Age'].astype(float)

    # assigning sex and pheno
    # NOTE(review): AD is compared with the string '0'; if the column is parsed
    # as numeric this never matches -- verify against the covariate file.
    merged['SEX'] = np.where(merged['Sex'] == 0, 1, 2)
    merged['PHENO'] = np.where(merged['AD'] == '0', 1, 2)

    # drop controls with age under 60
    merged = merged.drop(merged[(merged['Age'] < 60) & (merged['AD'] == '0')].index)

    # drop covar columns, leaving FID, IID, PAT, MAT, SEX, PHENO
    merged = merged.drop(columns=['SUBJID', 'Sex', 'Age', 'AD', 'merge-ID'])

    # write IDs to txt
    merged[['FID', 'IID']].to_csv(f'{out_path}_ids.txt', sep='\t', header=False, index=False)

    # only keep IDs left
    keep_cmd = f'plink2 --bfile {geno_path} --keep {out_path}_ids.txt --make-bed --out {out_path}'
    shell_do(keep_cmd)

    # write to file (replaces the .fam produced by the plink2 call above)
    merged.to_csv(f'{out_path}.fam', sep='\t', header=False, index=False)

    # new family and individual IDs are both derived from IID
    merged['FID_new'] = 'ADSP' + merged['IID'].astype(str)
    merged['IID_new'] = 'ADSP' + merged['IID'].astype(str)

    # write new IDs to txt
    merged[['FID', 'IID', 'FID_new', 'IID_new']].to_csv(
        f'{out_path}_new_ids.txt', sep='\t', index=None, header=None)

    update_ids(out_path, f'{out_path}_new_ids.txt')

In [None]:
# project directory
# Root under which the cohort output paths of the later cells are built.
# 'insert_path' is a placeholder and must be replaced before running.
wd = 'insert_path'

In [None]:
# jointGenotyping data paths
# jg_data_dir is a placeholder; replace 'insert_path' with the real directory.
jg_data_dir = 'insert_path'
# The '*' is left unexpanded here; vcf_to_pfile writes it as-is into the
# `bcftools concat` line of the generated bash script.
jg_seq_data_files = f'{jg_data_dir}/NIA_JG_1898_samples_GRM_WGS_b37_JointAnalysis01_2017-12-08_*.recalibrated_variants.vcf.gz'
# Temporary concatenated VCF (removed by the generated script) and pfile output prefix.
jg_out_vcf = f'{jg_data_dir}/temp.vcf'
jg_out_file = f'{jg_data_dir}/all'

# Queues a swarm job. NOTE(review): the job presumably runs asynchronously --
# make sure it has finished before running the cell that reads jg_out_file.
vcf_to_pfile(jg_seq_data_files, jg_out_vcf, jg_out_file)

In [None]:
# generate jointGenotyping plink binaries
# Output prefix lives under the project directory `wd`.
jg_out_dir = f'{wd}/ROSMAPMayoRNAseqMSBB'
jg_geno_path = f'{jg_out_dir}/joint_genotyping/jointGenotypingROSMAPMayoRNAseqMSBB'

# Reads the pfiles at jg_out_file (output prefix of the vcf_to_pfile job) and
# writes plink binaries at jg_geno_path.
pfiles_to_bfiles(jg_out_file, jg_geno_path)

In [None]:
# get jointGenotyping phenos
# Covariates are read from the data directory; outputs use the genotype prefix plus '_pheno'.
jg_covars_path = f'{jg_data_dir}/threeCohorts.csv'
jg_out_path = f'{jg_geno_path}_pheno'

# Also produces the renamed fileset `{jg_out_path}_new_ids` through update_ids.
get_joint_genotyping_phenos(jg_covars_path, jg_geno_path, jg_out_path)

In [None]:
# Rewrite variant IDs of the renamed jointGenotyping fileset to chr:bp:ref:alt (in place).
reformat_bim(f'{jg_out_path}_new_ids.bim')

In [None]:
# merge ADNI chromosomes
# adni_geno_dir is a placeholder for the directory with the per-chromosome
# `adni.nov2018.chr*` filesets; the merged prefix goes under `wd`.
adni_geno_dir = 'insert_path'
adni_geno_path = f'{wd}/ADNI/genotypes/ADNI_all'

# Queues a swarm job. NOTE(review): presumably asynchronous -- wait for it to
# finish before running the ADNI phenotype cell.
merge_adni(adni_geno_dir, adni_geno_path)

In [None]:
# get ADNI phenos
# adni_covars_path is a placeholder for the ADNI covariate CSV.
adni_covars_path = 'insert_path'
adni_out_path = f'{wd}/ADNI/genotypes/ADNI_all_pheno'

# Reads the merged ADNI fileset and writes the phenotype-filtered one at adni_out_path.
get_adni_phenos(adni_covars_path, adni_geno_path, adni_out_path)

In [None]:
# Rewrite variant IDs of the filtered ADNI fileset to chr:bp:ref:alt (in place).
reformat_bim(f'{adni_out_path}.bim')

In [None]:
# get dementia seq genotypes
# ftd_lbd_geno_dir is a placeholder for the directory holding merged_FTD.* and merged_LBD.*.
ftd_lbd_geno_dir = 'insert_path'
ftd_out_dir = f'{wd}/FTD_LBD_ALS/ftd_genotypes'
lbd_out_dir = f'{wd}/FTD_LBD_ALS/lbd_genotypes'
# als_out_dir is only used to build als_out_path in a later cell; nothing is
# copied into it here. NOTE(review): no visible code creates this directory --
# confirm it exists.
als_out_dir = f'{wd}/FTD_LBD_ALS/als_genotypes'

# Copies the FTD and LBD files into their output directories.
get_ftd_lbd_genos(ftd_lbd_geno_dir, ftd_out_dir, lbd_out_dir)

In [None]:
# isolate ALS cases and get dementia seq phenos
# Input prefixes point at the files copied by get_ftd_lbd_genos.
ftd_geno_path = f'{ftd_out_dir}/merged_FTD'
lbd_geno_path = f'{lbd_out_dir}/merged_LBD'
# Output prefixes for the age-filtered subsets.
ftd_out_path = f'{ftd_out_dir}/merged_FTD_age_filter'
lbd_out_path = f'{lbd_out_dir}/merged_LBD_age_filter'
als_out_path = f'{als_out_dir}/merged_ALS_age_filter'
ftd_lbd_covars_path = f'{ftd_lbd_geno_dir}/demseq_model1_covariates.txt'

# The ALS subset is extracted from the FTD genotype fileset (there is no separate ALS input).
process_ftd_lbd_als(ftd_lbd_covars_path, ftd_geno_path, lbd_geno_path, ftd_out_path, lbd_out_path, als_out_path)

In [None]:
# Rewrite variant IDs of the three age-filtered filesets to chr:bp:ref:alt (in place).
reformat_bim(f'{ftd_out_path}.bim')
reformat_bim(f'{lbd_out_path}.bim')
reformat_bim(f'{als_out_path}.bim')

In [None]:
# get AMP-PD phenos
# pd_data_dir is a placeholder; covariates and genotypes are both read from it.
# (`pd_` here is a variable-name prefix, unrelated to the pandas alias `pd`.)
pd_data_dir = 'insert_path'
pd_covars_path = f'{pd_data_dir}/encoded_AMP_covs_SEPT2021.csv'
pd_geno_path = f'{pd_data_dir}/amp_v2.5_formatted_split_normalized_allchr_plinkv19_maxAlleles2_EURO'
pd_out_path = f'{wd}/AMP_PD/amp_pd_pheno'

# Writes the phenotype-filtered AMP-PD fileset at pd_out_path.
get_amp_pd_phenos(pd_covars_path, pd_geno_path, pd_out_path)

In [None]:
# Rewrite variant IDs of the filtered AMP-PD fileset to chr:bp:ref:alt (in place).
reformat_bim(f'{pd_out_path}.bim')

In [None]:
# get ADSP genos
# adsp_data_dir is a placeholder. In the two template paths below, '*' is
# replaced by the chromosome number inside get_adsp_genos / merge_bfiles.
adsp_data_dir = 'insert_path'
adsp_pfiles = f'{adsp_data_dir}/chr*_formatted_normalized'
adsp_out_dir = f'{wd}/ADSP'
adsp_individual_out_path = f'{adsp_out_dir}/individual_chrs/chr*_formatted_normalized'
adsp_geno_path = f'{adsp_out_dir}/adsp_formatted_normalized'

# Converts chromosomes 1-22 and queues the merge as a swarm job.
# NOTE(review): the merge presumably runs asynchronously -- wait for it to
# finish before running the ADSP phenotype cell.
get_adsp_genos(adsp_pfiles, adsp_individual_out_path, adsp_geno_path)

In [None]:
# get ADSP phenos
# adsp_covars_path is a placeholder for the tab-separated ADSP covariate table.
adsp_covars_path = f'insert_path'
adsp_out_path = f'{adsp_out_dir}/adsp_formatted_normalized_pheno'

# Also produces the renamed fileset `{adsp_out_path}_new_ids` through update_ids.
get_adsp_phenos(adsp_covars_path, adsp_geno_path, adsp_out_path)

In [None]:
# Rewrite variant IDs of the renamed ADSP fileset to chr:bp:ref:alt (in place).
reformat_bim(f'{adsp_out_path}_new_ids.bim')