In [None]:
import pandas as pd
import subprocess
import sys
import numpy as np
import os
import shutil
import joblib
from biothings_client import get_client

In [None]:
def shell_do(command, log=False, return_log=False):
    """Run a command line and optionally surface its standard output.

    Args:
        command: Command line as a single string. It is tokenised with
            ``str.split``, so no shell is involved and individual arguments
            (including file paths) must not contain whitespace.
        log: When true, print the captured stdout.
        return_log: When true, return the captured stdout as a string.

    Returns:
        The UTF-8 decoded stdout when ``return_log`` is true, otherwise ``None``.
    """
    argv = command.split()
    pretty_cmd = ' '.join(argv)
    print(f'Executing: {pretty_cmd}', file=sys.stderr)

    # Only stdout is captured; the child's stderr is inherited from this process.
    res = subprocess.run(argv, stdout=subprocess.PIPE)

    # The exit status used to be discarded, so a failing command was
    # indistinguishable from a successful one. Report it without raising so
    # existing callers keep their best-effort behaviour.
    if res.returncode != 0:
        print(f'Warning: command exited with status {res.returncode}: {pretty_cmd}', file=sys.stderr)

    if log or return_log:
        captured = res.stdout.decode('utf-8')
        if log:
            print(captured)
        if return_log:
            return captured
    return None

In [None]:
def extract(path_dict, merge_bim, common_snps_out_path):
    """Subset every cohort's genotypes to the shared SNP list.

    Args:
        path_dict: Mapping of cohort name to a dict holding at least the
            ``'geno'`` (input prefix) and ``'out'`` (output prefix) entries.
        merge_bim: DataFrame whose ``ID`` column lists the SNPs to keep.
        common_snps_out_path: Text file that receives the SNP IDs, one per line.
    """
    # the ID list is what plink2 --extract consumes
    snp_ids = merge_bim[['ID']]
    snp_ids.to_csv(common_snps_out_path, sep='\t', index=False, header=False)

    # one plink2 run per cohort
    for cohort_paths in path_dict.values():
        geno_prefix = cohort_paths['geno']
        out_prefix = cohort_paths['out']
        shell_do(
            f"plink2 --bfile {geno_prefix} --extract {common_snps_out_path} "
            f"--make-bed --out {out_prefix}"
        )

In [None]:
def merge(path_dict, merge_list_out_path, out_path):
    """Queue a swarm job that merges the per-cohort filesets with plink 1.9.

    Writes the plink merge list to ``merge_list_out_path`` and the helper
    files ``merge_genotypes.sh`` / ``merge_genotypes.swarm`` to the current
    working directory, then submits the swarm file.

    Args:
        path_dict: Mapping of cohort name to a dict with an ``'out'`` prefix.
        merge_list_out_path: Destination of the plink ``--merge-list`` file.
        out_path: Output prefix of the merged fileset.
    """
    # merge list: one fileset prefix per line
    with open(merge_list_out_path, 'w') as list_file:
        list_file.writelines(f"{path_dict[cohort]['out']}\n" for cohort in path_dict)

    # bash wrapper around the plink merge
    script_lines = (
        '#!/usr/bin/env bash\n\n',
        'module load plink/1.9\n',
        f'plink --merge-list {merge_list_out_path} --make-bed --out {out_path}\n',
    )
    with open('merge_genotypes.sh', 'w') as script_file:
        script_file.writelines(script_lines)

    # single-line swarm file pointing at the wrapper
    with open('merge_genotypes.swarm', 'w') as swarm_file:
        swarm_file.write('bash merge_genotypes.sh')

    # submit to the queue
    shell_do('swarm -f merge_genotypes.swarm -g 200 --time 12:00:00 --module plink/1.9')

In [None]:
def make_phenotype_file(path_dict, geno_path, out_path):
    """Write a FID/IID/PHENOTYPE/COHORT table for the samples in the merged fileset.

    For every cohort the cohort ``.fam`` table is inner-joined to the merged
    ``.fam`` on (FID, IID). Samples with ``PHENO == 1`` are labelled
    ``'control'``; every other value gets the cohort's disease label
    (``ftd``, ``lbd``, ``als``, ``pd``; any other cohort name maps to ``ad``).
    When a sample appears in several cohorts the first occurrence, in
    ``path_dict`` order, is kept.

    Args:
        path_dict: Mapping of cohort name to a dict whose ``'fam'`` entry is a
            DataFrame with FID, IID and PHENO columns.
        geno_path: Prefix of the merged plink fileset (``.fam`` is read).
        out_path: Destination of the tab-separated table (no header, no index).
    """
    # read merged fam
    fam = pd.read_csv(f'{geno_path}.fam', sep=r'\s+', header=None)
    fam.columns = ['FID','IID','PAT','MAT','SEX','PHENO']
    print(fam.shape)

    out_cols = ['FID','IID','PHENOTYPE','COHORT']
    # case label per cohort; cohorts not listed here are treated as AD cohorts
    case_labels = {'FTD': 'ftd', 'LBD': 'lbd', 'ALS': 'als', 'PD': 'pd'}

    per_cohort = []
    for cohort in path_dict:
        # keep only the cohort samples that survived into the merged fileset
        cohort_samples = path_dict[cohort]['fam'].merge(fam[['FID','IID']], how='inner', on=['FID','IID'])
        cohort_samples['COHORT'] = cohort
        cohort_samples['PHENOTYPE'] = np.where(
            cohort_samples['PHENO'] == 1, 'control', case_labels.get(cohort, 'ad')
        )
        per_cohort.append(cohort_samples[out_cols])

    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    if per_cohort:
        total_merge = pd.concat(per_cohort)
    else:
        total_merge = pd.DataFrame(columns=out_cols)

    # drop any duplicates
    total_merge = total_merge.drop_duplicates(subset=['FID','IID'])
    print(total_merge.head())
    print(total_merge.shape)
    print(total_merge['PHENOTYPE'].value_counts())
    print(total_merge['COHORT'].value_counts())

    total_merge.to_csv(out_path, sep='\t', header=False, index=False)

In [None]:
def annovar_merge(annovar_dowload_file, geno_path, out_path):
    """Annotate the merged variants with avsnp150 rsIDs and keep only annotated SNPs.

    Steps: unpack the ANNOVAR archive next to itself, download the hg38
    avsnp150 database, write the ``.bim`` content as a VCF-style table, run
    ``table_annovar.pl``, extract the variants that received an rsID with
    plink2, and rewrite the ID column of the new ``.bim`` with those rsIDs.

    Args:
        annovar_dowload_file: Path to the ANNOVAR ``.tar.gz`` download. The
            misspelled name is kept so keyword callers keep working.
        geno_path: Prefix of the input plink fileset.
        out_path: Prefix for every file produced here (``.txt``,
            ``.hg38_multianno.txt``, ``_rsID.txt`` and the new fileset).
    """
    # could run in swarm job
    # The body used to read a *global* named annovar_download_file instead of
    # the (misspelled) parameter, so the argument was ignored.
    annovar_download_file = annovar_dowload_file

    # get download file folder; a bare filename means the current directory
    # (an empty dirname would otherwise turn "-C {wd}/" into "-C /")
    wd = os.path.dirname(annovar_download_file) or '.'

    # unpack annovar in wd
    shell_do(f'tar xvfz {annovar_download_file} -C {wd}/')
    annovar_folder_path = f'{wd}/annovar'

    # download hg38 - avsnp150
    download_cmd = f'{annovar_folder_path}/annotate_variation.pl -buildver hg38 -downdb -webfrom annovar avsnp150 {annovar_folder_path}/humandb/'
    shell_do(download_cmd)

    # read in bim; header=None is required, otherwise the first variant is
    # swallowed as a header row and silently dropped from the annotation
    bim = pd.read_csv(f'{geno_path}.bim', sep=r'\s+', header=None)
    bim.columns = ['CHR','ID','POS','BP','REF','ALT']

    # create annovar txt file
    # note: bim column 5 is labelled REF and column 6 ALT above, and the two
    # are swapped here, so the table's REF is bim column 6 and ALT is column 5
    annovar_txt = pd.DataFrame({
        '#CHROM': bim['CHR'],
        'POS': bim['BP'],
        'ID': bim['ID'],
        'REF': bim['ALT'],
        'ALT': bim['REF'],
        'QUAL': '.',
        'FILTER': '.',
        'INFO': 'PR',
    })
    annovar_txt.to_csv(f'{out_path}.txt', sep='\t', index=False)

    # get RSIDs
    convert_cmd = f'{annovar_folder_path}/table_annovar.pl {out_path}.txt {annovar_folder_path}/humandb/ -buildver hg38 --thread 1 -polish -out {out_path} -remove -protocol avsnp150 -operation f -nastring . -vcfinput'
    shell_do(convert_cmd)

    # read in RSIDs
    # NOTE(review): parsed as whitespace-delimited; this assumes no field of
    # the multianno table contains spaces - confirm against real output
    annovar_data_path = f'{out_path}.hg38_multianno.txt'
    annovar_data = pd.read_csv(annovar_data_path, sep=r'\s+')
    rsid_map = annovar_data[['avsnp150','Otherinfo6']]

    # merge with bim and get rid of SNPs with no RSID ('.' is the -nastring)
    annotated = bim.merge(rsid_map, how='inner', left_on=['ID'], right_on=['Otherinfo6'])
    annotated = annotated[annotated['avsnp150'] != '.']

    # write SNPs to text file and extract
    snps_path = f'{out_path}_rsID.txt'
    annotated[['ID']].to_csv(snps_path, sep='\t', index=False, header=False)
    shell_do(f'plink2 --bfile {geno_path} --extract {snps_path} --make-bed --out {out_path}')

    # read in new bim file and convert ID to rsID
    # NOTE(review): if Otherinfo6 holds duplicate IDs this join multiplies rows
    # and the rewritten .bim would no longer match the .bed - verify uniqueness
    new_bim = pd.read_csv(f'{out_path}.bim', sep=r'\s+', header=None)
    new_bim.columns = ['CHR','ID','POS','BP','REF','ALT']
    new_bim = new_bim.merge(rsid_map, how='inner', left_on=['ID'], right_on=['Otherinfo6'])
    new_bim['ID'] = new_bim['avsnp150']
    new_bim = new_bim.drop(columns=['avsnp150', 'Otherinfo6'])

    # write to file (overwrites the .bim produced by plink2 above)
    new_bim.to_csv(f'{out_path}.bim', sep='\t', index=False, header=False)

In [None]:
def run_merged_qc(bash_path, geno_path, out_path, env):
    """Queue the merged-cohort QC script as a swarm job.

    Writes ``merge_qc.swarm`` to the current working directory and submits it.

    Args:
        bash_path: QC bash script to run.
        geno_path: Value passed to the script's ``-i`` option.
        out_path: Value passed to the script's ``-o`` option.
        env: Value passed to the script's ``-e`` option.
    """
    # the swarm file holds a single command line
    swarm_line = f'bash {bash_path} -i {geno_path} -o {out_path} -e {env}\n'
    with open('merge_qc.swarm', 'w') as swarm_file:
        swarm_file.write(swarm_line)

    # submit to the queue
    shell_do('swarm -f merge_qc.swarm -g 200 --time 10-00:00:00 --module python/3.7,plink/1.9,GCTA')

In [None]:
def run_merged_ancestry(bash_path, geno_path, ref_path, ref_labels_path, out_path, env):
    """Queue the merged-cohort ancestry script as a swarm job.

    Writes ``merge_ancestry.swarm`` to the current working directory and
    submits it. The model pickle path handed to the script via ``-m`` is
    ``{out_path}_umap_linearsvc_ancestry_model.pkl``.

    Args:
        bash_path: Ancestry bash script to run.
        geno_path: Value passed to the script's ``-g`` option.
        ref_path: Value passed to the script's ``-r`` option.
        ref_labels_path: Value passed to the script's ``-l`` option.
        out_path: Value passed to the script's ``-o`` option.
        env: Value passed to the script's ``-e`` option.
    """
    pkl_path = f'{out_path}_umap_linearsvc_ancestry_model.pkl'
    swarm_line = (
        f'bash {bash_path} -g {geno_path} -r {ref_path} -l {ref_labels_path} '
        f'-m {pkl_path} -o {out_path} -e {env}'
    )
    with open('merge_ancestry.swarm', 'w') as swarm_file:
        swarm_file.write(swarm_line)

    # submit to the queue
    shell_do('swarm -f merge_ancestry.swarm -g 200 --time 10-00:00:00 --module python/3.7,plink/1.9,admixture')

In [None]:
def remove_by_ancestry(geno_path, pred_labels_path, pca_path, out_path, n_pcs=2):
    """Keep samples predicted as EUR that are not outliers on the leading PCs.

    A sample is kept when its predicted label is ``'EUR'`` and, for each of
    ``PC1`` .. ``PC{n_pcs}``, its value lies strictly inside
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``. The quartiles are always computed on the
    full EUR subset, not on the progressively trimmed one.

    Args:
        geno_path: Prefix of the input plink fileset.
        pred_labels_path: Whitespace-delimited table with FID, IID and label.
        pca_path: Whitespace-delimited table with FID, IID, label and PC columns.
        out_path: Output prefix; ``{out_path}_ids.txt`` and the filtered
            fileset are written.
        n_pcs: Number of leading principal components to screen. This argument
            used to be ignored (the loop was hard-coded to two components).
    """
    # read predicted_labels.txt file
    pred_labels = pd.read_csv(pred_labels_path, sep=r'\s+')

    # read project_new_pca.txt file and drop its label column so the merge
    # keeps only the predicted label
    pca = pd.read_csv(pca_path, sep=r'\s+').drop(columns=['label'])

    # merge
    labelled_pca = pred_labels.merge(pca, how='inner', on=['FID','IID'])

    # drop non-EUR samples
    eur = labelled_pca[labelled_pca['label'] == 'EUR']

    # drop PC outliers
    kept = eur.copy(deep=True)
    for pc_idx in range(1, n_pcs + 1):
        pc_col = f'PC{pc_idx}'
        print(pc_col)
        q1, q3 = np.quantile(eur[pc_col], [0.25, 0.75])
        iqr = q3 - q1
        lower_fence = q1 - (1.5 * iqr)
        upper_fence = q3 + (1.5 * iqr)
        kept = kept[(kept[pc_col] > lower_fence) & (kept[pc_col] < upper_fence)]
        print(kept.shape)

    # write FID, IID to txt (with a header line, as before)
    kept[['FID','IID']].to_csv(f'{out_path}_ids.txt', sep='\t', index=False)

    # plink --keep command
    keep_cmd = f'plink2 --bfile {geno_path} --keep {out_path}_ids.txt --make-bed --out {out_path}'
    shell_do(keep_cmd)

In [None]:
def extract_gwas_hits(geno_path, ad_hits_path, pd_hits_path, als_hits_path, lbd_hits_path, out_path, combined_out_path=None):
    """Extract the published GWAS lead SNPs that are present in the fileset.

    Args:
        geno_path: Prefix of the input plink fileset.
        ad_hits_path: Whitespace-delimited AD loci table; the rsID is the part
            of ``lead_SNP`` before the first underscore.
        pd_hits_path: CSV of PD risk variants; only rows with
            ``GWAS == 'META5'`` are used and an ``RSID`` column is expected.
        als_hits_path: CSV of ALS risk variants with the rsID in ``ID``.
        lbd_hits_path: CSV of LBD risk variants with the rsID in ``SNPS``.
        out_path: Output prefix; ``{out_path}_ids.txt`` and the extracted
            fileset are written.
        combined_out_path: Destination of the combined hit table. Defaults to
            ``{out_path}_combined.txt``. This table used to be written to a
            hard-coded absolute path on one specific cluster, which broke the
            function everywhere else.
    """
    # read bim file
    bim = pd.read_csv(f'{geno_path}.bim', sep=r'\s+', header=None)
    bim.columns = ['CHR','ID','LOC','BP','ALT','REF']

    # read hits files and normalise each one to an 'RSID' column
    ad_hits = pd.read_csv(ad_hits_path, sep=r'\s+')
    ad_hits['RSID'] = ad_hits['lead_SNP'].str.split('_').str[0]
    pd_hits = pd.read_csv(pd_hits_path, sep=',')
    pd_hits = pd_hits[pd_hits['GWAS'] == 'META5']
    als_hits = pd.read_csv(als_hits_path, sep=',').rename({'ID':'RSID'}, axis=1)
    lbd_hits = pd.read_csv(lbd_hits_path, sep=',').rename({'SNPS':'RSID'}, axis=1)

    # merge with bim, concat and drop duplicates (order: AD, PD, ALS, LBD)
    per_disease = [
        bim.merge(hits[['RSID']], how='inner', left_on=['ID'], right_on=['RSID'])
        for hits in (ad_hits, pd_hits, als_hits, lbd_hits)
    ]
    merge_hits = pd.concat(per_disease, ignore_index=True).drop_duplicates()

    if combined_out_path is None:
        combined_out_path = f'{out_path}_combined.txt'
    merge_hits.to_csv(combined_out_path)

    # write IDs to txt and extract
    merge_hits[['RSID']].to_csv(f'{out_path}_ids.txt', sep='\t', index=False, header=False)
    extract_cmd = f'plink2 --bfile {geno_path} --extract {out_path}_ids.txt --make-bed --out {out_path}'
    shell_do(extract_cmd)

In [None]:
def extract_gwas_variants(geno_path, ad_stats_path, pd_stats_path, als_stats_path, lbd_stats_path, ftd_hits_path, out_path):
    """Extract genome-wide significant variants from five GWAS summary-stat files.

    AD and LBD statistics are joined to the ``.bim`` on rsID (``variant_id``);
    PD, ALS and FTD statistics are joined on ``chromosome:position``. For each
    p-value threshold (currently only 5e-8) the function writes, using the
    prefix ``{out_path}_{threshold}`` (``str(5e-8)`` gives ``5e-08``):

    * ``_assoc.txt``: ID, effect allele, beta, disease (tab-separated)
    * ``_ids.txt``: variant IDs to extract, plus a fixed list of
      gene-associated SNPs that were not already selected
    * ``_snp_disease.csv`` / ``_snp_disease_genes.csv``: per-SNP disease
      lists, the latter with a gene symbol looked up via biothings
    * the extracted plink fileset

    Args:
        geno_path: Prefix of the input plink fileset.
        ad_stats_path: AD summary statistics (whitespace-delimited).
        pd_stats_path: PD summary statistics (whitespace-delimited).
        als_stats_path: ALS summary statistics (whitespace-delimited).
        lbd_stats_path: LBD summary statistics (whitespace-delimited).
        ftd_hits_path: FTD summary statistics (whitespace-delimited). The body
            used to read a global named ``ftd_stats_path`` instead of this
            parameter; the parameter is now honoured.
        out_path: Output prefix (see above).
    """
    key_cols = ['CHR','ID','LOC','BP','ALT','REF']

    # read bim file
    bim = pd.read_csv(f'{geno_path}.bim', sep=r'\s+', header=None)
    bim.columns = key_cols
    bim['mergeID'] = bim['CHR'].astype(str) + ':' + bim['BP'].astype(str)

    # read stats files and make mergeID where necessary
    ad_stats = pd.read_csv(ad_stats_path, sep=r'\s+')
    pd_stats = pd.read_csv(pd_stats_path, sep=r'\s+')
    pd_stats['mergeID'] = pd_stats['chr'].astype(str) + ':' + pd_stats['pos'].astype(str)
    als_stats = pd.read_csv(als_stats_path, sep=r'\s+')
    als_stats['mergeID'] = als_stats['CHR'].astype(str) + ':' + als_stats['BP'].astype(str)
    lbd_stats = pd.read_csv(lbd_stats_path, sep=r'\s+')
    ftd_stats = pd.read_csv(ftd_hits_path, sep=r'\s+')
    ftd_stats['mergeID'] = ftd_stats['Chr'].astype(str) + ':' + ftd_stats['Bp'].astype(str)

    # merge, drop, rename -> every table ends up with the bim columns plus
    # p_value, effect_allele, beta, disease
    merge_ad = bim.merge(ad_stats[['variant_id','p_value','effect_allele','beta']], how='inner', left_on=['ID'], right_on=['variant_id'])
    merge_ad = merge_ad.drop(columns=['mergeID', 'variant_id'])
    merge_ad['disease'] = 'AD'

    merge_pd = bim.merge(pd_stats[['mergeID','p','A1','b']], how='inner', on=['mergeID'])
    merge_pd = merge_pd.rename({'p':'p_value','A1':'effect_allele','b':'beta'}, axis=1)
    merge_pd = merge_pd.drop(columns=['mergeID'])
    merge_pd['disease'] = 'PD'

    merge_als = bim.merge(als_stats[['mergeID','P','Allele1','Effect']], how='inner', on=['mergeID'])
    merge_als = merge_als.rename({'P':'p_value','Allele1':'effect_allele','Effect':'beta'}, axis=1)
    merge_als = merge_als.drop(columns=['mergeID'])
    merge_als['effect_allele'] = merge_als['effect_allele'].str.upper()
    merge_als['disease'] = 'ALS'

    merge_lbd = bim.merge(lbd_stats[['variant_id','p_value','effect_allele','beta']], how='inner', left_on=['ID'], right_on=['variant_id'])
    merge_lbd = merge_lbd.drop(columns=['mergeID','variant_id'])
    merge_lbd['disease'] = 'LBD'

    merge_ftd = bim.merge(ftd_stats[['mergeID','P.value','Allele1','Beta']], how='inner', on=['mergeID'])
    merge_ftd = merge_ftd.rename({'P.value':'p_value','Allele1':'effect_allele','Beta':'beta'}, axis=1)
    merge_ftd = merge_ftd.drop(columns=['mergeID'])
    merge_ftd['effect_allele'] = merge_ftd['effect_allele'].str.upper()
    merge_ftd['disease'] = 'FTD'

    # the stacked table does not depend on the threshold, so build it once
    all_assoc = pd.concat([merge_ad,merge_pd,merge_als,merge_ftd,merge_lbd], ignore_index=True)

    # non-lead SNPs that are gene-associated and must always be extracted
    assoc_snps = {'rs7412':['AD'],'rs429358':['AD'],'rs75932628':['AD'],'rs143332484':['AD'],
                  'rs2230288':['PD'],'rs75548401':['PD'],'rs76763715':['PD'],'rs34637584':['PD'],'rs1491942':['LBD']}

    # other thresholds tried previously: [5e-3,1e-3,5e-5,1e-5,5e-8]
    for val in [5e-8]:
        # make out path
        out = f'{out_path}_{str(val)}'

        # keep associations below the threshold
        significant = all_assoc[all_assoc['p_value'] < val].drop(columns=['p_value'])

        # for PRS analysis
        significant[['ID','effect_allele','beta','disease']].to_csv(f'{out}_assoc.txt', sep='\t', header=False, index=False)

        # group same snps together: one row per variant, tuples of values
        grouped = significant.groupby(key_cols).agg(tuple).reset_index()

        # IDs to extract - appending all non-lead SNPs that are gene-associated
        ids = grouped[['ID']]
        selected_ids = set(ids['ID'])
        needed_assoc_snps = [snp for snp in assoc_snps if snp not in selected_ids]
        if needed_assoc_snps:
            # DataFrame.append is gone in pandas 2.0, and appending a Series
            # added the extra SNPs as one wide row; concat keeps one ID per line
            ids = pd.concat([ids, pd.DataFrame({'ID': needed_assoc_snps})], ignore_index=True)
        ids.to_csv(f'{out}_ids.txt', sep='\t', index=False, header=False)

        # create snp disease association df: columns 0..k-1 hold the diseases
        # of a SNP (ragged rows are padded), followed by 'ID'
        snp_disease = pd.DataFrame([list(diseases) for diseases in grouped['disease']])
        snp_disease['ID'] = grouped['ID']
        if 0 not in snp_disease.columns:
            # nothing passed the threshold: still provide the first disease
            # column so the forced SNPs below have somewhere to go
            snp_disease.insert(0, 0, np.nan)
        disease_cols = [col for col in snp_disease.columns if col != 'ID']

        if needed_assoc_snps:
            forced_rows = {col: [] for col in disease_cols}
            forced_rows['ID'] = []
            for snp in needed_assoc_snps:
                # first disease column gets the known disease, the rest stay empty
                forced_rows[0].append(assoc_snps[snp][0])
                for col in disease_cols[1:]:
                    forced_rows[col].append(np.nan)
                forced_rows['ID'].append(snp)
            snp_disease = pd.concat([snp_disease, pd.DataFrame(forced_rows)], axis=0, ignore_index=True)
        snp_disease.to_csv(f'{out}_snp_disease.csv', sep=',', header=False, index=False)

        # get gene associations
        variants = list(snp_disease['ID'])
        variant_map = {variant: None for variant in variants}
        variant_client = get_client('variant')
        hits = variant_client.querymany(variants, scopes='dbsnp.rsid', fields='all', verbose=False)
        for hit in hits:
            query = hit['query']
            gene = hit.get('dbsnp', {}).get('gene') if isinstance(hit.get('dbsnp'), dict) else None
            # NOTE(review): only a dict-shaped 'gene' entry is handled; if the
            # service can return a list of genes those SNPs get no symbol - confirm
            if gene is not None and 'symbol' in gene and not variant_map[query]:
                # first symbol seen for a query wins
                variant_map[query] = gene['symbol']
        # look up per row so the column stays aligned even if an ID repeats
        snp_disease['genes'] = [variant_map[variant] for variant in variants]
        snp_disease.to_csv(f'{out}_snp_disease_genes.csv', sep=',', header=False, index=False)

        # extract gwas variants
        extract_cmd = f'plink2 --bfile {geno_path} --extract {out}_ids.txt --make-bed --out {out}'
        shell_do(extract_cmd)

In [None]:
def downsample(geno_path, pheno_path, out_path, n_per_group=1000, random_state=42):
    """Draw an equal-sized random subset of cases from every phenotype.

    Controls are removed, the remaining samples are grouped by phenotype
    label and ``n_per_group`` samples are drawn from each group without
    replacement. The selected IDs are written to ``{out_path}_ids.txt`` and
    the corresponding fileset is produced with ``plink2 --keep``.

    Args:
        geno_path: Prefix of the input plink fileset.
        pheno_path: Whitespace-delimited table without header whose columns
            are FID, IID, phenotype label, cohort.
        out_path: Output prefix.
        n_per_group: Samples drawn per phenotype (default 1000, the previously
            hard-coded value).
        random_state: Seed used for every group (default 42, as before).

    A phenotype with fewer than ``n_per_group`` samples used to abort the
    whole run with a ``ValueError``; it is now kept in full and a warning is
    printed to stderr.
    """
    # read fam file
    fam = pd.read_csv(f'{geno_path}.fam', sep=r'\s+', header=None)
    fam.columns = ['FID','IID','PAT','MAT','SEX','PHENO']

    # read pheno file, remove controls
    pheno = pd.read_csv(pheno_path, sep=r'\s+', header=None)
    pheno.columns = ['FID','IID','PHENO','COHORT']
    pheno = pheno[pheno['PHENO'] != 'control']

    # merge; both tables carry PHENO, so the label ends up in PHENO_y
    cases = fam.merge(pheno, how='inner', on=['FID','IID'])
    print(cases.head())

    # sample n cases from each pheno (groups are visited in sorted label
    # order, and each group is sampled with the same seed, as before)
    sampled_groups = []
    for label, group in cases.groupby('PHENO_y'):
        n_take = min(n_per_group, len(group))
        if n_take < n_per_group:
            print(f'Warning: only {len(group)} samples for {label}; requested {n_per_group}', file=sys.stderr)
        sampled_groups.append(group.sample(n=n_take, random_state=random_state))

    if sampled_groups:
        balanced = pd.concat(sampled_groups).reset_index(drop=True)
    else:
        balanced = cases.iloc[0:0]

    # write IDs to txt
    balanced[['FID','IID']].to_csv(f'{out_path}_ids.txt', sep='\t', index=False)

    # plink keep cmd
    keep_cmd = f'plink2 --bfile {geno_path} --keep {out_path}_ids.txt --make-bed --out {out_path}'
    shell_do(keep_cmd)

In [None]:
def remove_controls(geno_path, pheno_path, out_path):
    """Write a copy of the fileset that contains cases only.

    Args:
        geno_path: Prefix of the input plink fileset.
        pheno_path: Whitespace-delimited table without header whose columns
            are FID, IID, phenotype label, cohort.
        out_path: Output prefix; ``{out_path}_ids.txt`` and the filtered
            fileset are written.
    """
    # raw-string separators: '\s' in a plain literal is an invalid escape sequence
    fam = pd.read_csv(f'{geno_path}.fam', sep=r'\s+', header=None)
    fam.columns = ['FID','IID','PAT','MAT','SEX','PHENO']

    pheno = pd.read_csv(pheno_path, sep=r'\s+', header=None)
    pheno.columns = ['FID','IID','PHENO','COHORT']

    # both tables carry PHENO; the label from the phenotype file is PHENO_y
    labelled = fam.merge(pheno, how='inner', on=['FID','IID'])
    cases = labelled[labelled['PHENO_y'] != 'control']

    cases[['FID','IID']].to_csv(f'{out_path}_ids.txt', sep='\t', index=False)

    keep_cmd = f'plink2 --bfile {geno_path} --keep {out_path}_ids.txt --make-bed --out {out_path}'
    shell_do(keep_cmd)

In [None]:
# root of the project tree (placeholder - set before running)
wd = 'insert_path'

# conda environment (needs jupyter and GenoTools installed)
env = 'genotools'

# merge pipeline script
bash_path = f'{wd}/processing/merge/merge_pipeline.sh'

# cached intermediates (joblib pickles)
path_dict_pkl_path = f'{wd}/processing/merge/path_dict.pkl'
merge_pkl_path = f'{wd}/processing/merge/merge_bim.pkl'

# summary statistics directory
stats_dir = f'{wd}/processing/sum_stats'

# output directory and its extract sub-directory, created if missing
out_dir = f'{wd}/merged_genotypes'
extract_dir = f'{out_dir}/extract'
for new_dir in (out_dir, extract_dir):
    os.makedirs(new_dir, exist_ok=True)

In [None]:
# input genotype prefixes, one per cohort
jg_geno_path = f'{wd}/ROSMAPMayoRNAseqMSBB/joint_genotyping/lifted/jointGenotypingROSMAPMayoRNAseqMSBB_pheno_qc_lifted'
adni_geno_path = f'{wd}/ADNI/genotypes/qc/ADNI_all_pheno_qc'
ftd_geno_path = f'{wd}/FTD_LBD_ALS/ftd_genotypes/qc/merged_FTD_qc'
lbd_geno_path = f'{wd}/FTD_LBD_ALS/lbd_genotypes/qc/merged_LBD_qc'
als_geno_path = f'{wd}/FTD_LBD_ALS/als_genotypes/qc/merged_ALS_qc'
pd_geno_path = f'{wd}/AMP_PD/qc/amp_pd_pheno_qc'
adsp_geno_path = f'{wd}/ADSP/qc/adsp_formatted_normalized_pheno_new_ids'

# matching output prefixes for the common-SNP subsets (all under extract_dir)
jg_out_path = f'{extract_dir}/jointGenotyping_common_snps'
adni_out_path = f'{extract_dir}/ADNI_common_snps'
ftd_out_path = f'{extract_dir}/FTD_common_snps'
lbd_out_path = f'{extract_dir}/LBD_common_snps'
als_out_path = f'{extract_dir}/ALS_common_snps'
pd_out_path = f'{extract_dir}/amp_pd_common_snps'
adsp_out_path = f'{extract_dir}/ADSP_common_snps'

In [None]:
# load pickle file if it exists
# NOTE: unpickling executes code stored in the file, so only load caches you
# created yourself. The cache is not refreshed when the paths above change;
# delete the pickle to rebuild it.
if os.path.isfile(path_dict_pkl_path):
    path_dict = joblib.load(path_dict_pkl_path)

else: # otherwise set it up
    # cohort -> (input prefix, output prefix); dict order is the merge order
    cohort_prefixes = {
        'JG': (jg_geno_path, jg_out_path),
        'ADNI': (adni_geno_path, adni_out_path),
        'FTD': (ftd_geno_path, ftd_out_path),
        'LBD': (lbd_geno_path, lbd_out_path),
        'ALS': (als_geno_path, als_out_path),
        'PD': (pd_geno_path, pd_out_path),
        'ADSP': (adsp_geno_path, adsp_out_path),
    }
    path_dict = {
        cohort: {'geno': geno_prefix, 'out': out_prefix}
        for cohort, (geno_prefix, out_prefix) in cohort_prefixes.items()
    }

    # read bim files and add to dict
    # (raw-string separator: '\s' in a plain literal is an invalid escape sequence)
    for cohort in path_dict:
        path_dict[cohort]['bim'] = pd.read_csv(f"{path_dict[cohort]['geno']}.bim", sep=r'\s+', header=None)
        path_dict[cohort]['bim'].columns = ['CHR','ID','LOC','BP','ALT','REF']
        print(cohort)
        print(path_dict[cohort]['bim'].shape)

    # read fam files and add to dict
    for cohort in path_dict:
        path_dict[cohort]['fam'] = pd.read_csv(f"{path_dict[cohort]['geno']}.fam", sep=r'\s+', header=None)
        path_dict[cohort]['fam'].columns = ['FID','IID','PAT','MAT','SEX','PHENO']
        print(cohort)
        print(path_dict[cohort]['fam'].shape)

    # create pickle file
    joblib.dump(path_dict, path_dict_pkl_path)

In [None]:
# build the table of SNPs shared by all cohorts, or reuse the cached copy
if not os.path.isfile(merge_pkl_path):
    # start from the JG bim and intersect with ADNI first, then with every
    # remaining cohort in path_dict order; the shape is printed after each step
    later_cohorts = [name for name in path_dict if name not in ('JG', 'ADNI')]
    merge_bim = path_dict['JG']['bim']
    for cohort in ['ADNI'] + later_cohorts:
        cohort_ids = path_dict[cohort]['bim'][['ID']]
        merge_bim = merge_bim.merge(cohort_ids, how='inner', on=['ID'])
        print(merge_bim.shape)

    # create pickle file
    joblib.dump(merge_bim, merge_pkl_path)

else:
    merge_bim = joblib.load(merge_pkl_path)
    print(merge_bim.shape)

In [None]:
# subset every cohort to the SNPs they all share (out_dir is {wd}/merged_genotypes)
common_snps_out_path = f'{out_dir}/common_snps.txt'
extract(path_dict, merge_bim, common_snps_out_path)

In [None]:
# queue the plink merge of the per-cohort subsets
# NOTE(review): the job is only submitted here; the following cells read the
# merged fileset, so presumably it must have finished first - confirm
merge_geno_path = f'{out_dir}/all_cohorts_common_snps'
merge_list_out_path = f'{out_dir}/merge_list.txt'
merge(path_dict, merge_list_out_path, merge_geno_path)

In [None]:
# phenotype / cohort table for the merged samples
phenotype_out_path = f'{out_dir}/all_cohorts_phenotype.txt'
make_phenotype_file(path_dict, merge_geno_path, phenotype_out_path)

In [None]:
# rsID annotation with ANNOVAR (placeholder path - set before running)
annovar_download_file = 'insert_path_to_annovar_gz_download_file'
annovar_out_path = merge_geno_path + '_annovar'
annovar_merge(annovar_download_file, merge_geno_path, annovar_out_path)

In [None]:
# queue the QC job on the annotated fileset
qc_out_path = f'{out_dir}/qc/all_cohorts_common_snps_annovar_related_prune'
qc_bash_path = f'{wd}/processing/merge/merge_qc.sh'
run_merged_qc(qc_bash_path, annovar_out_path, qc_out_path, env)

In [None]:
# queue the ancestry job on the QC'd fileset (reference paths are placeholders)
ref_path = 'insert_ref_panel_path'
ref_labels_path = 'insert_ref_label_path'
ancestry_out_path = f'{out_dir}/ancestry/all_cohorts_common_snps_annovar_related_prune'
ancestry_bash_path = f'{wd}/processing/merge/merge_ancestry.sh'
run_merged_ancestry(ancestry_bash_path, qc_out_path, ref_path, ref_labels_path, ancestry_out_path, env)

In [None]:
# label counts from the ancestry outputs: predicted labels, then adjusted labels
# (paths are derived from ancestry_out_path so they follow the ancestry run;
# raw-string separators avoid the invalid '\s' escape sequence)
pred_labels_path = f'{ancestry_out_path}_umap_linearsvc_predicted_labels.txt'
pred_labels = pd.read_csv(pred_labels_path, sep=r'\s+')
print(pred_labels['label'].value_counts())
adj_labels_path = f'{ancestry_out_path}_adjusted_labels.txt'
adj_labels = pd.read_csv(adj_labels_path, sep=r'\s+')
print(adj_labels['label'].value_counts())


In [None]:
# keep EUR samples without PC outliers; all inputs come from the ancestry run
pca_path = f'{ancestry_out_path}_projected_new_pca.txt'
pred_labels_path = f'{ancestry_out_path}_umap_linearsvc_predicted_labels.txt'
ancestry_prune_out_path = f'{ancestry_out_path}_eur'
remove_by_ancestry(qc_out_path, pred_labels_path, pca_path, ancestry_prune_out_path)


In [None]:
# published GWAS hit lists (stats_dir is {wd}/processing/sum_stats)
ad_hits_path = f'{stats_dir}/AD.loci.tsv'
pd_hits_path = f'{stats_dir}/pd_gwas_risk_variants.csv'
als_hits_path = f'{stats_dir}/als_gwas_risk_variants.csv'
lbd_hits_path = f'{stats_dir}/lbd_gwas_risk_variants.csv'

# extract the hits from the EUR fileset
gwas_out_path = f'{out_dir}/gwas_hits_common_snps_annovar_related_prune_eur'
extract_gwas_hits(ancestry_prune_out_path, ad_hits_path, pd_hits_path, als_hits_path, lbd_hits_path, gwas_out_path)

In [None]:
# GWAS summary statistics, one file per disease
ad_stats_path = f'{stats_dir}/Schwartzentruber_2021_lifted_hg38.txt'
pd_stats_path = f'{stats_dir}/nallsEtAl2019_lifted_hg38.txt'
als_stats_path = f'{stats_dir}/alsMetaSummaryStats_lifted_hg38.txt'
lbd_stats_path = f'{stats_dir}/GCST90001390_buildGRCh38.tsv'
ftd_stats_path = f'{stats_dir}/Meta-analysis.Matched.AllResults_lifted_hg38.txt'
significance_out_path = f'{wd}/merged_genotypes/significance/gwas_common_snps_annovar_related_prune_eur'

# the significance/ directory is not created anywhere else in the notebook,
# and both pandas and plink2 write into it
os.makedirs(os.path.dirname(significance_out_path), exist_ok=True)
extract_gwas_variants(ancestry_prune_out_path, ad_stats_path, pd_stats_path, als_stats_path, lbd_stats_path, ftd_stats_path, significance_out_path)


In [None]:
# cases-only copy of the 5e-08 significance fileset
cases_only_out_path = f'{wd}/merged_genotypes/significance/cases_only/gwas_common_snps_annovar_related_prune_eur_cases_5e-08'
# the cases_only/ directory is not created anywhere else in the notebook
os.makedirs(os.path.dirname(cases_only_out_path), exist_ok=True)
remove_controls(f'{significance_out_path}_5e-08', phenotype_out_path, cases_only_out_path)

In [None]:
# balanced downsample of the 5e-08 significance fileset
significance_geno_path = f'{significance_out_path}_5e-08'
downsampled_out_path = f'{wd}/merged_genotypes/downsampled/gwas_common_snps_annovar_related_prune_eur_5e-08_downsampled'
# the downsampled/ directory is not created anywhere else in the notebook
os.makedirs(os.path.dirname(downsampled_out_path), exist_ok=True)
downsample(significance_geno_path, phenotype_out_path, downsampled_out_path)

In [None]:
# sanity check: samples in the downsampled fileset
# (raw-string separator: '\s' in a plain literal is an invalid escape sequence)
fam = pd.read_csv(f'{downsampled_out_path}.fam', sep=r'\s+', header=None)
print(fam.head())
print(fam.shape)

In [None]:
# sanity check: variants in the downsampled fileset
# (raw-string separator: '\s' in a plain literal is an invalid escape sequence)
bim = pd.read_csv(f'{downsampled_out_path}.bim', sep=r'\s+', header=None)
print(bim.head())
print(bim.shape)