## Adjust genotypes by APOE and PCs

In [None]:
import h5py
import pandas as pd
import tables
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statistics
import umap.umap_ as umap
from joblib import dump, load
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import subprocess
import sys
import os
import shutil
import joblib
from QC.utils import shell_do
import patsy

In [None]:
# Use GenoML to construct the .h5 that will be used for adjusting (GenoML will normalize and munge the data as well)
# `wd` is the project working directory; replace the 'insert_path' placeholder.
# The later cells build their input/output paths as f'{wd}/...'.
wd ='insert_path'

## APOE prep (SKIP FOR NOW)

Check imputation quality. If R2 > 0.95 then follow this script to get APOE genotype: https://github.com/neurogenetics/APOE_genotypes

Then create a column that has 0 for no e4 alleles, 1 for e3/e4 and e1/e4, 2 for e4/e4.

If R2 < 0.95, use dosage of rs429358 and rs7412 as 2 separate columns.

## PCA Prep

In [None]:
# using flashPCA
def adj_pca_flash(geno_path, out_path, dim, exclusion = None):
    """Filter and prune a PLINK fileset, then run flashpca on the pruned SNPs.

    Steps, run in order through ``shell_do``:
      1. filter ``geno_path`` (MAF, missingness, HWE, autosomes, optional
         ``--exclude`` file) into the fileset ``out_path``;
      2. LD-prune with ``--indep-pairwise 1000 10 0.02``;
      3. extract the pruned-in variants to ``{out_path}_pruned_snps``.
    Finally ``flash_pca`` is called on ``{out_path}_pruned_snps`` with the
    output prefix ``{out_path}_pca``.

    Args:
        geno_path: PLINK binary fileset prefix to read (``--bfile``).
        out_path: prefix for every file written by this function.
        dim: forwarded to ``flash_pca`` as ``dim``.
        exclusion: optional file passed to PLINK ``--exclude``; skipped
            when ``None``.

    Returns:
        None.
    """
    # Filter data (you can speed up things by adding --memory 119500 --threads 19 in PLINK)
    exclude_flag = f' --exclude {exclusion}' if exclusion is not None else ''
    cmd_filter = f'plink --bfile {geno_path} --maf 0.05 --geno 0.01 --hwe 5e-6 --autosome{exclude_flag} --make-bed --out {out_path}'

    # Prune snps
    cmd_prune = f'plink --bfile {out_path} --indep-pairwise 1000 10 0.02 --autosome --out {out_path}_pruned_data'

    # Extract pruned SNPs and only these variants will be used for PC calculation
    cmd_extract = f'plink --bfile {out_path} --extract {out_path}_pruned_data.prune.in --make-bed --out {out_path}_pruned_snps'

    # The commands were previously only assembled, never run, so the
    # "_pruned_snps" fileset read below did not exist. Each step consumes the
    # previous step's output, hence the fixed order.
    for cmd in (cmd_filter, cmd_prune, cmd_extract):
        shell_do(cmd)

    # run PCS using flashpca
    # NOTE(review): flash_pca is not imported in the visible cells; the script
    # generated by adj_pca_fbash uses "from flash_pca import flash_pca" --
    # confirm it is in scope before calling this function.
    # NOTE(review): the output prefix here is "{out_path}_pca" whereas
    # adj_pca_fbash passes "{out_path}" -- confirm which one downstream
    # cells expect.
    flash_pca(f'{out_path}_pruned_snps', f'{out_path}_pca', dim = dim)
    
def adj_pca_fbash(geno_path, out_path, dim, exclusion = None):
    """Generate a PCA-prep script and submit it as a swarm job.

    Writes two files into the current working directory:
      * ``PCA_prep_flash.py`` -- a Python script that runs three PLINK
        commands through ``shell_do`` (filter, LD-prune, extract pruned
        SNPs) and then calls ``flash_pca`` on ``{out_path}_pruned_snps``
        with output prefix ``{out_path}``;
      * ``PCA_prep_flash.swarm`` -- a one-line swarm file that runs the script.
    The swarm file is then submitted with ``swarm`` via ``shell_do``.

    Args:
        geno_path: PLINK binary fileset prefix to read (``--bfile``).
        out_path: prefix for the files the generated script writes.
        dim: value written into the generated ``flash_pca(..., dim = ...)`` call.
        exclusion: optional file passed to PLINK ``--exclude``; skipped
            when ``None``.

    Returns:
        None.
    """
    # Filter data (you can speed up things by adding --memory 119500 --threads 19 in PLINK)
    exclude_flag = f' --exclude {exclusion}' if exclusion is not None else ''

    # NOTE: paths are embedded between double quotes in the generated source,
    # so they must not themselves contain double quotes or backslashes.
    script_lines = [
        #'#!/usr/bin/env bash\n\n',
        'from flash_pca import flash_pca \n',
        'from QC.utils import shell_do',
        '\n \n',
        f'cmd1 = "plink --bfile {geno_path} --maf 0.05 --geno 0.01 --hwe 5e-6 --autosome{exclude_flag} --make-bed --out {out_path}" \n',
        # Prune snps
        f'cmd2 = "plink --bfile {out_path} --indep-pairwise 1000 10 0.02 --autosome --out {out_path}_pruned_data" \n',
        # Extract pruned SNPs and only these variants will be used for PC calculation
        f'cmd3 = "plink --bfile {out_path} --extract {out_path}_pruned_data.prune.in --make-bed --out {out_path}_pruned_snps" \n \n',
        'cmd_list = [cmd1,cmd2,cmd3] \n',
        '[shell_do(cmd) for cmd in cmd_list] \n',
        # Calculate/generate PCs based on pruned data set + loadings
        f'flash_pca("{out_path}_pruned_snps", "{out_path}", dim = {dim})',
    ]

    # The with-blocks close the files; no explicit close() is needed afterwards.
    with open('PCA_prep_flash.py', 'w') as f:
        f.writelines(script_lines)

    # write swarm script
    with open('PCA_prep_flash.swarm', 'w') as f:
        f.write('python PCA_prep_flash.py')

    swarm_cmd = 'swarm -f PCA_prep_flash.swarm -g 50 --time 24:00:00 --module plink/1.9.0-beta4.4,flashpca'
    shell_do(swarm_cmd)

In [None]:
# Input PLINK fileset prefix; passed to adj_pca_fbash as geno_path (used with --bfile).
geno = f'{wd}/merged_genotypes/downsampled/gwas_common_snps_annovar_related_prune_eur_5e-08_downsampled'
# Output prefix; passed to adj_pca_fbash as out_path for the filtered/pruned filesets and the PCA output.
out = f'{wd}/processing/adjustment/downsampled/gwas_common_snps_annovar_related_prune_eur_5e-08_downsampled'
# File handed to PLINK --exclude during filtering.
# NOTE(review): the file name suggests genomic regions; PLINK 1.9 reads range
# files with "--exclude range <file>", plain "--exclude" expects variant IDs --
# confirm the file format.
exc = f'{wd}/processing/adjustment/hg38_exclusion_regions.txt'

In [None]:
# Writes PCA_prep_flash.py / PCA_prep_flash.swarm in the current directory and
# submits them with swarm; dim = 10 is forwarded to the generated flash_pca call.
adj_pca_fbash(geno, out, dim = 10, exclusion = exc)

## Confounders file
#### If using flashpca method for PCs

In [None]:
pca_eigenvecs = f'{wd}/processing/adjustment/downsampled/gwas_common_snps_annovar_related_prune_eur_5e-08_downsampled.pcs'
# create df
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
eigen_df = pd.read_csv(pca_eigenvecs, sep = r'\s+')

# modify df to proper format: drop the first column and rename IID -> ID.
# rename() returns a new frame, so nothing is changed in place on a slice of
# eigen_df (this avoids pandas' SettingWithCopyWarning).
loadings_df_10 = eigen_df.iloc[:,1:].rename(columns = {'IID': 'ID'})

# export out file
loadings_df_10.to_csv(f'{wd}/processing/adjustment/downsampled/gwas_common_snps_10PC_LOADINGS.csv', header = True, index = False)

In [None]:
# load in the confounder table exported by the previous cell
# (an ID column plus the remaining columns of the .pcs file)
confounders = f'{wd}/processing/adjustment/downsampled/gwas_common_snps_10PC_LOADINGS.csv'
confounders_df = pd.read_csv(confounders)

## Use GenoML to prep data for adjustment

Directions for installing GenoML: https://github.com/GenoML/genoml2

In [None]:
!genoml discrete supervised munge \
--geno /wd/merged_genotypes/downsampled/gwas_common_snps_annovar_related_prune_eur_5e-08_downsampled \
--prefix /wd/processing/adjustment/downsampled/gwas_common_snps_related_prune_5e-08_downsampled \
--pheno /wd/processing/adjustment/downsampled/gwas_common_snps_annovar_related_prune_eur_5e-08_downsampled_pheno.csv \
--impute mean

In [None]:
# read in munge file: the 'dataForML' table of the .h5 written under the
# --prefix given to the genoml munge command
munged_data = f'{wd}/processing/adjustment/downsampled/gwas_common_snps_related_prune_5e-08_downsampled.dataForML.h5'
target_data_df = pd.read_hdf(munged_data, 'dataForML')

### ADJUSTMENT PREP AND RUN

In [None]:
# create target columns file and export out
# Every column after the first two of the munged frame is written, one name per line.
cols = target_data_df.columns.tolist()[2:]
# The with-block closes the file even if a write fails part-way through.
with open(f"{wd}/processing/adjustment/downsampled/gwas_common_snps_related_prune_5e-08_downsampled_columns_5e_08_cases.txt", "w") as f:
    f.writelines(f'{var}\n' for var in cols)

In [None]:
# read in target columns file and create df
target_columns = f"{wd}/processing/adjustment/downsampled/gwas_common_snps_related_prune_5e-08_downsampled_columns_5e_08_cases.txt"
target_column_df = pd.read_csv(target_columns, names=['TARGETS'])

# Keep only intersecting feature names left in munged set (removed either because --gwas or std dev of 0 etc.)
target_data_list = target_data_df.columns
target_column_list = target_column_df['TARGETS'].tolist()
# Filter in file order (de-duplicated) rather than through set.intersection,
# whose iteration order is arbitrary and can differ from run to run.
_available_columns = set(target_data_list)
intersecting_list = [name for name in dict.fromkeys(target_column_list) if name in _available_columns]

target_column_df = pd.DataFrame(intersecting_list,columns=['TARGETS'])

print(len(intersecting_list))
print(intersecting_list[:10])
print(target_data_df.head())
print(target_data_df.shape)

normalize_switch = 'yes' # 'yes' Z-normalizes the residuals; any other value keeps the raw residuals.

# Munging begins. First make feature lists for targets and confounders. Then merge datasets.

target_list = list(target_column_df['TARGETS'])
# Every confounder column except the merge key, wherever 'ID' sits in the frame.
confounder_list = [col for col in confounders_df.columns if col != 'ID']
columns_to_keep_list = list(target_data_df.columns)

# Inner merge: only samples present in both frames are kept.
adjustments_df = target_data_df.merge(confounders_df, how='inner', on='ID', suffixes=['', '_y'])

In [None]:
# Here is where we start the adjusting
## First, make the formula pieces
import time
t0 = time.time()
formula_for_confounders = ' + '.join(confounder_list)
n_targets = len(target_list)
for i, target in enumerate(target_list, start=1):
    current_target = str(target)
    print(f"working on {current_target}. {i} out of {n_targets} targets complete. {round((i/n_targets)*100,2)}% complete")
    current_formula = current_target + " ~ " + formula_for_confounders
    #print(current_formula)
    # Regress the target on the confounders; the residuals replace the target column.
    target_model = smf.ols(formula= current_formula, data=adjustments_df).fit()
    residuals = pd.to_numeric(target_model.resid)
    if normalize_switch == 'yes':
        # Z-normalize the residuals directly instead of going through a
        # temporary DataFrame column that is inserted and dropped every pass.
        residuals = (residuals - residuals.mean())/residuals.std()
    # Assignment aligns on the index, so rows the model did not use stay NaN.
    adjustments_df[current_target] = residuals
t1 = time.time()

# Elapsed wall-clock time in seconds (was t0-t1/60: wrong sign and precedence).
runtime = t1 - t0

print(f'Total run time was {runtime} seconds!')

In [None]:
# Now some more munging to just extract columns that are in original dataset
# .copy() makes adjusted_df an independent frame, so the later in-place edits
# (column relabelling, ID assignment) do not raise SettingWithCopyWarning.
adjusted_df = adjustments_df[columns_to_keep_list].copy()

In [None]:
# Save as an --addit file format
# The PHENO column is dropped and the remaining columns are written as CSV without the index.
tweaked_adj_df = adjusted_df.drop(columns=['PHENO'])
tweaked_adj_df.to_csv(f"{wd}/processing/adjustment/downsampled/downsampled_gwas5e08_ADJUSTED10PCs.csv", index=False)

### Modify IDs and SNP Position columns to avoid errors later on (ONLY IF YOU DON'T HAVE RSID BUT HAVE POSITION)

In [None]:
# create dfs to match up temp names with originals
col_list = target_data_df.columns.tolist()

# modify column names for SNPs: the first two labels become 'ID' and 'PHENO',
# every later column gets a formula-safe temporary name var0, var1, ...
# NOTE(review): assumes the first two columns of target_data_df are ID and PHENO -- confirm.
edit_tl = ['ID', 'PHENO'] + [f'var{i}' for i, _ in enumerate(col_list[2:])]

key_df = pd.DataFrame() # holds SNP data
key_df['Original'] = col_list
key_df['temp'] = edit_tl

# modify IDs
id_df = pd.DataFrame()
id_df['Original_id'] = target_data_df['ID']
id_df['temp_id'] = range(len(target_data_df['ID'])) # temp ids

# update confounders IDs to match
# Translate each confounder row through the Original_id -> temp_id lookup so a
# sample keeps its own PCs. Assigning id_df['temp_id'] directly would pair rows
# by position, which is only right if both frames list samples in the same order.
# IDs absent from target_data_df become NaN and fall out of a later inner merge.
# Run once per load of confounders_df: a second run would look up temp IDs
# in a table keyed by the original IDs.
confounders_df['ID'] = confounders_df['ID'].map(dict(zip(id_df['Original_id'], id_df['temp_id'])))

# replacing SNP names with temp name for adjustment
target_data_df2 = target_data_df.set_axis(edit_tl, axis = 1)
target_data_df2['ID'] = id_df['temp_id']

In [None]:
# replace names
# extract correct labels
replace_cols = target_data_df.columns.to_list()

# replace with correct labels
# Column labels are restored positionally, so adjusted_df must have the same
# number and order of columns as target_data_df.
adjusted_df.columns = replace_cols
# NOTE(review): this assignment aligns on the index, not on the ID values; it
# presumably relies on adjusted_df having the same row index/order as
# target_data_df (no samples dropped by the inner merge) -- confirm.
adjusted_df.loc[:,'ID'] = id_df.loc[:,'Original_id']