## Quantile transformation and filtering for phenotypes / covariates
Basic pre-processing notebook.

In [None]:
import pandas as pd
import numpy as np
import yaml
import os
import seaborn as sns

from numpy import unique
from numpy import setdiff1d
from sklearn.preprocessing import quantile_transform

In [None]:
# os.chdir('../../../') # change to the main project directory

In [None]:
# Load the project configuration; every later cell reads its paths from `config`.
with open('conf/config.yaml', 'r') as stream:
    try:
        # safe_load only builds plain Python objects and, unlike a bare
        # yaml.load(stream), does not require an explicit Loader argument.
        config = yaml.safe_load(stream)
    except yaml.YAMLError as ex:
        # Show the parser message, then re-raise: continuing without `config`
        # would only fail later with an unrelated NameError.
        print(ex)
        raise

In [None]:
# Locations of the raw phenotype and covariate tables, taken from the config.
pheno_file, covar_file = config['phenotypes_raw'], config['covariates_raw']

# Both tables are tab-separated, have a header row and are indexed by 'iid'.
pheno = pd.read_csv(
    pheno_file,
    sep='\t',
    header=0,
    index_col=['iid'],
)
covar = pd.read_csv(
    covar_file,
    sep='\t',
    header=0,
    index_col=['iid'],
)

In [None]:
# Read the IDs listed in the file at config['iid_withdraw'] (one integer per line).
iid_withdraw = config['iid_withdraw']
with open(iid_withdraw, 'r') as infile:
    # Blank lines (e.g. a trailing empty line) are skipped instead of
    # crashing int() with a ValueError.
    iid_wd = [int(line) for line in (raw.strip() for raw in infile) if line]
# De-duplicate and sort. -1 is always added to the set.
# NOTE(review): presumably -1 is a placeholder ID that must never be kept - confirm.
iid_wd = unique(iid_wd + [-1])

In [None]:
# Collect the IDs from every file listed under config['iid_exclude']
# (one integer per line) into a single list.
# NOTE(review): assumes config['iid_exclude'] is a list of paths, not a single string - confirm.
iid_exclude = config['iid_exclude']
iid_excl = []
for path in iid_exclude:
    with open(path, 'r') as infile:
        # Blank lines are skipped instead of crashing int() with a ValueError.
        iid_excl.extend(
            int(line) for line in (raw.strip() for raw in infile) if line
        )

In [None]:
# IDs present in each table that are not in the withdrawn-ID set
# (np.setdiff1d returns them sorted and unique).
iid_covar = np.setdiff1d(covar.index.values, iid_wd)
iid_pheno = np.setdiff1d(pheno.index.values, iid_wd)

In [None]:
# Number of covariate IDs left after removing the withdrawn IDs.
print(iid_covar.size)

In [None]:
# Number of phenotype IDs left after removing the withdrawn IDs.
print(iid_pheno.size)

In [None]:
# Restrict each table to its own list of retained IDs.
pheno, covar = pheno.loc[iid_pheno], covar.loc[iid_covar]

In [None]:
# Second filter: drop every ID collected in iid_excl from each table's IDs.
iid_covar = np.setdiff1d(covar.index.values, iid_excl)
iid_pheno = np.setdiff1d(pheno.index.values, iid_excl)

In [None]:
# Restrict each table to its OWN retained IDs. The index arrays were
# previously swapped (pheno indexed with iid_covar and vice versa), which
# raises a KeyError or yields all-NaN rows whenever the two tables do not
# contain exactly the same individuals.
pheno = pheno.loc[iid_pheno]
covar = covar.loc[iid_covar]

In [None]:
# Pairwise Pearson correlation between the phenotype columns.
pcor = pheno.corr(method='pearson')

In [None]:
# Clustered heatmap of the phenotype correlations, colour scale centred on zero.
sns.clustermap(
    pcor,
    cmap='RdBu_r',
    center=0.0,
    figsize=(13, 13),
)

In [None]:
def transform(x, seed=1):
    '''
    Return Gaussian quantile-transformed values of ``x``; "nan" are kept.

    Parameters
    ----------
    x : 1-D pandas Series (or other 1-D numeric array-like)
        Values to transform. The input is not modified.
    seed : int
        Seed passed to ``np.random.seed`` before the transformation.

    Returns
    -------
    numpy.ndarray of float with the same length as ``x``. Non-NaN entries are
    replaced by their normal-quantile-transformed values, NaN entries stay NaN.
    If ``x`` has no non-NaN entries it is returned unchanged (as floats).
    '''
    np.random.seed(seed)
    # Work on a float copy: with an integer input the transformed values
    # would otherwise be truncated when written back into an int array.
    x_transform = np.array(x, dtype=float)
    is_nan = np.isnan(x_transform)
    n_quantiles = int(np.sum(~is_nan))

    # Nothing to transform (empty or all-NaN input); quantile_transform
    # cannot be called with zero samples / zero quantiles.
    if n_quantiles == 0:
        return x_transform

    # One quantile per observed value and no subsampling, so every
    # observation contributes to the empirical distribution.
    x_transform[~is_nan] = quantile_transform(
        x_transform[~is_nan].reshape([-1, 1]),
        n_quantiles=n_quantiles,
        subsample=n_quantiles,
        output_distribution='normal',
        copy=True,
    )[:, 0]
    return x_transform

In [None]:
# Quantile-transform every phenotype column, using a fixed seed of 100.
pheno_transform_all = pheno.transform(
    lambda column: transform(column, seed=100)
)

In [None]:
# sklearn sometimes produces ugly pile-ups at the extreme ends of the range
# when it cannot break ties. When the smallest (largest) value of a column
# occurs more than once, those entries are set to the second-smallest
# (second-largest) distinct value, to prevent too many extreme outliers.
# This should not affect most phenotypes.

for p in pheno_transform_all.columns:

    # Sorted distinct non-NaN values of the column.
    levels = np.unique(pheno_transform_all[p].dropna().values)
    # Columns with fewer than two distinct values (constant or all-NaN) have
    # no "second" value to fall back to, so they are left untouched.
    if levels.size > 1:
        at_min = pheno_transform_all[p] == levels[0]
        if at_min.sum() > 1:
            pheno_transform_all.loc[at_min, p] = levels[1]

    # Recompute after the lower-tail fix, which may have removed a level.
    levels = np.unique(pheno_transform_all[p].dropna().values)
    if levels.size > 1:
        at_max = pheno_transform_all[p] == levels[-1]
        if at_max.sum() > 1:
            pheno_transform_all.loc[at_max, p] = levels[-2]
        

In [None]:
# Pairwise Pearson correlation between the transformed phenotype columns.
ptcor = pheno_transform_all.corr(method='pearson')

In [None]:
# Clustered heatmap of the transformed-phenotype correlations, centred on zero.
sns.clustermap(
    ptcor,
    cmap='RdBu_r',
    center=0.0,
    figsize=(13, 13),
)

In [None]:
# os.makedirs('./data/covariates/', exist_ok=True)
# pheno_transform_all.to_csv('./data/covariates/phenotypes_transformed.tsv.gz', sep='\t')
# covar.to_csv('./data/covariates/covariates.tsv.gz', sep='\t')