In [None]:
import os
#os.chdir('../')

## Export results for splicing

In [None]:
import pandas as pd
import numpy as np
import gzip
from glob import glob

In [None]:
import scipy.stats as st

In [None]:
from script.python.util.snake import clean_str

import yaml

# Parse the YAML configuration; later cells read config['phenotypes'] from it.
# NOTE(review): if the config only holds plain scalars/lists/mappings,
# yaml.safe_load would be the more conservative loader -- confirm before switching.
with open('conf/config.yaml') as config_handle:
    config = yaml.load(config_handle, Loader=yaml.FullLoader)

In [None]:

# Read the phenotype names from the header (first line) of the tab-separated
# phenotype file, which may or may not be gzip-compressed.
phenotype_path = config['phenotypes']

# gzip.open and the builtin open expose the same text-mode interface, so the
# opener is chosen once instead of duplicating the parsing in two branches.
open_fn = gzip.open if phenotype_path.endswith('.gz') else open

with open_fn(phenotype_path, 'rt') as f:
    # Fix: the uncompressed branch used `f.readline.rstrip()` (missing call
    # parentheses), which raised AttributeError for non-gzipped files.
    phenotypes = f.readline().rstrip().split('\t')

# Map cleaned name -> original name. The first header field is skipped
# (presumably the sample-identifier column -- confirm against the file format).
phenotypes = {clean_str(p): p for p in phenotypes[1:]}

In [None]:
def estimate_lambda(pv):
    '''
    compute lambda (genomic inflation factor) from a set of P-values:
    the median chi-square(1 df) statistic implied by the P-values divided
    by the median of the chi-square(1 df) distribution
    ------------------------------------------------------------------
    pv          P-values; numpy array or pandas Series (anything that
                supports boolean-mask indexing). NaN entries are ignored
    ------------------------------------------------------------------
    returns     lambda value
    ------------------------------------------------------------------
    '''

    # discard missing P-values before converting them to test statistics
    observed_pv = pv[~np.isnan(pv)]

    # inverse survival function: P-value -> chi-square statistic with 1 df
    chi2_stats = st.chi2.isf(np.abs(observed_pv), 1)

    median_observed = np.median(chi2_stats)
    median_expected = st.chi2(1).median()

    return median_observed / median_expected

In [None]:
def rename_cols(cols, suffix=''):
    '''
    build a column-rename mapping that appends `suffix` to every name
    in `cols`, leaving the key columns 'gene' and 'pheno' unchanged
    '''
    untouched = ('gene', 'pheno')
    mapping = {}
    for name in cols:
        mapping[name] = name if name in untouched else name + suffix
    return mapping

In [None]:
# Collect every results.tsv.gz one directory level below the spliceai output folder.
# glob() returns matches in arbitrary order, so the list is sorted to make the
# row order of the concatenated/exported table reproducible between runs.
outfiles = sorted(glob('./work/association/sclrt_kernels_spliceai/all/*/results.tsv.gz'))

In [None]:
# Load each result table; a lone '.' in a field is parsed as a missing value.
results = [
    pd.read_csv(result_path, sep='\t', na_values='.')
    for result_path in outfiles
]

In [None]:
# Stack the per-file tables row-wise into a single DataFrame
# (each table keeps its own index labels, as ignore_index is left at its default).
results = pd.concat(results, axis=0)

In [None]:
# Columns holding 'lrtstat_*' and 'alteqnull_*' values are not exported.
dropcols = [c for c in results.columns if c.startswith(('lrtstat_', 'alteqnull_'))]

In [None]:
# Show which columns are about to be dropped.
dropcols

In [None]:
# Remove the columns collected in `dropcols`.
results = results.drop(columns=dropcols)

In [None]:
# Keep only the rows whose nCarrier value is at least 5.
results = results.loc[results['nCarrier'] >= 5]

In [None]:
# Names of the p-value columns (prefix 'pv_') as they are before any renaming.
pv_cols = [c for c in results.columns if c.startswith('pv_')]

In [None]:
# Append '_splice' to every column name except the 'gene' and 'pheno' keys.
results = results.rename(columns=rename_cols(results.columns, '_splice'))

In [None]:
# Work on a copy so `results` is only replaced once all columns are processed.
filled = results.copy()

pv_cols = [name for name in filled.columns if 'pv_' in name]
for col in pv_cols:
    # Score-test columns are the fallback source themselves, and columns
    # mentioning cLOF / mrgLOF are left untouched.
    if 'score' in col or 'cLOF' in col or 'mrgLOF' in col:
        continue

    # Column names are '_'-separated: the third field is used as the kernel,
    # the last field as the effect label.
    name_parts = col.split('_')
    kernel = name_parts[2]
    effect = name_parts[-1]
    score_col = 'pv_score_{}_{}'.format(kernel, effect)

    # Where this p-value is missing, take the matching score-test p-value.
    missing = filled[col].isna()
    filled.loc[missing, col] = filled.loc[missing, score_col]
    print(col)

results = filled
del filled

In [None]:
# For each phenotype, apply estimate_lambda to every p-value column.
lambda_val = (
    results
    .groupby(['pheno'])[pv_cols]
    .agg(estimate_lambda)
)

In [None]:
# Allow up to 100 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

In [None]:
# Show the 30 rows with the smallest 'pv_lrt_linwb_splice', with 'pheno' as the first column.
# 'pheno' is excluded from the remaining columns; previously it was selected
# twice and therefore appeared twice in the displayed table.
results.sort_values('pv_lrt_linwb_splice')[
    ['pheno'] + [c for c in results.columns if c != 'pheno']
].head(30)

In [None]:
# Show the 30 rows with the smallest 'pv_lrt_linw_splice', with 'pheno' as the first column.
# 'pheno' is excluded from the remaining columns; previously it was selected
# twice and therefore appeared twice in the displayed table.
results.sort_values('pv_lrt_linw_splice')[
    ['pheno'] + [c for c in results.columns if c != 'pheno']
].head(30)

In [None]:
# Display the per-phenotype lambda values (one row per 'pheno', one column per p-value column).
lambda_val

In [None]:
# Create the output directory if needed so the export does not fail with a
# missing-directory error after all the work above has been done.
os.makedirs('results/tables', exist_ok=True)

# Lambda table keeps its index (the phenotype); the results table is written
# without the index and compressed (pandas infers gzip from the '.gz' suffix).
lambda_val.to_csv('results/tables/lambdaval_splice.tsv', sep='\t')
results.to_csv('results/tables/results_splice.tsv.gz', sep='\t', index=False)