# Meta Analysis Notebook

## Steps

**Pre-processing**

- Set up directories for meta-analysis
- Concatenate and compress GWAS results

**Input Files**

`input/analysis.json` - for each study, this file lists the study arms (cohorts) and the GWAS pipeline result directories (summary statistics), one entry per outcome and analysis mode. The structure of the file is as follows:

```json
{
  "Study": {
    "cohorts": [     // define the study arms in the analysis
      "PD",
      "HC"
    ],
    "results": [
      {
        "path": "Study_2022-03-07T1903831",
        "outcome": "outcome_1",
        "mode": "lt"
      },
      {
        "path": "Study_2022-03-07T1903846",
        "outcome": "outcome_1",
        "mode": "cs"
      }
    ]
  }
}
```

1. Calculate lambdas from p-values
2. Run meta-analysis with METAL

In [1]:
import os
from pathlib import Path
import json

In [2]:
# Registry of every path used by the notebook, keyed by a short logical name.
files = {}

In [3]:
# Root of the analysis tree. An empty relative path, so every entry below is
# relative to the working directory the notebook is started from.
files['base'] = Path('')

# First-level directories under the root.
files.update({
  'base/results': files['base'].joinpath('results'),
  'base/METAL': files['base'].joinpath('METAL'),
})

# Swarm job files, plus the METAL input and output directories per mode.
files.update({
  'jobs': files['base'].joinpath('jobs'),
  'METAL/raw_summary_stats': files['base/METAL'].joinpath('raw_summary_stats'),
  'METAL/lt': files['base/METAL'].joinpath('meta_analysis/lt'),
  'METAL/cs': files['base/METAL'].joinpath('meta_analysis/cs'),
})

# Individual files: the lambda cache and the analysis definition.
files.update({
  'lambda.csv': files['METAL/raw_summary_stats'].joinpath('lambda.csv'),
  'analysis.json': files['base'].joinpath('input/analysis.json'),
})

In [4]:
# Create every directory the notebook writes into.
# `exist_ok=True` makes the cell safe to re-run and removes the
# check-then-create race of `if not exists: makedirs`.
for out_dir in (
    files['METAL/raw_summary_stats'] / 'lt',
    files['METAL/raw_summary_stats'] / 'cs',
    files['METAL/lt'],
    files['METAL/cs'],
    files['jobs'],  # swarm job files are written here by later cells
):
  os.makedirs(out_dir, exist_ok=True)

## Pre-Processing - concatenate results from GWAS

In [5]:
# Parse the analysis definition directly from the open file handle.
# The result maps each study name to its `cohorts` and `results` entries.
with open(files['analysis.json'], 'r') as fh:
  outcomes = json.load(fh)

In [13]:
cmds = []

## Filters applied to the `results` entries of analysis.json, keyed by mode.
## Set `selected_mode` to None to keep every entry.
criteria = {
  None: lambda x: True,
  'cs': lambda x: x['mode'] == 'cs'
}
selected_mode = 'cs'

preprocess_script = "/data/CARD/projects/longGWASnextflow/Scripts/preprocess.sh"

for study in outcomes.keys():
  sel_outcomes = list(filter(criteria[selected_mode], outcomes[study]['results']))
  if len(sel_outcomes) < 1:
    continue

  # The part of the key before the first '/' is what appears in the result
  # directory name. Computed once per study, under its own name, so the loop
  # variable `study` is not overwritten while it is still used as a key.
  study_name = study.split('/')[0]

  for cohort in outcomes[study]['cohorts']:
    c = f'EUR_{study_name}-{cohort}'
    for oc in sel_outcomes:
      suffix = 'gallop.gz' if oc['mode'] == 'lt' else 'linear.gz'
      out_path = files['METAL/raw_summary_stats'] / \
                 f"{oc['mode']}/{c}_allchr.{oc['outcome']}.{suffix}"

      # Outputs that already exist are not regenerated.
      if os.path.exists(out_path):
        continue

      # Shell line: preprocess.sh <result dir> | bgzip > <concatenated output>
      path = os.path.join(files['base/results'], oc['path'], c)
      cmd = [preprocess_script, path, "| bgzip >", out_path]
      cmds.append(' '.join( map(str, cmd)) )

In [4]:
# Write the preprocessing commands, one per line, to the swarm job file.
swarm_path = os.path.join(files['jobs'], 'preprocess.outcomes.swarm')
with open(swarm_path, 'w') as f:
  f.write('\n'.join(cmds))

NameError: name 'cmds' is not defined

#### Swarm job: pre-process and concatenate summary statistics

```bash
swarm \
  --module python/3.8,samtools \
  -g 10 -p 2 \
  --time 01:15:00 \
  -f jobs/preprocess.outcomes.swarm \
  --partition quick,norm \
  --logdir logs/swarm_logs
```

## Calculate Lambdas

In [4]:
from scipy.stats import chi2
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [6]:
def calc_lambda(pvals):
    """Return the genomic inflation factor (lambda GC) of a set of p-values.

    Each p-value is converted to the 1-df chi-square statistic that has that
    upper-tail probability. Lambda is the median of those statistics divided
    by the median of the 1-df chi-square distribution.

    `chi2.isf` (the inverse survival function) is used because a p-value is an
    upper-tail probability. `chi2.ppf(p)` would map small p-values to small
    statistics, so inflated results would be reported as deflated.

    Parameters
    ----------
    pvals : array-like of float
        P-values in [0, 1].

    Returns
    -------
    float
        The inflation factor. NaN if any p-value is NaN, because `np.median`
        propagates NaN.
    """
    return np.median(chi2.isf(pvals, df=1)) / chi2.ppf(0.5, df=1)

# Load previously computed lambdas, if any, so files that were already
# processed are skipped on re-runs.
cache_lambdas = None
if os.path.exists(files['lambda.csv']):
  cache_lambdas = pd.read_csv(files['lambda.csv'], index_col=0)

tmp_dfs = []
for m in ['cs', 'lt']:
  tmp_files = [x for x in os.listdir( files['METAL/raw_summary_stats'] / m )
               if x.endswith(('.gallop.gz', '.linear.gz'))]
  for f in tqdm(tmp_files):
    key = f'{m}/{f}'
    if cache_lambdas is not None and key in cache_lambdas.index:
      continue

    fn = files['METAL/raw_summary_stats'] / key
    try:
      tmp_df = pd.read_csv(fn, compression='gzip', sep='\t', engine='c')
    except pd.errors.EmptyDataError:
      # Empty file: record NaNs and move on. Without the `continue` the code
      # below would run on the previous file's frame (or raise NameError on
      # the first file) and append a second row for the same key.
      tmp_dfs.append(
          pd.DataFrame({'Slope': [np.nan], 'Intercept': [np.nan]}, index=[key]))
      continue

    # Drop variants with allele frequency below 0.05 or above 0.95.
    tmp_df = tmp_df[~(tmp_df.A1_FREQ < 0.05) & ~(tmp_df.A1_FREQ > 0.95)]
    if m == 'lt':
      # 'lt' files carry two p-value columns: Ps -> Slope, Pi -> Intercept.
      slope, intercept = calc_lambda(tmp_df.Ps), calc_lambda(tmp_df.Pi)
    else:
      # Other files carry a single p-value column, recorded as Intercept.
      slope, intercept = np.nan, calc_lambda(tmp_df.P)
    tmp_dfs.append(
        pd.DataFrame({'Slope': [slope], 'Intercept': [intercept]}, index=[key]))

# Only rewrite the cache when something new was computed.
if len(tmp_dfs) > 0:
  tmp_dfs = pd.concat(tmp_dfs)
  if cache_lambdas is not None:
    tmp_dfs = pd.concat([cache_lambdas, tmp_dfs])
  tmp_dfs.to_csv(files['lambda.csv'])

100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 103138.62it/s]
100%|██████████████████████████████████████████| 27/27 [00:00<00:00, 291121.36it/s]


## Run METAL

In [5]:
def conduct_METAL_job(script, outcome, interest, indir, 
                      out, correction, cohorts, suffix=None):
  """Build the command-line pieces for one run of the METAL driver script.

  Parameters
  ----------
  script : path of the Python script to invoke.
  outcome : value passed as `--outcome`.
  interest : value passed as `--interest` (callers pass an entry of `modes`).
  indir : value passed as `--indir`.
  out : value passed as `--out`.
  correction : value passed as `-c` (callers pass the lambda CSV path).
  cohorts : iterable of cohort identifiers; each becomes one `-s` argument.
  suffix : optional value passed as `--suffix`; omitted when None.

  Returns
  -------
  list of str
      Command fragments, to be joined with spaces by the caller.
  """
  cmd = [f'python {script}']
  cmd.append(f'--outcome {outcome}')
  # Use the parameter: the previous code read a global named `effect`, which
  # only worked when the caller's loop variable happened to have that name.
  cmd.append(f'--interest {interest}')
  cmd.append(f'--indir {indir}')
  cmd.append(f'--out {out}')
  cmd.append(f'-c {correction}')
  
  if suffix is not None:
    cmd.append(f'--suffix {suffix}')
  
  cmd.extend(f'-s {c}' for c in cohorts)
  
  return cmd

In [6]:
# will be mapped to concatenated summary stats
# Each entry is '<mode>/EUR_<study>-<cohort>'.
cohorts = ['lt/EUR_PPMI-SWEDD',
           'lt/EUR_PPMI-Genetic-Cohort-PD',
           'lt/EUR_PPMI-Genetic-Cohort-Unaffected',
           'lt/EUR_PPMI-Healthy-Control',
           'lt/EUR_PPMI-PD',
           'lt/EUR_PPMI-Prodromal',
           'lt/EUR_ADNI-CN',
           'lt/EUR_ADNI-MCI',
           'lt/EUR_ADNI-Dementia',
           'lt/EUR_PDBP-Healthy-Control',
           'lt/EUR_PDBP-PD',
           'cs/EUR_BioFIND-PD',
           'cs/EUR_BioFIND-Healthy-Control']

# Groups of cohorts by disease state. Membership is tested with `in`, so every
# name here must match an entry of `cohorts` exactly.
cohort_states = {
  'HC': {
    'lt/EUR_PPMI-Healthy-Control',
    'lt/EUR_PPMI-Genetic-Cohort-Unaffected',
    'lt/EUR_ADNI-CN',
    # Was 'lt/EUR_PDBP-HC', which matches nothing in `cohorts` and silently
    # dropped this cohort from the HC selection.
    'lt/EUR_PDBP-Healthy-Control',
    'cs/EUR_BioFIND-Healthy-Control'
  },
  'PD': {
    'lt/EUR_PPMI-PD',
    'lt/EUR_PDBP-PD',
    'cs/EUR_BioFIND-PD'
  },
  'PD.all': {
    'lt/EUR_PPMI-PD',
    'lt/EUR_PPMI-Genetic-Cohort-PD',
    'lt/EUR_PDBP-PD',
    'cs/EUR_BioFIND-PD'
  }
}

# Effects analysed per mode.
modes = {
  'lt': ['Intercept', 'Slope'],
  'cs': ['Intercept']
}

# Script that conduct_METAL_job invokes via `python <script>`.
# NOTE(review): absolute, site-specific path; consider deriving it from a configurable root.
files['conductMETAL.py'] = '/data/CARD/projects/longGWASnextflow/Scripts/conductMETAL.py'


def _mode_is(mode):
  """Predicate factory: the cohort name starts with '<mode>/'."""
  return lambda x: x.startswith(f'{mode}/')


def _state_is(state):
  """Predicate factory: the cohort name is a member of cohort_states[state]."""
  return lambda x: x in cohort_states[state]


def _study_is(study):
  """Predicate factory: the second '_'-separated field starts with `study`."""
  return lambda x: x.split('_')[1].startswith(study)


# Cohort selectors keyed by mode, disease state or study; None selects everything.
filters = {
  **{mode: _mode_is(mode) for mode in ('lt', 'cs')},
  **{state: _state_is(state) for state in ('HC', 'PD.all', 'PD')},
  **{study: _study_is(study) for study in ('PDBP', 'PPMI', 'BioFIND', 'ADNI')},
  None: lambda x: True,
}

In [7]:
### Analysis General
# Outcomes to meta-analyse across every cohort in `cohorts`.
outcomes = ['log_CSF_Ab', 'log_CSF_pTau', 'log_CSF_tTau']

m = 'lt'
cmds = []

# One job line per (effect, outcome) pair; filters[None] keeps all cohorts.
for effect in modes[m]:
  for oc in outcomes:
    cmd = conduct_METAL_job(
      script=files['conductMETAL.py'],
      outcome=oc,
      interest=effect,
      indir=files['base/METAL'],
      out=files[f'METAL/{m}'],
      correction=files['lambda.csv'],
      cohorts=filter(filters[None], cohorts),
    )
    cmds.append(' '.join(cmd))

# Write the job lines, one per line, to the swarm file for this mode.
fn = files['jobs'] / f'METAL.CSF_outcomes.{m}.swarm'
with fn.open('w') as f:
  f.write('\n'.join(cmds))

In [21]:
### Meta Analysis - By State
outcomes = ['log_CSF_Ab', 'log_CSF_pTau', 'log_CSF_tTau']

m = 'lt'
cmds = []

# One job line per (effect, outcome, state). The state key selects the cohorts
# and is also passed on as the --suffix value. Use ['PD'] instead for the
# narrower PD set.
for effect in modes[m]:
  for oc in outcomes:
    for sfx in ['HC', 'PD.all']:
      cmd = conduct_METAL_job(
        script=files['conductMETAL.py'],
        outcome=oc,
        interest=effect,
        indir=files['base/METAL'],
        out=files[f'METAL/{m}'],
        correction=files['lambda.csv'],
        cohorts=filter(filters[sfx], cohorts),
        suffix=sfx,
      )
      cmds.append(' '.join(cmd))

# Write the job lines, one per line, to the by-state swarm file.
fn = files['jobs'] / f'METAL.CSF_outcomes_states.{m}.swarm'
with fn.open('w') as f:
  f.write('\n'.join(cmds))

In [8]:
# Print the swarm submission command for the most recently written job file (`fn`).
swarm_args = [
  'swarm',
  '  --module python/3.8,metal,samtools',
  '  -g 20 -p 2',
  '  --time 02:30:00',
  f'  -f {fn}',
  '  --partition quick,norm',
  '  --logdir logs/swarm_logs',
]
# Each argument line ends with a shell line continuation; the text ends in a newline.
print(' \\\n'.join(swarm_args) + '\n')

swarm \
  --module python/3.8,metal,samtools \
  -g 20 -p 2 \
  --time 02:30:00 \
  -f /data/CARD/projects/longGWASnextflow/BiomarkerGWAS-2/jobs/METAL.CSF_outcomes.lt.swarm \
  --partition quick,norm \
  --logdir logs/swarm_logs



### METAL for disease status

In [7]:
# Directory holding the METAL outputs, and the lambda cache computed from them.
files['METAL/meta_analysis'] = files['base/METAL'].joinpath('meta_analysis')
files['meta_analysis/lambda.csv'] = files['METAL/meta_analysis'].joinpath('lambda.csv')

In [8]:
def calc_lambda(pvals):
    """Return the genomic inflation factor (lambda GC) of a set of p-values.

    Each p-value is converted to the 1-df chi-square statistic that has that
    upper-tail probability. Lambda is the median of those statistics divided
    by the median of the 1-df chi-square distribution.

    `chi2.isf` (the inverse survival function) is used because a p-value is an
    upper-tail probability. `chi2.ppf(p)` would map small p-values to small
    statistics, so inflated results would be reported as deflated.

    Parameters
    ----------
    pvals : array-like of float
        P-values in [0, 1].

    Returns
    -------
    float
        The inflation factor. NaN if any p-value is NaN, because `np.median`
        propagates NaN.
    """
    return np.median(chi2.isf(pvals, df=1)) / chi2.ppf(0.5, df=1)

# Load previously computed lambdas, if any, so tables that were already
# processed are skipped on re-runs.
cache_lambdas = None
if os.path.exists(files['meta_analysis/lambda.csv']):
  cache_lambdas = pd.read_csv(files['meta_analysis/lambda.csv'], index_col=0)
  
tmp_dfs = []
for m in ['cs', 'lt']:
  for oc in os.listdir( files['METAL/meta_analysis'] / m ):
    oc_dir = files['METAL/meta_analysis'] / m / oc
    # Ignore stray files: only sub-directories can be listed for .tbl tables.
    if not os.path.isdir(oc_dir):
      continue
    tmp_files = [x for x in os.listdir(oc_dir) if x.endswith('.tbl')]
    for f in tqdm(tmp_files):
      key = f'{m}/{oc}/{f}'
      if cache_lambdas is not None and key in cache_lambdas.index:
        continue
      fn = files['METAL/meta_analysis'] / key
      try:
        tmp_df = pd.read_csv(fn, sep='\t', engine='c')
      except pd.errors.EmptyDataError:
        # Empty table: record NaN and move on. Without the `continue` the code
        # below would run on the previous table's frame (or raise NameError on
        # the first table) and append a second row for the same key.
        tmp_dfs.append(pd.DataFrame({'Lambda': [np.nan]}, index=[key]))
        continue

      # Drop variants with allele frequency below 0.05 or above 0.95.
      tmp_df = tmp_df[~(tmp_df.Freq1 < 0.05) & ~(tmp_df.Freq1 > 0.95)]
      tmp_dfs.append(
          pd.DataFrame({'Lambda': [calc_lambda(tmp_df['P-value'])]}, index=[key]))

# pd.concat raises on an empty list, so only rewrite the cache when something
# new was computed.
if len(tmp_dfs) > 0:
  tmp_dfs = pd.concat(tmp_dfs)
  if cache_lambdas is not None:
    tmp_dfs = pd.concat([cache_lambdas, tmp_dfs])
  tmp_dfs.to_csv(files['meta_analysis/lambda.csv'])

100%|██████████████████████████████████████████████| 14/14 [04:55<00:00, 21.12s/it]
100%|██████████████████████████████████████████████| 14/14 [05:24<00:00, 23.17s/it]
100%|██████████████████████████████████████████████| 14/14 [04:57<00:00, 21.23s/it]


In [13]:
# will be mapped to concatenated summary stats
# Inputs grouped by the lambda file that covers them: raw cohort summary stats
# under 'lambda.csv', earlier METAL tables under 'meta_analysis/lambda.csv'.
cohorts = {
  'lambda.csv': ['lt/EUR_ADNI-CN', 'lt/EUR_ADNI-MCI', 'lt/EUR_ADNI-Dementia'],
  'meta_analysis/lambda.csv': [],
}

# For every outcome/state pair add the Pi table, then the Ps table.
for oc in ['log_CSF_Ab', 'log_CSF_pTau', 'log_CSF_tTau']:
  for state in ['HC', 'PD.all']:
    cohorts['meta_analysis/lambda.csv'] += [
      f'lt/{oc}/{prefix}_{state}1.tbl' for prefix in ('Pi', 'Ps')
    ]

# Only the 'lt' mode is used from here on.
modes = dict(lt=['Intercept', 'Slope'])

# Script that conduct_METAL_job invokes via `python <script>`.
# NOTE(review): repeats the assignment made in the earlier "Run METAL" configuration cell with the same value.
files['conductMETAL.py'] = '/data/CARD/projects/longGWASnextflow/Scripts/conductMETAL.py'


def _mode_is(mode):
  """Predicate factory: the cohort name starts with '<mode>/'."""
  return lambda x: x.startswith(f'{mode}/')


def _state_is(state):
  """Predicate factory: the cohort name is a member of cohort_states[state]."""
  return lambda x: x in cohort_states[state]


def _study_is(study):
  """Predicate factory: the second '_'-separated field starts with `study`."""
  return lambda x: x.split('_')[1].startswith(study)


# Cohort selectors keyed by mode, disease state or study; None selects everything.
filters = {
  **{mode: _mode_is(mode) for mode in ('lt', 'cs')},
  **{state: _state_is(state) for state in ('HC', 'PD.all', 'PD')},
  **{study: _study_is(study) for study in ('PDBP', 'PPMI', 'BioFIND', 'ADNI')},
  None: lambda x: True,
}

In [19]:
# Display the current value of `cohorts` for inspection.
cohorts

{'lambda.csv': ['lt/EUR_ADNI-CN', 'lt/EUR_ADNI-MCI', 'lt/EUR_ADNI-Dementia'],
 'meta_analysis/lambda.csv': ['lt/log_CSF_Ab/Pi_HC1.tbl',
  'lt/log_CSF_Ab/Ps_HC1.tbl',
  'lt/log_CSF_Ab/Pi_PD.all1.tbl',
  'lt/log_CSF_Ab/Ps_PD.all1.tbl',
  'lt/log_CSF_pTau/Pi_HC1.tbl',
  'lt/log_CSF_pTau/Ps_HC1.tbl',
  'lt/log_CSF_pTau/Pi_PD.all1.tbl',
  'lt/log_CSF_pTau/Ps_PD.all1.tbl',
  'lt/log_CSF_tTau/Pi_HC1.tbl',
  'lt/log_CSF_tTau/Ps_HC1.tbl',
  'lt/log_CSF_tTau/Pi_PD.all1.tbl',
  'lt/log_CSF_tTau/Ps_PD.all1.tbl']}

In [18]:
# NOTE(review): when `cohorts` is the dict built in the preceding configuration
# cell, filter() iterates its keys ('lambda.csv', 'meta_analysis/lambda.csv').
# Neither is in cohort_states['PD.all'], so the result is empty.
list(filter(filters['PD.all'], cohorts))

[]

In [14]:
### Meta Analysis - By State
outcomes = ['log_CSF_Ab', 'log_CSF_pTau', 'log_CSF_tTau']

m = 'lt'
cmds = []

# One job line per (effect, outcome, state). The state key selects the cohorts
# and is also passed on as the --suffix value.
# NOTE(review): if `cohorts` is still the dict from the preceding configuration
# cell, filter() walks its keys and selects nothing, so no `-s` arguments are
# emitted. Confirm which inputs are intended here.
for effect in modes[m]:
  for oc in outcomes:
    for sfx in ['HC', 'PD.all']:
      cmd = conduct_METAL_job(
        script=files['conductMETAL.py'],
        outcome=oc,
        interest=effect,
        indir=files['base/METAL'],
        out=files[f'METAL/{m}'],
        correction=files['lambda.csv'],
        cohorts=filter(filters[sfx], cohorts),
        suffix=sfx,
      )
      cmds.append(' '.join(cmd))

# NOTE(review): `sfx` holds the last loop value here, so the file name carries
# only that state although the file contains jobs for every state. Confirm
# this is intended.
fn = files['jobs'] / f'METAL.CSF_outcomes_{sfx}.{m}.swarm'
with fn.open('w') as f:
  f.write('\n'.join(cmds))

In [None]:
# Run METAL on the generated script and capture its stdout. The leading `!`
# makes IPython run the line in a shell; without it the cell is parsed as
# Python and fails with a SyntaxError.
# NOTE(review): both paths are relative. Presumably this must run from the
# directory that contains log_CSF_tTau/ with `metal` on PATH; confirm.
!metal log_CSF_tTau/Ps_status.all.metal > log_CSF_tTau/Ps_status.all.metal.stdout