# Risk loci heterogeneity analysis

The dataframe key `AD.risk_loci` contains the risk loci from Bellenguez et al. (columns: rsID, POS).

Outcomes
- `log CSF Aβ`
- `log CSF p-tau`
- `log CSF t-tau`

In [1]:
import pandas as pd
import numpy as np
import tabix

In [2]:
# Registry of every table used in this notebook, keyed by '<group>.<content>'.
# Start with the two risk-loci tables read from disk.
dfs = {
  'AD.risk_loci': pd.read_csv('METAL/RiskLoci/AD_loci.tsv',
                             sep='\t'),
  'PD.risk_loci': pd.read_csv('archive/META5_rsID_hg38.csv')
}

In [3]:
# For each CSF outcome register three sources under '<study>.<outcome>':
#   - ADNI-Dementia: a tabix handle on the bgzipped summary statistics
#     (queried by region later, so it is NOT a DataFrame)
#   - HC and PD: METAL meta-analysis tables loaded fully into memory
for oc in ['log_CSF_Ab', 'log_CSF_pTau', 'log_CSF_tTau']:
  dfs[f'ADNI-Dementia.{oc}'] = tabix.open(f'METAL/raw_summary_stats/lt/EUR_ADNI-Dementia_allchr.{oc}.gallop.gz')
  dfs[f'HC.{oc}'] = pd.read_csv(f'METAL/meta_analysis/lt/{oc}/Pi_HC1.tbl', sep='\t')
  dfs[f'PD.{oc}'] = pd.read_csv(f'METAL/meta_analysis/lt/{oc}/Pi_PD.all1.tbl', sep='\t')

In [None]:
# NOTE(review): none of the cells above create a key named exactly 'P', so this
# lookup presumably raises KeyError -- looks like an unfinished inspection cell.
dfs['P']

In [None]:
# Rebuild the PD risk-loci table from 'PD_omic_df.csv':
# keep genome-wide significant rows (p_GWAS < 5e-8), one row per top SNP.
tmp_df = pd.read_csv('PD_omic_df.csv')
tmp_df = tmp_df[tmp_df.p_GWAS < 5e-8][['topSNP_chr', 'topSNP_bp', 'A1', 'A2', 'topRSID']]\
            .drop_duplicates().reset_index(drop=True).copy()
# POS is formatted 'chr<chrom>:<bp>'; the rest of the notebook joins on this string.
tmp_df['POS'] = 'chr' + tmp_df.topSNP_chr.astype(str) + ':' + tmp_df.topSNP_bp.astype(str)
# Allele is 'A1/A2'.
tmp_df['Allele'] = tmp_df.A1 + '/' + tmp_df.A2
tmp_df = tmp_df.rename(columns={
  'topRSID': 'rsID'
})[['rsID', 'POS', 'Allele']]
# Overwrites the 'PD.risk_loci' entry loaded from the archive CSV.
dfs['PD.risk_loci'] = tmp_df.copy()

In [None]:
# Re-read the raw PD omics table into the scratch variable (unfiltered).
tmp_df = pd.read_csv('PD_omic_df.csv')

In [23]:
# Map rsIDs to METAL marker names using the headerless two-column annotation file.
tmp_df = pd.read_csv('METAL/FUMA/MarkerNames.rsID_annotated-2.tsv',
                    sep='\t',
                    names=['MarkerName', 'rsID'])
# NOTE(review): 'PD.risk_loci.orig' is not created by any cell visible in this
# notebook -- presumably a saved copy of the original 'PD.risk_loci'; confirm.
dfs['PD.risk_loci'] = dfs['PD.risk_loci.orig'].merge(tmp_df, on='rsID', how='inner')
# First two ':'-separated fields of MarkerName become POS.
dfs['PD.risk_loci']['POS'] = dfs['PD.risk_loci'].MarkerName.str.split(':').apply(lambda x: ':'.join(x[:2]))
dfs['PD.risk_loci'] = dfs['PD.risk_loci'][['rsID', 'MarkerName', 'POS']].drop_duplicates()
# Remaining MarkerName fields (from the third on) are joined with '/' as Allele.
dfs['PD.risk_loci']['Allele'] = dfs['PD.risk_loci'].MarkerName.str.split(':').apply(lambda x: '/'.join(x[2:]))

In [363]:
# Collect, per outcome, the risk-locus rows from all three studies plus the
# pairwise heterogeneity (HetISq) columns; one DataFrame per outcome goes in `res`.
res = []
# Column names applied to the rows returned by the ADNI-Dementia tabix queries.
header = [
  'CROM',
  'POS',
  'ID',
  'REF',
  'ALT',
  'A1',
  'A1_FREQ',
  'MISS_FREQ',
  'OBS_CT',
  'TEST',
  'BETAs',
  'SEs',
  'T_STAT',
  'Ps',
  'OBS_CT_REP',
  'BETAi',
  'SEi',
  'Pi',
  'COV'
]

# Which risk-loci table drives the lookup.
risk_key = 'PD.risk_loci'
#for idx, row in dfs['AD.risk_loci'].iterrows():
for oc in ['log_CSF_Ab', 'log_CSF_pTau', 'log_CSF_tTau']:
#for oc in ['log_CSF_tTau']:
  r = []
  # --- ADNI-Dementia: one tabix region query per risk locus ---
  for idx, row in dfs[risk_key].iterrows():
    chrom, pos = row.POS.split(':')
    pos = int(pos)

    # Query the single-base window (pos-1, pos) and keep only the first hit.
    query = dfs[f'ADNI-Dementia.{oc}'].query(chrom, pos-1, pos)
    try:
      r.append(
        next(query))
    except StopIteration:
      # Locus absent from the file: keep an all-NA placeholder row.
      r.append([pd.NA] * len(header) )

  df = pd.DataFrame(r, columns=header)
  df['study'] = 'ADNI-Dementia'
  df['outcome'] = oc

  # Rename to the shared schema; the 'i'-suffixed statistics (BETAi/SEi/Pi) are used.
  # NOTE(review): 'A1' is renamed to 'Allele2' and 'REF' to 'A1' while 'A1_FREQ'
  # is kept as is -- confirm the frequency refers to the allele now called 'A1'.
  r = [df[['ID', 'A1', 'REF', 'A1_FREQ', 'BETAi', 'SEi', 'Pi', 'OBS_CT', 'outcome', 'study']].rename(columns={
    'ID': 'MarkerName',
    'A1': 'Allele2',
    'REF': 'A1',
    'A1_FREQ': 'A1_FREQ',
    'BETAi': 'BETA',
    'SEi': 'SE',
    'Pi': 'P'
  }).copy()]


  # --- HC and PD: filter the METAL tables to markers located at a risk locus ---
  for c in ['HC', 'PD']:
    key = f'{c}.{oc}'
    # Match on the first two ':'-separated MarkerName fields against POS.
    idx = dfs[key].MarkerName.str.split(':').apply(lambda x: ':'.join(x[:2])).isin(dfs[risk_key].POS)
    # Note: these two assignments add columns to the cached table in `dfs`.
    dfs[key]['study'] = c
    dfs[key]['outcome'] = oc
    tmp_df = dfs[key][idx].copy()
    tmp_df = tmp_df.rename(columns=
                          {'Effect': 'BETA',
                           'StdErr': 'SE',
                           'P-value': 'P',
                           'OBS_CT_total': 'OBS_CT'})
    r.append( tmp_df[['MarkerName', 'Allele1', 'Allele2', 'Freq1', 'BETA', 'SE', 'P', 'OBS_CT', 'outcome', 'study']].rename(columns={
      'Allele1': 'A1',
      'Freq1': 'A1_FREQ'}).copy() )
    
  df = pd.concat(r).reset_index(drop=True) 
  # --- Pairwise comparison tables: attach HetISq_<comp> per marker ---
  for comp in ['HCAD', 'HCPD', 'PDAD']:
    key = f'{comp}.Pi_{oc}'
    # Load each comparison table once and cache it in `dfs`.
    if key not in dfs:
      dfs[key] = pd.read_csv(f'METAL/meta_analysis/lt/{oc}/Pi_status.{comp}1.tbl', sep='\t')
    idx = dfs[key].MarkerName.str.split(':').apply(lambda x: ':'.join(x[:2])).isin(dfs[risk_key].POS)
    # No `on=` given: the merge keys are the shared column(s), here 'MarkerName'.
    df = df.merge(dfs[key][idx][['MarkerName', 'HetISq']].rename(columns={'HetISq': f'HetISq_{comp}'}), how='outer')

  # Overall heterogeneity = largest of the three pairwise values.
  df['HetISq'] = df[['HetISq_HCAD', 'HetISq_HCPD', 'HetISq_PDAD']].max(axis=1)
  res.append(df.copy())

In [376]:
# Stack the per-outcome frames collected in `res` into one table.
df = pd.concat(res).reset_index(drop=True)

In [378]:
# Snapshot / restore: uncomment the first line once to save `df`, then use the
# second line to reset `df` to that snapshot before re-running the steps below.
#df_copy = df.copy()
df = df_copy.copy() # run if redoing analysis

In [7]:
# Boolean mask of rows whose A1 frequency exceeds 0.5 (A1 is the major allele).
idx = pd.to_numeric(df['A1_FREQ']) > 0.5

In [318]:
# For rows selected by `idx` (A1_FREQ > 0.5, computed in the previous cell):
# swap A1/Allele2, negate BETA, and store 1 - A1_FREQ as MAF.
tmp_alleles = df.loc[idx, 'A1'].copy()
# flip A1 allele
df.loc[idx, 'A1'] = df.loc[idx, 'Allele2']
df.loc[idx, 'Allele2'] = tmp_alleles
# flip BETA
df.loc[idx, 'BETA'] = pd.to_numeric(df.loc[idx, 'BETA']) * -1.
# create and update MAF
df['MAF'] = df['A1_FREQ']
df.loc[idx, 'MAF'] = 1. - pd.to_numeric(df.loc[idx, 'MAF'])
# Normalize allele codes to upper case.
df['A1'] = df.A1.str.upper()
df['Allele2'] = df.Allele2.str.upper()

KeyError: "None of [Index(['PD', 'ADNI-Dementia', 'HC'], dtype='object')] are in the [index]"

In [380]:
# flip alleles
# Cast the non-missing BETA values to float first.
idx = df[~df.BETA.isna()].index
df.loc[idx, 'BETA'] = df.loc[idx, 'BETA'].astype(float)

# Per marker: if the studies disagree on A1, re-express the ADNI-Dementia rows
# relative to the A1 used by HC/PD (swap alleles, negate BETA).
for idx, grp in df.groupby('MarkerName'):
  if grp.A1.nunique() > 1:
    # HC and PD must agree with each other, otherwise there is no reference allele.
    assert(grp[grp.study.isin(['HC', 'PD'])].A1.nunique() == 1)
    test_allele = grp[grp.study.isin(['HC', 'PD'])].A1.unique()[0]
    for rid, row in grp[grp.study == 'ADNI-Dementia'].iterrows():
      if row.A1 != test_allele:
        # NOTE(review): A1 is set to test_allele without checking that it equals
        # the row's current Allele2 -- confirm the two alleles really are swapped.
        df.loc[row.name, 'Allele2'] = row.A1
        df.loc[row.name, 'A1'] = test_allele
        df.loc[row.name, 'BETA'] = row.BETA * -1.
        

In [362]:
# Export the harmonized per-study risk-locus table with the heterogeneity columns.
# TODO: add back in `MAF` (it was dropped from this list; the leftover empty
# slot `'Allele2', ,` was a syntax error that prevented the cell from running).
export_columns = [
  'MarkerName', 'A1', 'Allele2',
  'BETA', 'SE', 'P', 'OBS_CT', 'outcome', 'study',
  'HetISq_HCAD', 'HetISq_HCPD', 'HetISq_PDAD', 'HetISq'
]
df[export_columns].to_csv('METAL/RiskLoci/AD_risk_loci_het-new.tbl', sep='\t', index=False)

## Code for UpSet Plot

In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import upsetplot
from matplotlib import cm
from upsetplot import plot, from_memberships
from matplotlib import colors
from matplotlib.tight_layout import get_renderer
import matplotlib

from scripts.MetaStats import MetalBrowser,MetalStats
from scripts.SumStats import SumStats
from scripts.ForestPlot import *
from scripts.Filters import *

from pathlib import Path
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, output_file, save
from itertools import combinations

output_notebook()

In [276]:
def plot_matrix(self, ax, scaler, cmap):
  """Plot the matrix of intersection indicators onto ax, coloring every dot.

  Replacement for ``upsetplot.UpSet.plot_matrix`` (it is assigned onto the
  class further down in this notebook). Besides the stock UpSet attributes
  it reads three attributes that the calling cells attach to the instance:

  * ``self.mat_color``  -- indexed with ``.loc[<category name>]`` then
    ``.iloc[i]``; each value is converted with ``float`` and used as the dot
    color value ``c`` (mapped through ``cmap`` with vmin=0, vmax=1).
  * ``self.inclusion``  -- array that is flattened to one entry per dot;
    it sets the dot line width (``inclusion * 2 + 1``), the edge color
    (black where the entry equals 1, gray otherwise) and which dots are
    joined by vertical lines.
  * ``self.labels``     -- x tick labels.

  Categories whose name is blank (``name.strip() == ''``) are treated as
  padding: they are not counted in ``n_cats`` and get no tick label.

  Parameters
  ----------
  ax : matplotlib Axes to draw on (passed through ``self._reorient``).
  scaler : unused in this function body; accepted so the signature matches
      the call made from ``plot``.
  cmap : colormap forwarded to ``ax.scatter``.
  """
  ax = self._reorient(ax)
  data = self.intersections
  n_cats = data.index.nlevels
  n_empty = len( list(filter(lambda x: x.strip() == '', data.index.names)) )
  n_cats = n_cats - n_empty
  label_cats = list( filter(lambda x: x.strip() != '', data.index.names) )

  inclusion = data.index.to_frame().values
  mat_color = []

  # Prepare styling: one style dict per (intersection, category) dot.
  styles = [
    [
        self.subset_styles[i]
        #if inclusion[i, j]
        #else {"facecolor": self._other_dots_color, "linewidth": 0}
        for j in range(n_cats)
    ]
    for i in range(len(data))
  ]

  # Collect one color value per dot, in the same (intersection-major) order.
  for i in range(len(data)):
    for j in range(n_cats):
      tmp_name = data.index.names[j]
      if tmp_name.strip() == '':
        continue
      if i >= len(self.mat_color.loc[tmp_name]):
        # No value for this dot; None is passed on to scatter as-is.
        mat_color.append( None )
        continue
      mat_color.append( float(self.mat_color.loc[tmp_name].iloc[i]) )

  styles = sum(styles, [])  # flatten nested list
  style_columns = {"facecolor": "facecolors",
                   "edgecolor": "edgecolors",
                   "linewidth": "linewidths",
                   "linestyle": "linestyles",
                   "hatch": "hatch"}
  styles = pd.DataFrame(styles).reindex(columns=style_columns.keys())

  styles["linewidth"] = self.inclusion.flatten() * 2
  styles["linewidth"] = styles["linewidth"] + 1
  # Assign the filled column back instead of `styles[col].fillna(..., inplace=True)`:
  # the chained in-place form modifies a temporary (and is a no-op under pandas
  # copy-on-write), which would let NaN styles reach `scatter`.
  styles["linewidth"] = styles["linewidth"].fillna(0)
  #styles["facecolor"].fillna(self._facecolor, inplace=True)
  styles["facecolor"] = self._facecolor
  styles["c"] = mat_color
  # Edge color comes only from the inclusion flags (the former fill of
  # "edgecolor" from "facecolor" was overwritten by this line and is dropped).
  styles["edgecolor"] = ['black' if x == 1 else 'gray' for x in self.inclusion.flatten()]
  styles["linestyle"] = styles["linestyle"].fillna("solid")
  del styles["hatch"]  # not supported in matrix (currently)

  x = np.repeat(np.arange(len(data)), n_cats)
  y = np.tile(np.arange(n_cats), len(data))
  # Plot dots
  if self._element_size is not None:
    s = (self._element_size * .35) ** 2
  else:
    # TODO: make s relative to colw
    s = 200

  ax.scatter(*self._swapaxes(x, y), s=s, zorder=10, cmap=cmap,
             vmin=0., vmax=1.,
             **styles.rename(columns=style_columns))

  ax.set_ylim( ax.get_ylim()[0], n_cats)
  # Plot lines joining the lowest and highest included dot of each intersection
  if self._with_lines:
    idx = np.flatnonzero(self.inclusion)
    line_data = (pd.Series(y[idx], index=x[idx])
                 .groupby(level=0)
                 .aggregate(['min', 'max']))
    # Named `line_colors` so the module-level `colors` import is not shadowed.
    line_colors = pd.Series([
      style.get("edgecolor", style.get("facecolor", self._facecolor))
      for style in self.subset_styles],
      name="color")
    line_data = line_data.join(line_colors)
    ax.vlines(line_data.index.values,
              line_data['min'], line_data['max'],
              lw=2, colors=line_data["color"],
              zorder=5)

  # Ticks and axes
  tick_axis = ax.yaxis
  tick_axis.set_ticks(np.arange(n_cats))
  tick_axis.set_ticklabels(label_cats,
                           rotation=0 if self._horizontal else -90)
  ax.xaxis.set_visible(True)
  ax.tick_params(axis='both', which='both', length=0)

  # One tick per intersection plus one extra slot; `self.labels` must match.
  ax.xaxis.set_ticks( ticks=np.arange(len(data) + 1),
                      labels=self.labels)
                      #labels=[f'a-{i}' for i in range(n_cats -1)])


  ax.set_xticklabels( self.labels, rotation=60, ha='right' )
  if not self._horizontal:
    ax.yaxis.set_ticks_position('top')
  ax.set_frame_on(False)
  #ax.set_xlim(-.5, x[-1] + .5, auto=False)
  ax.grid(False)
  
  
def plot_intersections(self, ax):
  """Plot bars indicating intersection size

  Replacement for ``upsetplot.UpSet.plot_intersections``: draws the bars via
  ``self._plot_bars`` with the title 'N Subjects', applies each entry of
  ``self.subset_styles`` to its bar, and adds a legend when
  ``self.subset_legend`` is non-empty.

  Parameters
  ----------
  ax : matplotlib Axes to draw the bars on.
  """
  rects = self._plot_bars(ax, self.intersections,
                          title='N Subjects',
                          colors=self._facecolor)
  for style, rect in zip(self.subset_styles, rects):
    style = style.copy()
    # Bars without an explicit edge color use their face color.
    style.setdefault("edgecolor",
                     style.get("facecolor", self._facecolor))
    for attr, val in style.items():
      getattr(rect, "set_" + attr)(val)

  if self.subset_legend:
    # `patches` is not imported by this notebook (it is a module-level import
    # inside upsetplot), so import it here to avoid a NameError.
    from matplotlib import patches

    styles, labels = zip(*self.subset_legend)
    styles = [patches.Patch(**patch_style) for patch_style in styles]
    ax.legend(styles, labels)
    
    
def make_grid(self, intersection, fig=None):
      """Get a SubplotSpec for each Axes, accounting for label text width

      Replacement for ``upsetplot.UpSet.make_grid``. Compared with a plain
      grid it reserves two extra columns (``n_inters = len(data) + 2``) and,
      in the horizontal layout, exposes the last column as ``'cbar'``.

      Parameters
      ----------
      intersection : bool-like. In the horizontal layout, when falsy the
          'matrix' and 'shading' specs start at row 0 instead of covering
          only the last ``n_cats`` rows.
      fig : figure to measure; defaults to ``plt.gcf()``.

      Returns
      -------
      dict mapping 'matrix', 'shading', 'totals', 'gs' (the GridSpec itself),
      one key per subset plot id and -- horizontal layout only -- 'cbar'.
      """
      data = self.intersections
      # Categories with a blank name are padding and are not given rows.
      n_cats = data.index.nlevels
      n_empty = len( list(filter(lambda x: x.strip() == '', data.index.names)) )
      n_cats = n_cats - n_empty
      
      
      n_inters = len(data) + 2

      if fig is None:
          fig = plt.gcf()

      # Determine text size to determine figure size / spacing
      r = get_renderer(fig)
      text_kw = {"size": matplotlib.rcParams['xtick.labelsize']}
      # adding "x" ensures a margin
      t = fig.text(0, 0, '\n'.join(str(label) + "x"
                                   for label in self.totals.index.values),
                   **text_kw)
      textw = t.get_window_extent(renderer=r).width
      t.remove()

      figw = self._reorient(fig.get_window_extent(renderer=r)).width

      sizes = np.asarray([p['elements'] for p in self._subset_plots])
      fig = self._reorient(fig)

      non_text_nelems = len(self.intersections) + self._totals_plot_elements
      if self._element_size is None:
          # Fit the columns into the existing figure width.
          colw = (figw - textw) / non_text_nelems
      else:
          # Fixed element size (in points): resize the figure to fit the grid.
          render_ratio = figw / fig.get_figwidth()
          colw = self._element_size / 72 * render_ratio
          figw = colw * (non_text_nelems + np.ceil(textw / colw) + 1)
          fig.set_figwidth(figw / render_ratio)
          fig.set_figheight((colw * (n_cats + sizes.sum())) /
                            render_ratio)

      # Number of grid columns taken up by the label text.
      text_nelems = int(np.ceil(figw / colw - non_text_nelems))
      # print('textw', textw, 'figw', figw, 'colw', colw,
      #       'ncols', figw/colw, 'text_nelems', text_nelems)

      GS = self._reorient(matplotlib.gridspec.GridSpec)
      gridspec = GS(*self._swapaxes(n_cats + (sizes.sum() or 0),
                                    n_inters + text_nelems +
                                    self._totals_plot_elements),
                    hspace=1)
      if self._horizontal:
          # The last two columns are kept out of the matrix/subset plots;
          # the final one is returned as 'cbar'.
          out = {'matrix': gridspec[-n_cats if intersection else 0:, -n_inters:-2],
                 'shading': gridspec[-n_cats if intersection else 0:, :-2],
                 'totals': gridspec[-n_cats:, :self._totals_plot_elements],
                 'gs': gridspec,
                 'cbar': gridspec[:, -1:]}
          cumsizes = np.cumsum(sizes[::-1])
          for start, stop, plot in zip(np.hstack([[0], cumsizes]), cumsizes,
                                       self._subset_plots[::-1]):
              out[plot['id']] = gridspec[start:stop, -n_inters:-2]
      else:
          # NOTE(review): this branch returns no 'cbar' entry, but `plot` in
          # this notebook reads specs['cbar'] -- the vertical orientation
          # presumably raises KeyError there; confirm before using it.
          out = {'matrix': gridspec[-n_inters:, :n_cats],
                 'shading': gridspec[:, :n_cats],
                 'totals': gridspec[:self._totals_plot_elements, :n_cats],
                 'gs': gridspec}
          cumsizes = np.cumsum(sizes)
          for start, stop, plot in zip(np.hstack([[0], cumsizes]), cumsizes,
                                       self._subset_plots):
              out[plot['id']] = \
                  gridspec[-n_inters:, start + n_cats:stop + n_cats]
      return out
    
def plot(self, scaler, cmap, cmap_fmt='{:.2e}', intersection=True, fig=None):
        """Draw the shading, the colored matrix, the subset plots and a colorbar.

        Replacement for ``upsetplot.UpSet.plot``; the totals plot is disabled
        (commented out) and a colorbar is added.

        Parameters
        ----------
        scaler : normalization object used for the colorbar; must expose
            ``vmin``, ``vcenter`` and ``vmax`` (they become the tick positions
            and labels).
        cmap : colormap used for the matrix dots and the colorbar.
        cmap_fmt : str, format string applied to each colorbar tick value.
        intersection : bool, when False the default (intersection size) subset
            plot is skipped; also forwarded to ``make_grid``.
        fig : matplotlib.figure.Figure, optional
            Defaults to a new figure.
        Returns
        -------
        subplots : dict of matplotlib.axes.Axes
            Keys are 'matrix', 'shading' and the id of every subset plot drawn.
        """
        if fig is None:
            fig = plt.figure(figsize=self._default_figsize)
        specs = self.make_grid(intersection, fig)
        shading_ax = fig.add_subplot(specs['shading'])
        self.plot_shading(shading_ax)
        matrix_ax = self._reorient(fig.add_subplot)(specs['matrix'],
                                                    sharey=shading_ax)
        # Invisible host axes; the colorbar is attached to it at the end.
        cbar_ax = self._reorient(fig.add_subplot)(specs['cbar'])
        cbar_ax.axis('off')
        
        self.plot_matrix(matrix_ax, scaler, cmap)
        #totals_ax = self._reorient(fig.add_subplot)(specs['totals'],
        #                                            sharey=matrix_ax)
        #self.plot_totals(totals_ax)
        out = {'matrix': matrix_ax,
               'shading': shading_ax}
               #'totals': totals_ax}

        for plot in self._subset_plots:
            if plot['type'] == 'default' and not intersection:
              continue
            ax = self._reorient(fig.add_subplot)(specs[plot['id']],
                                                 sharex=matrix_ax)
            
            if plot['type'] == 'default':
                # The bar heights are relabelled as heterogeneity values.
                self.plot_intersections(ax)
                ax.set_ylabel('HetISq')
            elif plot['type'] in self.PLOT_TYPES:
                # Forward every remaining key of the plot spec as a keyword.
                kw = plot.copy()
                del kw['type']
                del kw['elements']
                del kw['id']
                self.PLOT_TYPES[plot['type']](self, ax, **kw)
            else:
                raise ValueError('Unknown subset plot type: %r' % plot['type'])
            out[plot['id']] = ax
          
        cbar = fig.colorbar(cm.ScalarMappable(norm=scaler, cmap=cmap), ax=cbar_ax,
                            ticks=[0.5],
                            fraction=1, pad=0.04)
        #cbar.ax.set_yticklabels([scaler.inverse(0.1), scaler.inverse(.5), scaler.inverse(.9)])
        # Replace the initial tick with ticks at the scaler's min / center / max.
        cbar.ax.set_yticks([scaler.vmin, scaler.vcenter, scaler.vmax])
        cbar.ax.set_yticklabels( map(lambda x: cmap_fmt.format(x), [scaler.vmin, scaler.vcenter, scaler.vmax]) )
        return out

# Monkey-patch the customized drawing routines onto upsetplot.UpSet.
upsetplot.UpSet.plot_matrix = plot_matrix
upsetplot.UpSet.plot = plot
upsetplot.UpSet.plot_intersections = plot_intersections
upsetplot.UpSet.make_grid = make_grid

# Color-scale limits: Z value at the genome-wide significance threshold.
Z_LOWER = -5.4513 # qnorm(5e-8 / 2)
Z_UPPER = 5.4513
CMAP_SCHEME = 'bwr_r'
plt.rcParams.update({'font.size': 26})

In [382]:
# Attach rsID / Allele from the risk-loci table by genomic position, then keep
# only loci whose smallest P value across all rows is below 0.05.
idx = df[~df.BETA.isna()].index
# Use item assignment to create the column: `df.POS = ''` on a frame without a
# POS column only sets a Python attribute on the object (pandas warns about it)
# and that attribute then shadows `df.POS` lookups.
df['POS'] = ''
# POS = first two ':'-separated fields of MarkerName, for rows that have a BETA.
df.loc[idx, 'POS'] = df.loc[idx, 'MarkerName'].str.split(':').apply(lambda x: ':'.join(x[:2]))
df = df.merge(dfs['AD.risk_loci'][['rsID', 'POS', 'Allele']], on='POS')

df['P'] = df.P.astype(float)
# Compute the per-rsID minimum once and reuse it for the filter.
min_p = df.groupby('rsID').P.min()
idx = min_p.index
df = df[df.rsID.isin(idx[min_p < 5e-2])]
df = df.reset_index(drop=True)

In [224]:
### OLD
# Earlier version of the rsID-attachment cell, kept for reference: it derives
# POS for every row (including rows with a missing MarkerName) before merging.

df['POS'] = df.MarkerName.str.split(':').apply(lambda x: ':'.join(x[:2]))
df = df.merge(dfs['AD.risk_loci'][['rsID', 'POS', 'Allele']], on='POS')


df['P'] = df.P.astype(float)
# Keep loci whose smallest P value is below 0.05.
idx = df.groupby('rsID').P.min().index
df = df[df.rsID.isin(idx[(df.groupby('rsID').P.min() < 5e-2)])]
df = df.reset_index(drop=True)

KeyError: 'rsID'

In [24]:
## calculate the allele flip here
# A row is flipped when its (upper-cased) A1 differs from the first allele of
# the risk-loci 'Allele' string ('<first>/<second>'); flipped rows get BETA negated.
# Note: only BETA changes -- the A1/Allele2 columns are left as they are.

df['flip'] = df.A1.str.upper() != df.Allele.str.split('/', expand=True)[0]
df.loc[df[df.flip].index,'BETA'] = df[df.flip]['BETA'].astype(float) * -1.

In [368]:
# Z statistic = effect size divided by its standard error.
df['Z'] = df['BETA'].astype(float) / df['SE'].astype(float)

In [369]:
# Restrict to one outcome and to loci with non-zero heterogeneity.
# `oc` is reused by the plotting and forest-plot cells below.
oc = 'log_CSF_Ab'
df = df[df.outcome==oc]
df = df[df.HetISq > 0.].reset_index(drop=True)

In [370]:
# Build placeholder rows (two per locus) whose "study" is a blank string of
# distinct length. The value order matches the 12-column layout of `tmp_df`
# built in the next cell:
#   MarkerName, A1, Allele2, BETA, SE, P, OBS_CT, study, outcome, rsID, Z, HetISq
# presumably so the UpSet matrix gets extra, unlabeled category rows -- confirm.
tmp_input = []
for idx, row in df[['MarkerName', 'rsID', 'outcome', 'HetISq']].drop_duplicates().iterrows():
  for new_study in (' ', '  ', '   ', '    ', '     ')[:2]:
    # Access the row by label: integer keys on a string-labelled Series
    # (`row[0]`) are positional fallbacks that pandas has deprecated.
    tmp_input.append(
      [row['MarkerName'], pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA,
       new_study, row['outcome'], row['rsID'], 0, row['HetISq']])

In [371]:
# Plotting table: the real per-study rows followed by the placeholder rows.
tmp_df = df[['MarkerName', 'A1', 'Allele2', 'BETA', 'SE', 'P', 'OBS_CT', 'study', 'outcome', 'rsID', 'Z', 'HetISq']].copy()
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` of the two frames gives the same result (indexes are kept as-is).
tmp_df = pd.concat([
  tmp_df,
  pd.DataFrame(tmp_input,
           columns=tmp_df.columns)])

  tmp_df = tmp_df.append(


In [372]:
# Label each row with the pairwise comparison whose HetISq equals the overall
# (maximum) HetISq. Later assignments win on ties, so the precedence is
# PDAD over HCPD over HCAD; rows matching none stay <NA>.
# Boolean masks are used directly: the previous `np.where(...)[0]` produced
# positions that `.loc` interprets as labels, which is only right on a fresh
# RangeIndex.
df['intersection'] = pd.NA
df.loc[df.HetISq_HCAD == df.HetISq, 'intersection'] = 'HCAD'
df.loc[df.HetISq_HCPD == df.HetISq, 'intersection'] = 'HCPD'
df.loc[df.HetISq_PDAD == df.HetISq, 'intersection'] = 'PDAD'

In [None]:
# --- UpSet-style figure: one column per locus, one row per study ---
# Generate membership tuples (all study combinations of size 1..6) only to get
# enough distinct "intersections"; they are truncated to one per rsID below.
tmp_grp = list(combinations(tmp_df.study.unique(), 1)) + \
  list(combinations(tmp_df.study.unique(), 2)) + \
  list(combinations(tmp_df.study.unique(), 3)) + \
  list(combinations(tmp_df.study.unique(), 4)) + \
  list(combinations(tmp_df.study.unique(), 5)) + \
  list(combinations(tmp_df.study.unique(), 6))

# Drop combinations made up solely of blank placeholder studies.
new_grp = []
for x in tmp_grp:
  if not set(map(str.strip, x)) == {''}:
    new_grp.append(x)
tmp_grp = new_grp
tmp_grp = tmp_grp[:tmp_df.rsID.nunique()]
tmp_grp = [list(x) for x in tmp_grp]

# One membership per locus, valued with that locus' maximum HetISq.
grp = from_memberships(
  tmp_grp,
  data = tmp_df.groupby('rsID').HetISq.max().tolist())
#  data=dfs.groupby(['outcome', 'rsID']).OBS_CT.sum().tolist())

plt.rcParams.update({'font.size': 14})
fig = plt.figure(figsize=(8,4))
a = upsetplot.UpSet(grp)

# Per-locus HetISq sorted descending, re-indexed with the UpSet intersection index.
tmp_data = pd.DataFrame( tmp_df.groupby('rsID').HetISq.first().sort_values(ascending=False) ).reset_index().set_index(a.intersections.index)
tmp_data['outcome'] = oc

# Non-blank index level names (the real studies).
idx = list(filter(lambda x: x.strip() != '', tmp_data.index.names))
# For each pairwise comparison: 1 for the two studies involved, 0 for the third.
intersection_mapper = {
  'HCAD': [1 if x != 'PD' else 0 for x in idx],
  'HCPD': [1 if x != 'ADNI-Dementia' else 0 for x in idx],
  'PDAD': [1 if x != 'HC' else 0 for x in idx],
  pd.NA: [0 for x in idx]
}
# One inclusion vector per locus (rows without an intersection label are dropped).
inclusion_df = {
  oc: tmp_df.merge(df[['study', 'rsID', 'intersection']], on=['rsID', 'study'])\
                      .sort_values(by='HetISq', ascending=False)[['rsID', 'intersection']].drop_duplicates()\
                      .dropna().intersection.apply(lambda x: intersection_mapper[x]).tolist()
}

# Overwrite the intersection sizes with HetISq so the bars show heterogeneity.
a.intersections.update(
  pd.Series(tmp_data.HetISq.tolist(),
          index=a.intersections.index)
)

# Custom attributes read by the patched plot_matrix.
a._with_lines = True
a.inclusion = np.array(inclusion_df[oc])
a._element_size = 64

# Diverging color scale centered on Z = 0, clipped at the significance threshold.
scaler = colors.TwoSlopeNorm(vcenter=0,
                    vmin=Z_LOWER,
                    vmax=Z_UPPER)
cmap = cm.get_cmap(CMAP_SCHEME).copy()
cmap.set_bad(color = 'k', alpha = 1.)

# Dot colors: normalized Z per study, ordered by descending HetISq.
a.mat_color = tmp_df[['HetISq', 'rsID', 'study', 'Z']].sort_values(by='HetISq',ascending=False).set_index('study')[['Z']].apply(lambda x: scaler(x))
# x tick labels '<outcome> - <rsID>' plus one trailing empty label.
a.labels = list(( tmp_data['outcome'].str.replace('log_CSF_', '') + ' - ' + tmp_data['rsID']).unique()) + ['']


plt.subplots_adjust(left=.25, bottom=0, right=1.75, top=1.5, wspace=0, hspace=0)
fig.suptitle('Cross-Sectional Effect Size (Z) for AD risk loci \non log CSF Aβ', x=1, y=1.65, fontsize=40)
_ = a.plot(scaler, cmap, cmap_fmt='{:.2f}', fig=fig)

In [None]:
# --- Same UpSet-style figure as the previous cell, with a different title ---
# Differences visible here: no `.dropna()` before mapping the intersection
# labels, and the title names PD risk loci / t-Tau.
# NOTE(review): the title is hard-coded and does not depend on `oc`; confirm it
# matches the outcome and risk-loci table actually loaded.
tmp_grp = list(combinations(tmp_df.study.unique(), 1)) + \
  list(combinations(tmp_df.study.unique(), 2)) + \
  list(combinations(tmp_df.study.unique(), 3)) + \
  list(combinations(tmp_df.study.unique(), 4)) + \
  list(combinations(tmp_df.study.unique(), 5)) + \
  list(combinations(tmp_df.study.unique(), 6))

# Drop combinations made up solely of blank placeholder studies.
new_grp = []
for x in tmp_grp:
  if not set(map(str.strip, x)) == {''}:
    new_grp.append(x)
tmp_grp = new_grp
tmp_grp = tmp_grp[:tmp_df.rsID.nunique()]
tmp_grp = [list(x) for x in tmp_grp]

# One membership per locus, valued with that locus' maximum HetISq.
grp = from_memberships(
  tmp_grp,
  data = tmp_df.groupby('rsID').HetISq.max().tolist())
#  data=dfs.groupby(['outcome', 'rsID']).OBS_CT.sum().tolist())

plt.rcParams.update({'font.size': 14})
fig = plt.figure(figsize=(8,4))
a = upsetplot.UpSet(grp)

#tmp_data = dfs.groupby(['rsID', 'outcome']).OBS_CT.sum().reset_index().set_index(a.intersections.index)
tmp_data = pd.DataFrame( tmp_df.groupby('rsID').HetISq.first().sort_values(ascending=False) ).reset_index().set_index(a.intersections.index)
tmp_data['outcome'] = oc

# Non-blank index level names (the real studies).
idx = list(filter(lambda x: x.strip() != '', tmp_data.index.names))
# For each pairwise comparison: 1 for the two studies involved, 0 for the third.
intersection_mapper = {
  'HCAD': [1 if x != 'PD' else 0 for x in idx],
  'HCPD': [1 if x != 'ADNI-Dementia' else 0 for x in idx],
  'PDAD': [1 if x != 'HC' else 0 for x in idx],
  pd.NA: [0 for x in idx]
}
inclusion_df = {
  oc: tmp_df.merge(df[['study', 'rsID', 'intersection']], on=['rsID', 'study'])\
                      .sort_values(by='HetISq', ascending=False)[['rsID', 'intersection']].drop_duplicates()\
                      .intersection.apply(lambda x: intersection_mapper[x]).tolist()
}
# Overwrite the intersection sizes with HetISq so the bars show heterogeneity.
a.intersections.update(
  pd.Series(tmp_data.HetISq.tolist(),
          index=a.intersections.index)
)

# Custom attributes read by the patched plot_matrix.
a._with_lines = True
a.inclusion = np.array(inclusion_df[oc])
a._element_size = 64

# Diverging color scale centered on Z = 0, clipped at the significance threshold.
scaler = colors.TwoSlopeNorm(vcenter=0,
                    vmin=Z_LOWER,
                    vmax=Z_UPPER)
cmap = cm.get_cmap(CMAP_SCHEME).copy()
cmap.set_bad(color = 'k', alpha = 1.)

# Dot colors: normalized Z per study, ordered by descending HetISq.
a.mat_color = tmp_df[['HetISq', 'rsID', 'study', 'Z']].sort_values(by='HetISq',ascending=False).set_index('study')[['Z']].apply(lambda x: scaler(x))
a.labels = list(( tmp_data['outcome'].str.replace('log_CSF_', '') + ' - ' + tmp_data['rsID']).unique()) + ['']
plt.subplots_adjust(left=.25, bottom=0, right=1.75, top=1.5, wspace=0, hspace=0)
fig.suptitle('Time-constant Effect Size (Z) for PD risk loci \non log CSF t-Tau', x=1, y=1.65, fontsize=40)
_ = a.plot(scaler, cmap, cmap_fmt='{:.2f}', fig=fig)

## Forest Plots

In [12]:
def grep(query, fn, compressed=False):
  """Run ``grep -w`` (or ``zgrep -w``) for `query` in file `fn`.

  Parameters
  ----------
  query : pattern to search for as a whole word.
  fn : path of the file to search (str or path-like).
  compressed : when True use ``zgrep`` so gzip-compressed files can be searched.

  Returns
  -------
  list of str
      The stripped combined stdout/stderr split on tabs. ``['']`` when nothing
      was printed (no match). If several lines match, the newline between them
      stays inside one of the returned fields.
  """
  # Imported here because the notebook's import cells do not import subprocess.
  import subprocess

  # Build the argument vector directly instead of splitting a command string
  # on spaces, so paths or queries containing spaces stay a single argument.
  cmd = ['zgrep' if compressed else 'grep', '-w', str(query), str(fn)]
  p = subprocess.run(cmd,
                     stdout=subprocess.PIPE,
                     stderr=subprocess.STDOUT)
  return p.stdout.decode('utf-8').strip().split('\t')

def get_query(query, fn, keep=False, metal=True):
  """Look up one marker in a summary-statistics file and return selected fields.

  Parameters
  ----------
  query : marker name to look up.
  fn : file to search. METAL tables are searched with ``grep``; non-METAL
      files are searched with ``zgrep`` (``compressed = not metal``).
  keep : when True return every field of the matching line instead of the
      selected ones.
  metal : True for a METAL table, False for the other (gallop) format.

  Returns
  -------
  list
      ``[marker, f0, f1, f2, f3, f4]`` where f0..f4 are the fields at the
      positions listed in ``metal_idx`` / ``gallop_idx`` (the calling cells
      name them BETA, SE, P, OBS_CT, OBS_CT_REP). f0 is converted to float;
      the others stay strings. For METAL rows f0 is negated when the alleles
      in the marker name differ from the upper-cased values of fields 1 and 2.
      When no line matches, the marker is followed by ``pd.NA`` placeholders.
  """
  # NOTE(review): for non-METAL files the last four characters are dropped from
  # the search term -- presumably a ':X:Y' suffix of single-base alleles;
  # markers with longer alleles would be truncated incorrectly. Confirm.
  q = query if metal else query[:-4]
  data = grep(q, fn, compressed = not metal)

  metal_idx = [7,8,9,15,16]
  gallop_idx = [15,16,17,8,14]
  idx = metal_idx if metal else gallop_idx

  if data == ['']:
    # Not found. Emit as many placeholders as a matched row has value fields;
    # the previous 4-element placeholder was two fields short, so the fields the
    # callers append afterwards landed in the wrong columns of their frames.
    return [query] + [pd.NA] * len(idx)

  # Flip the effect sign when the marker's alleles are not in the order of
  # the table's allele columns.
  flip = False
  if metal and data[0].split(':')[2:] != [data[1].upper(), data[2].upper()]:
    flip = True

  if keep:
    res = [data[0] if metal else query]
    res += data[1:]
  else:
    res = [data[0] if metal else query,
           float(data[idx[0]]) * (-1 if flip else 1)]
    res += [ data[x] for x in idx[1:] ]
  return res

def create_SUMSTATS(data):
  """Write one two-line, tab-separated input file per row of `data`.

  Each file is ``data/metal/input/<study>.input`` (``study`` is read from the
  row) and holds the column names on the first line and the row's values on
  the second, without a trailing newline. Rows sharing a study overwrite
  each other.
  """
  out_dir = 'data/metal'
  delim = '\t'
  for pos in range(len(data)):
    record = data.iloc[pos]
    header_line = delim.join(list(record.index))
    value_line = delim.join(str(value) for value in record)

    with open(f'{out_dir}/input/{record.study}.input', 'w') as handle:
      handle.write(header_line + '\n')
      handle.write(value_line)

def create_METAL(outcome, cohorts):
  """Write the METAL command script ``data/metal/local.metal``.

  The script sets the STDERR scheme, two custom variables, the column
  mapping, one ``PROCESS`` line per entry of `cohorts` (pointing at
  ``data/metal/input/<cohort>.input``) and finally the output prefix and the
  heterogeneity analysis. `outcome` is accepted but not used.
  """
  work_dir = 'data/metal'

  # Fixed preamble sections, joined with newlines below.
  sections = [
    """\
SCHEME STDERR
CUSTOMVARIABLE OBS_CT_total
LABEL OBS_CT_total as OBS_CT
  """,
    """\
CUSTOMVARIABLE OBS_CT_REP_total
LABEL OBS_CT_REP_total as OBS_CT_REP
  """,
    """\
MARKER ID
ALLELE ALT A1
EFFECT BETA
PVALUE P
STDERR SE
  """,
  ]

  # One input file per cohort.
  sections.extend(f'PROCESS {work_dir}/input/{c}.input' for c in cohorts)

  sections.append(f"""
OUTFILE {work_dir}/P_ .tbl
ANALYZE HETEROGENEITY
QUIT
  """)

  script = '\n'.join(sections)
  with open(f'{work_dir}/local.metal', 'w') as handle:
    handle.write(script)
    
def conduct_METAL():
  """Run ``metal data/metal/local.metal``, capturing stdout in ``data/metal/metal.stdout``.

  The exit status of the command is not checked.
  """
  # Imported here because the notebook's import cells do not import subprocess.
  import subprocess

  tmp_dir = 'data/metal'
  cmd = ['metal', f'{tmp_dir}/local.metal']
  # Context manager so the log file is flushed and closed once the run ends
  # (the handle was previously left open on every call).
  with open( f'{tmp_dir}/metal.stdout', 'w') as fout:
    subprocess.run(cmd, stdout=fout)
  
def load_METAL():
  """Load the METAL result table ``data/metal/P_1.tbl``.

  Returns
  -------
  pandas.DataFrame
      The tab-separated table with an added ``study`` column set to
      'METAL meta-analysis' on every row.
  """
  tmp_dir = 'data/metal'
  df = pd.read_csv(f'{tmp_dir}/P_1.tbl', sep="\t")
  df['study'] = 'METAL meta-analysis'
  # The list-building code that used to follow this return was unreachable
  # and has been removed.
  return df

In [29]:
# Gather the per-study statistics for every heterogeneous locus (HetISq > 0),
# most heterogeneous first, by grepping the three source files for outcome `oc`.
base = Path('METAL')
het_data = []

for idx, row in df[['rsID', 'MarkerName', 'HetISq']][df.HetISq > 0]\
                  .drop_duplicates().sort_values(by='HetISq', ascending=False)\
                  .iterrows():
  rsid = row.rsID
  query = row.MarkerName
  print(rsid, query, row.HetISq)
  
  # HC and PD come from METAL tables; ADNI-Dementia from the compressed
  # gallop file (metal=False).
  fn = base / f'meta_analysis/lt/{oc}/Pi_HC1.tbl'
  cohort = 'HC'
  res = get_query(query, fn) + [cohort, oc, rsid]
  het_data.append(res)
  cohort = 'PD'
  fn = base / f'meta_analysis/lt/{oc}/Pi_PD.all1.tbl'
  res = get_query(query, fn) + [cohort, oc, rsid]
  het_data.append(res)
  fn = base / f'raw_summary_stats/lt/EUR_ADNI-Dementia_allchr.{oc}.gallop.gz'
  cohort = 'ADNI-Dementia'
  res = get_query(query, fn, metal=False) + [cohort, oc, rsid]
  het_data.append(res)


tmp_df = pd.DataFrame(het_data, 
           columns=['ID', 'BETA', 'SE', 'P', 'OBS_CT', 'OBS_CT_REP', 'study', 'outcome', 'rsID'])
# Drop rows with missing values (markers not found in a file).
tmp_df = tmp_df.dropna()
tmp_df['BETA'] = tmp_df.BETA.astype(float)
tmp_df['SE'] = tmp_df.SE.astype(float)
tmp_df['OBS_CT'] = tmp_df.OBS_CT.astype(float)
tmp_df['OBS_CT_REP'] = tmp_df.OBS_CT_REP.astype(float)
# P becomes a display string: scientific notation below 0.01, fixed otherwise.
tmp_df['P'] = tmp_df.P.astype(float).apply(lambda x: '{:.3E}'.format(x) if x < 0.01 else '{:.3f}'.format(x))
# Third and fourth ':'-separated fields of the marker ID.
tmp_df['ALT'] = tmp_df.ID.str.split(':').apply(lambda x: x[2])
tmp_df['A1'] = tmp_df.ID.str.split(':').apply(lambda x: x[3])

rs16941239 chr16:86420604:T:A 91.0
rs1582763 chr11:60254475:G:A 85.7
rs12590654 chr14:92472511:G:A 80.7
rs7401792 chr14:92464917:G:A 77.8
rs56407236 chr16:90103687:G:A 76.1
rs11771145 chr7:143413669:G:A 75.8
rs6966331 chr7:37844191:T:C 75.5
rs1800978 chr9:104903697:C:G 73.2
rs871269 chr5:151052827:C:T 73.2
rs6605556 chr6:32615322:A:G 72.3
rs12151021 chr19:1050875:A:G 71.6
rs6742 chr20:63743088:T:C 71.4
rs112403360 chr5:14724304:T:A 69.8
rs6489896 chr12:113281983:T:C 69.1
rs5848 chr17:44352876:C:T 66.1
rs62374257 chr5:86927378:T:C 65.8
rs6733839 chr2:127135234:C:T 65.1
rs10933431 chr2:233117202:G:C 64.6
rs2526377 chr17:58332680:A:G 63.4
rs113706587 chr5:180201150:G:A 63.1
rs7912495 chr10:11676714:A:G 62.5
rs6943429 chr7:7817263:T:C 61.8
rs587709 chr19:54267597:C:T 59.5
rs2245466 chr4:40197226:G:C 59.4
rs3851179 chr11:86157598:T:C 59.0
rs74685827 chr11:121482368:T:G 56.8
rs3848143 chr15:64131307:G:A 50.2
rs6014724 chr20:56423488:A:G 50.2
rs7767350 chr6:47517390:C:T 47.9
rs889555 chr16:31

In [30]:
# Write one tab-separated input file per study for the current outcome.
for study in tmp_df.study.unique():
  tmp_df[tmp_df.study==study].to_csv(f'figures/het_analysis/data/{study}.{oc}.input',
                                    sep='\t', index=False)
  

In [26]:
# For each of the 20 most heterogeneous loci: collect the three per-study rows,
# meta-analyze them with METAL and save a forest plot as a standalone HTML file.
base = Path('METAL')


for idx, row in df[['rsID', 'MarkerName', 'HetISq']][df.HetISq > 0]\
                  .drop_duplicates().sort_values(by='HetISq', ascending=False)\
                  .head(20).iterrows():
  # Reset per locus so each meta-analysis only sees this marker.
  het_data = []
  rsid = row.rsID
  query = row.MarkerName
  print(rsid, query, row.HetISq)
  
  fn = base / f'meta_analysis/lt/{oc}/Pi_HC1.tbl'
  cohort = 'HC'
  res = get_query(query, fn) + [cohort, oc, rsid]
  het_data.append(res)
  cohort = 'PD'
  fn = base / f'meta_analysis/lt/{oc}/Pi_PD.all1.tbl'
  res = get_query(query, fn) + [cohort, oc, rsid]
  het_data.append(res)
  fn = base / f'raw_summary_stats/lt/EUR_ADNI-Dementia_allchr.{oc}.gallop.gz'
  cohort = 'ADNI-Dementia'
  res = get_query(query, fn, metal=False) + [cohort, oc, rsid]
  het_data.append(res)
  
  
  tmp_df = pd.DataFrame(het_data, 
             columns=['ID', 'BETA', 'SE', 'P', 'OBS_CT', 'OBS_CT_REP', 'study', 'outcome', 'rsID'])

  # Drop studies in which the marker was not found.
  tmp_df = tmp_df.dropna()
  tmp_df['BETA'] = tmp_df.BETA.astype(float)
  tmp_df['SE'] = tmp_df.SE.astype(float)
  tmp_df['OBS_CT'] = tmp_df.OBS_CT.astype(float)
  tmp_df['OBS_CT_REP'] = tmp_df.OBS_CT_REP.astype(float)
  # P becomes a display string: scientific notation below 0.01, fixed otherwise.
  tmp_df['P'] = tmp_df.P.astype(float).apply(lambda x: '{:.3E}'.format(x) if x < 0.01 else '{:.3f}'.format(x))
  tmp_df['ALT'] = tmp_df.ID.str.split(':').apply(lambda x: x[2])
  tmp_df['A1'] = tmp_df.ID.str.split(':').apply(lambda x: x[3])
  
  # Write inputs and script, run METAL, and read its result back.
  # Note: the cohort list is fixed even if a study was dropped above.
  create_SUMSTATS(tmp_df)
  create_METAL(oc, ['HC', 'PD', 'ADNI-Dementia'])
  conduct_METAL()
  meta_res = load_METAL()
  meta_res = meta_res.rename(columns= {'Effect': 'BETA',
                                       'StdErr': 'SE',
                                       'P-value': 'P',
                                       'OBS_CT_total': 'OBS_CT',
                                       'OBS_CT_REP_total': 'OBS_CT_REP'})
  
  # NOTE(review): the title text is hard-coded to Aβ while the file name uses
  # `oc`; switch the title manually (see the alternative below) when `oc` changes.
  title = 'log CSF Aβ Time-constant - {} ({})'.format(rsid, query)
  #title = 'log CSF t-tau Time-constant - {} ({})'.format(rsid, query)

  
  output_file(filename=f'figures/het_analysis/{oc}.time-const.{rsid}.het.html',
              title=title)
  save(
    forest_plot(pd.concat([meta_res, tmp_df]),
           meta=meta_res,
           query={'effect': 'Intercept',
                  'outcome': oc},
           show_pval=True,
           title=title))

rs6943429 chr7:7817263:T:C 85.3
rs7157106 chr14:105761758:A:G 83.9
rs1358782 chr20:413334:A:G 83.3
rs587709 chr19:54267597:C:T 78.7
rs2830489 chr21:26775872:C:T 74.9
rs5848 chr17:44352876:C:T 74.6
rs16941239 chr16:86420604:T:A 74.0
rs7068231 chr10:60025170:T:G 71.6
rs679515 chr1:207577223:T:C 70.8
rs7225151 chr17:5233752:G:A 69.7
rs17020490 chr2:37304796:T:C 69.2
rs10437655 chr11:47370397:G:A 65.2
rs72777026 chr2:9558882:A:G 65.0
rs6586028 chr10:80494228:C:T 62.1
rs889555 chr16:31111250:C:T 57.5
rs6014724 chr20:56423488:A:G 55.8
rs3822030 chr4:993555:G:T 55.0
rs602602 chr15:58764824:T:A 53.9
rs6605556 chr6:32615322:A:G 53.6
rs8025980 chr15:50701814:A:G 52.6
