```
Copyright 2021 Google LLC.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors
   may be used to endorse or promote products derived from this software
   without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```

# ML-based Phenotyping Manuscript Plots

In [None]:
import collections
import csv
import functools
import logging
import os
import random
from typing import Dict, List, Optional, Tuple, Text

from matplotlib import lines
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib_venn
import numpy as np
import pandas as pd
import scipy.stats as ss
import seaborn as sns
from sklearn import metrics as sklearn_metrics
from sklearn import utils as sklearn_utils
import statsmodels.api as sm

# Modules defined within this repository.
import perf_metrics
import pheno_utils

In [None]:
# Type aliases used in the return annotations of the plotting functions.
# `plt.axes` is a factory function, not a type; the class of the objects
# returned by `plt.subplots` is `matplotlib.axes.Axes`.
AX = mpl.axes.Axes
FIG = mpl.figure.Figure

# Screen resolution, and TrueType (type 42) font embedding for PS/PDF output.
mpl.rcParams['figure.dpi'] = 300
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['pdf.fonttype'] = 42

# Defaults applied by every `savefig` call.
mpl.rcParams['savefig.transparent'] = True
mpl.rcParams['savefig.bbox'] = 'tight'
mpl.rcParams['savefig.dpi'] = 300

# Raise an error (instead of warning) on chained assignment.
pd.set_option('mode.chained_assignment', 'raise')

## Global and Constant Variables

In [None]:
# NOTE(review): `_ABLATION_FR` is not referenced in the visible part of this
# notebook - confirm it is still needed.
_ABLATION_FR = 10
# P-value at which the significance line is drawn in the Manhattan plot.
_SIG_CUTOFF = 5e-8
# Only SNPs with a P-value at or below this cutoff are drawn in the Manhattan
# plot.
_MANHATTAN_CUTOFF = 0.001
# Seed passed to the bootstrap-based computations.
_RANDOM_SEED = 42

# NOTE(review): `_GENCODE_FILE`, `_CHROMSIZES_PATH`, `_VCDR_STD` and
# `phenos_filepath` are used further down but are not defined in the visible
# part of this notebook - they must be defined before those cells run.

# Colors from the PRS plots "vcdr_prs_manuscript_results.ipynb"
ML_COLOR = '#4285f4'
CRAIG_COLOR = '#fbbc05'

# ML-based VCDR GWAS information
_VCDR_GWAS = '/path/to/file'
_VCDR_HITS = '/path/to/file'
_VCDR_LOCI = '/path/to/file'

# Hits from Craig et al.
_CRAIG_HITS = '/path/to/file'
# Overlap between VCDR (meta) and Craig et al. (meta) GWAS - 62 loci
_VCDR_CRAIG_OVERLAP = '/path/to/file'
# VCDR PRS predictions
_VCDR_PRS_PREDS = '/path/to/file'


# Expert labels with one row per image.
# Columns are:
#  'GLAUCOMA_GRADABILITY'
#  'VERTICAL_CUP_TO_DISC'
#  'VCDR_GRADERS' (how many experts have graded the image)
#  'VCDR_CONFIDENCE' (a QC metric between 0 and 1)
TRAIN_TRUTH = '/path/to/file'
TUNE_TRUTH = '/path/to/file'
TEST_TRUTH = '/path/to/file'
UKB_TRUTH = '/path/to/file'

# Model predictions with one row per image.
# Column names are the same as described in the "phenotype_calling" notebook.
TRAIN_PRED = '/path/to/file'
TUNE_PRED = '/path/to/file'
TEST_PRED = '/path/to/file'
UKB_PRED = '/path/to/file'

## Metrics

In [None]:
def pearson_corr(targets, predictions):
  """Computes the Pearson correlation coefficient of two sequences.

  Args:
    targets: The reference values.
    predictions: The predicted values, aligned with `targets`.

  Returns:
    The correlation coefficient only; the p-value is discarded.
  """
  coefficient, _ = ss.pearsonr(targets, predictions)
  return coefficient


def spearman_corr(targets, predictions):
  """Computes the Spearman rank correlation of two sequences.

  Args:
    targets: The reference values.
    predictions: The predicted values, aligned with `targets`.

  Returns:
    The correlation coefficient only; the p-value is discarded.
  """
  coefficient, _ = ss.spearmanr(targets, predictions)
  return coefficient


# Metrics handed to `perf_metrics.PerformanceMetrics` (see
# `plot_vcdr_p4_scatter`): the number of targets and the two correlations
# defined above. All are registered with `binary_only=False`.
_METRICS = [
    perf_metrics.Metric(
        'num', lambda y_true, y_pred: len(y_true), binary_only=False),
    perf_metrics.Metric('Pearson corr', pearson_corr, binary_only=False),
    perf_metrics.Metric('Spearman corr', spearman_corr, binary_only=False)
]

## Helper Functions

In [None]:
def get_snp_id(row):
  """Builds the identifier of a variant.

  Args:
    row: A mapping with the keys 'RS', 'CHR', 'BP', 'REF' and 'ALT'.

  Returns:
    The value of 'RS', unless it is the placeholder '.', in which case the
    identifier is assembled as `chr:bp_ref_alt`.
  """
  rs_id = row['RS']
  if rs_id == '.':
    return '{}:{}_{}_{}'.format(row['CHR'], row['BP'], row['REF'], row['ALT'])
  return rs_id


# Key under which a variant's gene context annotation is stored.
_GENE_CONTEXT = 'GENE_CONTEXT'
# Sentinel start position used while searching for the nearest gene after a
# variant.
_CHROMOSOME_SIZE_UPPERBOUND = 10**10
# Distance thresholds (in base pairs) at 1 kb, 10 kb, 100 kb and 1 Mb; the
# index of the first threshold not exceeded is the number of dashes drawn.
_GENE_DISTANCE_ANNOTATION_LIMITS = [10**exponent for exponent in range(3, 7)]

# A gene as a (start, end, name) record.
Gene = collections.namedtuple('Gene', 'start end name')


def _build_genes(gencode_filepath):
  """Parses the gene table into per-chromosome gene lists.

  Args:
    gencode_filepath: Path of a TSV file with a header row followed by rows of
      six columns: chromosome, start, end, strand, Ensembl gene ID and HGNC
      gene name. Start and end are both 1-based inclusive positions.

  Returns:
    A `defaultdict` mapping each chromosome to the list of its `Gene` records
    (start, end, HGNC name), in file order.

  Raises:
    ValueError: If a gene's start is not strictly smaller than its end.
  """
  genes_by_chrom = collections.defaultdict(list)
  with open(gencode_filepath) as gencode_file:
    records = csv.reader(gencode_file, delimiter='\t')
    next(records, None)  # Skip the header row.
    for chrom, start_text, end_text, _, _, hgnc in records:
      start = int(start_text)
      end = int(end_text)
      if not start < end:
        raise ValueError(
            'start >= end for gene {} in chromosome {}: {:d} >= {:d}'.format(
                hgnc, chrom, start, end))
      genes_by_chrom[chrom].append(Gene(start=start, end=end, name=hgnc))
  return genes_by_chrom


def _annotate_distance(distance):
  """Encodes a variant-to-gene distance as a run of dashes.

  Args:
    distance: The distance to the neighbor gene in base pairs.

  Returns:
    `'-' * i`, where `i` is the index of the first entry of
    `_GENE_DISTANCE_ANNOTATION_LIMITS` that `distance` does not exceed (so the
    closest range yields an empty string), or None when `distance` exceeds
    every limit.
  """
  return next(
      ('-' * num_dashes
       for num_dashes, limit in enumerate(_GENE_DISTANCE_ANNOTATION_LIMITS)
       if distance <= limit),
      None)


class GeneContext(object):
  """Annotates variants with the genes they overlap or lie between."""

  def __init__(self, gencode_filepath: str) -> None:
    # Mapping from chromosome to its list of `Gene` records.
    self._genes = _build_genes(gencode_filepath)

  def get_context(self, chrom, base_pair_position):
    """Describes where a variant lies relative to the genes of a chromosome.

    Args:
      chrom: The chromosome of variant.
      base_pair_position: The 1-based base-pair position of variant.

    Returns:
      `'[G1,G2]'` listing every gene that contains the variant, if any.
      Otherwise `'<before>[]<after>'`, where `<before>` is the closest gene
      ending before the variant followed by its distance dashes and `<after>`
      is the distance dashes followed by the closest gene starting after it.
      A side is left empty when there is no such gene or it is farther than
      the largest annotated distance.
    """
    chrom_genes = self._genes[chrom]

    # Genes that contain the variant take precedence over neighbors.
    containing = [
        gene.name for gene in chrom_genes
        if gene.start <= base_pair_position and base_pair_position <= gene.end
    ]
    if containing:
      return '[{}]'.format(','.join(containing))

    # Closest gene on each side; the first one wins on ties.
    upstream = [
        gene for gene in chrom_genes if 0 < gene.end < base_pair_position
    ]
    downstream = [
        gene for gene in chrom_genes
        if base_pair_position < gene.start < _CHROMOSOME_SIZE_UPPERBOUND
    ]
    nearest_before = max(upstream, key=lambda gene: gene.end, default=None)
    nearest_after = min(downstream, key=lambda gene: gene.start, default=None)

    before = ''
    if nearest_before is not None and nearest_before.name:
      dashes = _annotate_distance(base_pair_position - nearest_before.end)
      if dashes is not None:
        before = nearest_before.name + dashes

    after = ''
    if nearest_after is not None and nearest_after.name:
      dashes = _annotate_distance(nearest_after.start - base_pair_position)
      if dashes is not None:
        after = dashes + nearest_after.name

    return '{}[]{}'.format(before, after)


def _add_gene_contexts(clusters, gencode_filepath):
  """Adds the gene context to each cluster, in place.

  Args:
    clusters: A list of variants where each variant is a cluster representative,
      i.e. the most significant variant in the cluster. Each variant is a dict
      with the keys 'CHR' (chromosome) and 'BP' (base-pair position); the gene
      context is stored under the `_GENE_CONTEXT` key.
    gencode_filepath: The path of file containing gene context information. The
      file is a TSV file with six columns for chromosome, start, end, strand,
      Ensembl gene ID, and HGNC gene name. Start and end are both 1-based
      inclusive positions.
  """
  logging.info('Annotating the clusters with gene context...')
  gene_context = GeneContext(gencode_filepath)
  for cluster in clusters:
    # Look up each cluster's own coordinates; passing the literal key names
    # would annotate every cluster with an empty context.
    cluster[_GENE_CONTEXT] = gene_context.get_context(cluster['CHR'],
                                                      cluster['BP'])


def _add_gene_labels(df_hits):
  """Adds a gene context column to the hits.

  Args:
    df_hits: A DataFrame with one row per hit and the columns 'CHR' and 'BP'.

  Returns:
    `df_hits` itself, with the gene context of each hit stored in the
    `_GENE_CONTEXT` column.
  """
  variants = [{
      'CHR': str(chrom),
      'BP': position
  } for chrom, position in zip(df_hits['CHR'].to_list(),
                               df_hits['BP'].to_list())]

  # `_add_gene_contexts` expects a path and opens the file itself, so the path
  # is passed directly rather than an already opened handle.
  # NOTE(review): `_GENCODE_FILE` is not defined in the visible part of this
  # notebook - confirm it is set before this function is called.
  _add_gene_contexts(variants, gencode_filepath=_GENCODE_FILE)

  df_hits[_GENE_CONTEXT] = [variant[_GENE_CONTEXT] for variant in variants]
  return df_hits


def _get_preds_filename(dataset):
  """Returns the path to the prediction file.

  Args:
    dataset: One of 'train', 'tune', 'test' or 'ukb'. 'eval' is accepted as an
      alias of 'tune', matching the name used by `load_labels`.

  Returns:
    The path of the prediction file of `dataset`.

  Raises:
    ValueError: If `dataset` is not a known dataset name.
  """
  if dataset in ('tune', 'eval'):
    return TUNE_PRED
  elif dataset == 'test':
    return TEST_PRED
  elif dataset == 'train':
    return TRAIN_PRED
  elif dataset == 'ukb':
    return UKB_PRED
  else:
    raise ValueError('Invalid dataset: {}'.format(dataset))


def _eid_from_image_path(path):
  """Extracts the integer <eid> from <image_path>.

  The <eid> is the part of the file name that precedes the first underscore.
  """
  filename = os.path.basename(path)
  eid_text, _, _ = filename.partition('_')
  return int(eid_text)


def load_predictions(dataset):
  """Returns the model predictions for <dataset>.

  Args:
    dataset: A dataset name accepted by `_get_preds_filename`.

  Returns:
    A DataFrame with one row per image and the columns 'image_id',
    'gradability_prediction' and 'vcdr_prediction'.

  Raises:
    ValueError: If an image appears more than once or a VCDR prediction is
      missing.
  """
  filename = _get_preds_filename(dataset)
  raw_df = pheno_utils.load_csv(filename)
  # Collapse the three visibility columns into a single
  # 'vertical_cd_visibility' column.
  # NOTE(review): the exact semantics are defined in
  # `pheno_utils.convert_categorical_to_binary` - confirm there.
  converted_df = pheno_utils.convert_categorical_to_binary(
      raw_df, 'vertical_cd_visibility',
      'vertical_cd_visibility:UNABLE_TO_ASSESS', [
          'vertical_cd_visibility:SUFFICIENT',
          'vertical_cd_visibility:COMPROMISED',
          'vertical_cd_visibility:UNABLE_TO_ASSESS',
      ])

  col_map = {
      'image_id': 'image_id',
      'vertical_cd_visibility': 'gradability_prediction',
      'vertical_cup_to_disc:VERTICAL_CUP_TO_DISC': 'vcdr_prediction',
  }

  # Keep only image_id, gradability_prediction, and vcdr_prediction.
  retval = converted_df[list(col_map)].rename(columns=col_map)

  if len(retval) != len(set(retval.image_id)):
    raise ValueError(
        'Duplicate image in prediction set {}'.format(dataset))

  if retval.vcdr_prediction.isna().any():
    raise ValueError('Unexpected NA present in predictions.')

  return retval


def load_labels(dataset):
  """Loads the expert VCDR labels of <dataset>.

  Args:
    dataset: One of 'train', 'tune', 'test' or 'ukb'. 'eval' is accepted as an
      alias of 'tune' for backward compatibility.

  Returns:
    A DataFrame with one row per labeled image and the columns 'image_id',
    'gradability_label', 'vcdr_label' and 'num_graders'. Rows without a VCDR
    label are dropped.

  Raises:
    ValueError: If `dataset` is unknown or an image appears more than once.
  """
  if dataset == 'train':
    filename = TRAIN_TRUTH
  elif dataset in ('tune', 'eval'):
    # The prediction loader and the plotting loop call this split 'tune'.
    filename = TUNE_TRUTH
  elif dataset == 'test':
    filename = TEST_TRUTH
  elif dataset == 'ukb':
    filename = UKB_TRUTH
  else:
    raise ValueError('Invalid dataset: {}'.format(dataset))

  raw_df = pheno_utils.load_csv(filename)
  col_map = {
      'Unnamed: 0': 'image_id',
      'GLAUCOMA_GRADABILITY': 'gradability_label',
      'VERTICAL_CUP_TO_DISC': 'vcdr_label',
      'VCDR_GRADERS': 'num_graders',
  }

  if dataset == 'ukb':
    # Restrict to records with confidence above 0.75.
    # NOTE(review): the previous comment said "confidence == 1" - confirm
    # which threshold is intended.
    raw_df = raw_df.loc[raw_df['VCDR_CONFIDENCE'] > 0.75]
  retval = raw_df[list(col_map)].rename(columns=col_map)

  if len(retval) != len(set(retval.image_id)):
    raise ValueError('Duplicate image in truth set {}'.format(dataset))

  return retval[~retval.vcdr_label.isna()]


def prediction_vs_label_scatterplot(dataset):
  """Draws predicted VCDR against labeled VCDR for the images of <dataset>.

  Labels and predictions are inner-joined on 'image_id'. The mean and median
  number of graders of the joined images are printed.

  Args:
    dataset: The dataset name, passed to `load_labels` and `load_predictions`.

  Returns:
    The seaborn joint plot, with VCDR labels on the x axis and VCDR
    predictions on the y axis.
  """
  labels_by_image = load_labels(dataset).set_index('image_id')
  preds_by_image = load_predictions(dataset=dataset).set_index('image_id')
  joined = preds_by_image.join(labels_by_image, how='inner')

  num_graders = joined['num_graders']
  print('graders for {} - mean: {:.2f} - median: {:.2f}'.format(
      dataset, num_graders.mean(), num_graders.median()))

  scatter_style = dict(
      s=3, alpha=0.5, color='g', edgecolor='none', rasterized=True)
  grid = sns.jointplot(
      x='vcdr_label',
      y='vcdr_prediction',
      data=joined,
      kind='reg',
      space=0,
      xlim=[0.0, 1],
      ylim=[0.0, 1],
      height=2,
      annot_kws=dict(stat='r'),
      marginal_kws=dict(hist=False, kde_kws=dict(lw=0.75)),
      scatter_kws=scatter_style,
      joint_kws=dict(line_kws=dict(linewidth=0.75)))

  # Both axes share the same unlabeled, three-tick layout.
  joint_ax = grid.ax_joint
  joint_ax.set_xlabel('')
  joint_ax.set_ylabel('')
  joint_ax.set_xticks([0, 0.5, 1])
  joint_ax.set_yticks([0, 0.5, 1])
  joint_ax.set_xticklabels([0, 0.5, 1], fontsize=5)
  joint_ax.set_yticklabels([0, 0.5, 1], fontsize=5)

  return grid

# VCDR Predictions Scatter plots

In [None]:
# Draw the label-vs-prediction scatter plot of every dataset and save each
# one as `<dataset>.pdf`.
# NOTE(review): every name listed here must be accepted by both `load_labels`
# and `load_predictions`.
for dataset in ['train', 'tune', 'test', 'ukb']:
  jax = prediction_vs_label_scatterplot(dataset)
  jax.savefig(f'{dataset}.pdf', transparent=True, bbox_inches='tight', dpi=1200)

# VCDR Bins vs Odds Ratio Boxplot

In [None]:
def get_phenos_plot(phenos: pd.DataFrame,
                    x: Text,
                    y: Text,
                    new_x: Text = '',
                    new_y: Text = '') -> pd.DataFrame:
  """Selects the two phenotype columns needed for a plot.

  Args:
    phenos: All phenotypes.
    x: Name of the first column to keep.
    y: Name of the second column to keep.
    new_x: New name for column `x`.
    new_y: New name for column `y`.

  Returns:
    The `x` and `y` columns without the rows in which either is missing. The
    columns are renamed only when both `new_x` and `new_y` are non-empty.
  """
  selected = phenos[[x, y]].dropna()
  if not (new_x and new_y):
    return selected
  return selected.rename(columns={x: new_x, y: new_y})


def _get_odds_ratio(counts_ref, counts):
  """Computes the sample odds ratio of `counts` relative to `counts_ref`.

  Both arguments are pairs; the result is
  `(counts_ref[0] * counts[1]) / (counts_ref[1] * counts[0])`.
  """
  ref_first, ref_second = counts_ref[0], counts_ref[1]
  first, second = counts[0], counts[1]
  return (ref_first * second) / (ref_second * first)


def get_bin_odds_ratios(
    phenos_plot: pd.DataFrame,
    x_col: Text,
    y_col: Text,
    num_bootstrap_samples: int,
    x_bins: np.ndarray,
    seed: int,
    fast_or=False) -> Tuple[np.ndarray, List[List[int]]]:
  """Returns the bootstrap ORs of <x> bins defined by <x_bins>.

  The odds ratio of every bin is computed w.r.t. the first bin, i.e. the
  samples whose <x> is below `x_bins[0]`. As a side effect, the performance
  metrics of predicting <y> from <x> are printed.

  Args:
    phenos_plot: The samples, without missing values in `x_col` and `y_col`.
    x_col: Name of the column that is binned.
    y_col: Name of the status column; controls are coded 1 and cases 2.
    num_bootstrap_samples: The number of bootstrap resamples.
    x_bins: The inner bin edges; open-ended bins are added on both sides, so
      there are `len(x_bins) + 1` bins.
    seed: Seed of the bootstrap random number generator.
    fast_or: Whether to use the sample odds ratio instead of the odds ratio
      returned by `scipy.stats.fisher_exact`.

  Returns:
    bs_stats: Array of shape (num_bootstrap_samples, number of bins) with the
      odds ratio of each bin in each bootstrap resample.
    ref_bin_counts: The [controls, cases] counts of each bin in the original
      (non-resampled) data.
  """

  def _count_vals(array, ref_vals=(1, 2)):
    """Returns the counts of controls ("1") and cases ("2")."""
    return [(array == val).sum() for val in ref_vals]

  def _get_bin_counts(bin_idx, y, n_bins):
    """Returns counts of controls and cases in each of the <n_bins> bins."""
    return [_count_vals(y[bin_idx == j + 1]) for j in range(n_bins)]

  prng = np.random.RandomState(seed)
  num = phenos_plot.shape[0]

  x = phenos_plot[x_col].to_numpy(copy=True)
  y = phenos_plot[y_col].to_numpy(copy=True)

  # The status is shifted from {1, 2} to {0, 1} for the binary metrics.
  perf_metrics.PerformanceMetrics(
      name='glaucoma prediction', default_metrics='binary').compute_and_print(
          y - 1, x, n_bootstrap=2000)

  x_bins = [-np.inf] + list(x_bins) + [np.inf]
  num_bins = len(x_bins) - 1

  # The bin of a sample does not depend on the resampling, so it is computed
  # once and only re-indexed for each bootstrap sample.
  bin_idx = np.digitize(x, x_bins)

  # bin counts for the original (non-bootstrapped samples)
  ref_bin_counts = _get_bin_counts(bin_idx, y, num_bins)

  bs_stats = np.empty((num_bootstrap_samples, num_bins))
  for i in range(num_bootstrap_samples):
    if (i + 1) % 100 == 0:
      print('Processed %d bootstrap samples...' % (i + 1))
    rand_idx = prng.randint(0, high=num, size=num)
    bin_counts = _get_bin_counts(bin_idx[rand_idx], y[rand_idx], num_bins)
    # ORs are defined w.r.t the first bin.
    for j in range(num_bins):
      if fast_or:
        bs_stats[i, j] = _get_odds_ratio(bin_counts[0], bin_counts[j])
      else:
        bs_stats[i, j] = ss.fisher_exact([bin_counts[0], bin_counts[j]])[0]

  return bs_stats, ref_bin_counts


def plot_vcdr_bins_odds_ratios_boxplot(phenos_file: Text,
                                       num_bootstrap_samples: int, seed: int,
                                       fast_or) -> Tuple[FIG, AX]:
  """Plots the bootstrap glaucoma odds ratios of the VCDR bins as boxplots.

  Samples missing 'visit', 'visit_age' or 'refractive_error' are excluded
  (-9 is treated as missing). The bin counts, the odds ratio of the top bin
  and its 95% bootstrap interval are printed.

  Args:
    phenos_file: Path of the tab-separated phenotypes file.
    num_bootstrap_samples: The number of bootstrap resamples.
    seed: Seed of the bootstrap random number generator.
    fast_or: Passed to `get_bin_odds_ratios`.

  Returns:
    The figure and the axes of the boxplot.
  """
  x_bins = [0.3, 0.4, 0.5, 0.6, 0.7]
  covariates = ['visit', 'visit_age', 'refractive_error']

  phenos = pheno_utils.load_csv(phenos_file, sep='\t', index_col=None)
  phenos = phenos.replace(-9, np.nan)
  phenos = phenos.dropna(axis='rows', how='any', subset=covariates)

  vcdr_vs_glaucoma = get_phenos_plot(
      phenos,
      x='vcdr_visit',
      y='has_touchscreen_plus_icd_poag',
      new_x='vcdr',
      new_y='glaucoma')

  odds_ratios, ref_bin_counts = get_bin_odds_ratios(
      vcdr_vs_glaucoma,
      x_col='vcdr',
      y_col='glaucoma',
      num_bootstrap_samples=num_bootstrap_samples,
      x_bins=x_bins,
      seed=seed,
      fast_or=fast_or)

  bin_sizes = np.asarray([sum(counts) for counts in ref_bin_counts])
  print('bin counts: ', bin_sizes)
  print('bin fractions: ', 100 * bin_sizes / (bin_sizes.sum()))

  # One box per bin, i.e. per column of the bootstrap matrix.
  per_bin_ratios = list(odds_ratios.T)

  print('OR: ', _get_odds_ratio(ref_bin_counts[0], ref_bin_counts[-1]))
  print('95% CI: ', np.percentile(per_bin_ratios[-1], [2.5, 97.5]))

  inner_labels = [
      '{}-{}'.format(low, high) for low, high in zip(x_bins[:-1], x_bins[1:])
  ]
  labels = [f'< {x_bins[0]}'] + inner_labels + [f'> {x_bins[-1]}']

  fig, ax = plt.subplots(figsize=(6, 4))
  ax.boxplot(per_bin_ratios, showfliers=False, labels=labels, whis=[5, 95])
  ax.axhline(y=1, ls='--', c='k', lw=0.5)
  ax.set_ylim([0, 100])
  ax.set_yticks([0, 20, 40, 60, 80, 100])

  return fig, ax

In [None]:
# Draw the VCDR-bin odds-ratio boxplot from 1000 bootstrap samples and save it.
# NOTE(review): `phenos_filepath` is not defined in the visible part of this
# notebook - confirm it is set before this cell runs.
fig, ax = plot_vcdr_bins_odds_ratios_boxplot(
    phenos_filepath,
    num_bootstrap_samples=1000,
    seed=_RANDOM_SEED,
    fast_or=True)

fig.savefig(
    'vcdr_glaucoma_oddsratios_boxplots.pdf',
    transparent=True,
    bbox_inches='tight')

# VCDR-PRS Bins vs Odds Ratio Boxplot

In [None]:
def _get_prs_bins(preds, df_phenos, vcdr_bins):
  """Computes PRS bins using predicted VCDRs and vcdr_bins.

  The PRS bin edges are chosen so that each PRS bin holds the same fraction of
  samples as the corresponding VCDR bin.

  Args:
    preds: A DataFrame indexed by sample ID with a 'prs' column.
    df_phenos: A DataFrame indexed by sample ID with the columns 'visit',
      'refractive_error', 'vcdr_visit' and 'visit_age'.
    vcdr_bins: The increasing inner bin edges of the VCDR bins.

  Returns:
    prs_bins: The bin edges in PRS Z-score units, one per entry of `vcdr_bins`.
    prs_avg: The mean PRS of the samples used.
    prs_std: The standard deviation of the PRS of the samples used.
    index: The index of the samples used, i.e. the samples that have a PRS and
      none of the four phenotype columns missing.
  """
  # `pd.merge` returns a new frame, so the inputs are left untouched.
  df = pd.merge(
      preds, df_phenos, left_index=True, right_index=True, how='inner')

  # drop samples that do not have fundus predictions or have
  # missing covariates
  df = df.dropna(
      axis='rows',
      how='any',
      subset=['visit', 'refractive_error', 'vcdr_visit', 'visit_age'])

  mlb_vcdr = df['vcdr_visit'].to_numpy(copy=True)
  prs_vcdr = df['prs'].to_numpy(copy=True)

  prs_avg = np.mean(prs_vcdr)
  prs_std = np.std(prs_vcdr)

  # convert PRS to Z-scores
  prs_z = (prs_vcdr - prs_avg) / prs_std

  # need to pad vcdr_bins from left
  vcdr_bins = [-np.inf] + list(vcdr_bins)
  n_bins = len(vcdr_bins) - 1
  bin_idx = np.digitize(mlb_vcdr, vcdr_bins)
  # compute counts in each bin
  bin_counts = [np.count_nonzero(bin_idx == j + 1) for j in range(n_bins)]
  # convert counts to percentiles
  bin_percs = 100 * np.cumsum(bin_counts) / mlb_vcdr.shape[0]
  # get the same PRS percentiles
  prs_bins = np.percentile(prs_z, bin_percs)
  return prs_bins, prs_avg, prs_std, df.index


def plot_vcdr_prs_bins_odds_ratios_boxplot(phenos_file: Text,
                                           num_bootstrap_samples: int,
                                           seed: int,
                                           fast_or) -> Tuple[FIG, AX]:
  """Plots the bootstrap glaucoma odds ratios of the VCDR-PRS bins.

  The PRS bins are derived (via `_get_prs_bins`) from the samples that have a
  'vcdr_visit' value, while the odds ratios are computed on the samples whose
  'vcdr_visit' is missing. The glaucoma frequencies of both groups, the bin
  counts and the odds ratio of the top bin with its 95% bootstrap interval are
  printed.

  Args:
    phenos_file: Path of the tab-separated phenotypes file.
    num_bootstrap_samples: The number of bootstrap resamples.
    seed: Seed of the bootstrap random number generator.
    fast_or: Passed to `get_bin_odds_ratios`.

  Returns:
    The figure and the axes of the boxplot.
  """
  # load the glaucoma phenotypes
  df_phenos = pheno_utils.load_csv(
      phenos_file,
      sep='\t',
      index_col=None,
      usecols=[
          'IID',
          'visit',
          'visit_age',
          'vcdr_visit',
          'refractive_error',
          'has_touchscreen_plus_icd_poag',
          'has_touchscreen_plus_icd_poag_nofundus',
      ])

  df_phenos.rename(
      columns={
          'has_touchscreen_plus_icd_poag': 'glaucoma',
          'has_touchscreen_plus_icd_poag_nofundus': 'glaucoma_nofundus'
      },
      inplace=True)

  # -9 marks a missing value.
  df_phenos.replace(-9, np.nan, inplace=True)
  df_phenos = df_phenos.set_index('IID')

  # load PRS predictions.
  preds = pheno_utils.load_csv(
      _VCDR_PRS_PREDS,
      index_col=None,
      delim_whitespace=True,
      usecols=['IID', 'SCORE'])

  preds.rename(columns={'SCORE': 'prs'}, inplace=True)
  preds = preds.set_index('IID')

  # get the PRS bins using samples with fundus image.
  vcdr_bins = [0.3, 0.4, 0.5, 0.6, 0.7]
  prs_bins, prs_avg, prs_std, idx_fundus = _get_prs_bins(
      preds, df_phenos, vcdr_bins)

  df_plot = pd.merge(
      df_phenos, preds, left_index=True, right_index=True, how='inner')

  # Samples without a VCDR value.
  idx_nofundus = df_phenos.loc[df_phenos['vcdr_visit'].isna()].index

  # Fraction of cases (coded 2) in each group.
  freq_glaucoma_fundus = (df_phenos.loc[idx_fundus, 'glaucoma'] == 2).mean()
  freq_glaucoma_nofundus = (df_phenos.loc[idx_nofundus, 'glaucoma'] == 2).mean()

  print(
      'freq of glaucoma in inds. w/ fundus: {:.2f} - w/o fundus: {:.2f}'.format(
          100 * freq_glaucoma_fundus, 100 * freq_glaucoma_nofundus))

  # Keep only the samples without a VCDR value that have both a glaucoma
  # status and a PRS.
  df_plot = df_plot.loc[df_plot['vcdr_visit'].isna(), ['glaucoma', 'prs']]
  df_plot.dropna(inplace=True)

  # convert PRS predictions to Z-scores, using the mean and standard deviation
  # returned by `_get_prs_bins`.
  df_plot['prs'] = df_plot['prs'].apply(lambda x: (x - prs_avg) / prs_std)

  print(f'{df_plot.shape[0]} inds. with PRS + glaucoma stats')

  odds_ratios, ref_bin_counts = get_bin_odds_ratios(
      df_plot,
      x_col='prs',
      y_col='glaucoma',
      num_bootstrap_samples=num_bootstrap_samples,
      x_bins=prs_bins,
      seed=seed,
      fast_or=fast_or)

  # One entry per bin, each holding the bootstrap odds ratios of that bin.
  plot_data = [odds_ratios[:, i] for i in range(odds_ratios.shape[1])]

  num = np.asarray([sum(b) for b in ref_bin_counts])
  print('bin counts: ', num)
  print('bin fractions: ', 100 * num / (num.sum()))

  print('OR and 95% CI of the top bucket:')
  print('mean: {:.3f} - ({:.3f}-{:.3f})'.format(
      _get_odds_ratio(ref_bin_counts[0], ref_bin_counts[-1]),
      * np.percentile(plot_data[-1], [2.5, 97.5])))

  # Tick labels: one open-ended bin on each side of the inner bins.
  labels = ['< {:.1f}'.format(prs_bins[0])] + [
      '{:.1f}-{:.1f}'.format(*prs_bins[i:i + 2])
      for i in range(len(prs_bins) - 1)
  ] + ['> {:.1f}'.format(prs_bins[-1])]

  fig, ax = plt.subplots(figsize=(6, 4))
  ax.boxplot(plot_data, showfliers=False, labels=labels, whis=[5, 95])

  # Reference line at an odds ratio of 1.
  ax.axhline(y=1, ls='--', c='k', lw=0.5)

  ax.set_ylim([0.0, 5])
  ax.set_yticks([0, 1, 2, 3, 4, 5])

  return fig, ax

In [None]:
# Draw the VCDR-PRS-bin odds-ratio boxplot from 1000 bootstrap samples and
# save it.
# NOTE(review): `phenos_filepath` is not defined in the visible part of this
# notebook - confirm it is set before this cell runs.
fig, ax = plot_vcdr_prs_bins_odds_ratios_boxplot(
    phenos_file=phenos_filepath,
    num_bootstrap_samples=1000,
    seed=_RANDOM_SEED,
    fast_or=True)

fig.savefig(
    'vcdr_prs_glaucoma_oddsratios_boxplots.pdf',
    transparent=True,
    bbox_inches='tight')

# VCDR vs Glaucoma Referral Risk

In [None]:
def plot_vcdr_p4_scatter(phenos_file, seed):
  """Plots VCDR against the 'glaucoma_p4_max_logit' column as a 2D histogram.

  Samples missing 'visit', 'visit_age' or 'refractive_error' are excluded
  (-9 is treated as missing). The metrics in `_METRICS` between the two
  columns are printed.

  Args:
    phenos_file: Path of the tab-separated phenotypes file.
    seed: Seed passed to the bootstrap of the printed metrics.

  Returns:
    The seaborn joint plot whose joint axes hold the 2D histogram, drawn as one
    marker per bin colored by the bin count on a log scale.
  """
  n_bins = 50
  marker_size = 7.5
  vcdr_col = 'vcdr_visit'
  p4_col = 'glaucoma_p4_max_logit'

  phenos = pheno_utils.load_csv(phenos_file, sep='\t', index_col=None)
  phenos = phenos.replace(-9, np.nan)
  phenos = phenos.dropna(
      axis='rows', how='any', subset=['visit', 'visit_age', 'refractive_error'])

  phenos_plot = phenos[[vcdr_col, p4_col]].dropna()

  metrics = perf_metrics.PerformanceMetrics(name='VCDR vs P4', metrics=_METRICS)
  metrics.compute_and_print(
      phenos_plot[vcdr_col].to_numpy(copy=True),
      phenos_plot[p4_col].to_numpy(copy=True),
      n_bootstrap=2000,
      seed=seed)

  grid = sns.jointplot(
      x=vcdr_col,
      y=p4_col,
      data=phenos_plot,
      kind='reg',
      xlim=[0.0, 1],
      ylim=[-12, 4],
      height=3,
      annot_kws=dict(stat='r'),
      scatter_kws=dict(s=2.5, alpha=0.1, color='g'))

  # Only the marginal plots are kept; the joint axes are redrawn below.
  joint_ax = grid.ax_joint
  joint_ax.cla()

  counts, x_edges, y_edges = np.histogram2d(
      phenos_plot[vcdr_col], phenos_plot[p4_col], bins=n_bins)

  x_centers = (x_edges[:-1] + x_edges[1:]) / 2
  y_centers = (y_edges[:-1] + y_edges[1:]) / 2
  x_mesh, y_mesh = np.meshgrid(x_centers, y_centers)

  # `histogram2d` indexes counts as [x, y] whereas the mesh is [y, x].
  joint_ax.scatter(
      x_mesh.ravel(),
      y_mesh.ravel(),
      edgecolors='none',
      s=marker_size,
      c=counts.T.ravel(),
      norm=mpl.colors.LogNorm())

  joint_ax.set_xlabel('')
  joint_ax.set_ylabel('')
  joint_ax.set_xticks([0, 0.5, 1])
  joint_ax.set_yticks([-12, -8, -4, 0, 4])

  return grid

In [None]:
# Draw the VCDR vs 'glaucoma_p4_max_logit' plot and save it.
# NOTE(review): `phenos_filepath` is not defined in the visible part of this
# notebook - confirm it is set before this cell runs.
jax = plot_vcdr_p4_scatter(phenos_filepath, seed=_RANDOM_SEED)

jax.savefig(
    'vcdr_glaucoma_p4.pdf',
    transparent=True,
    bbox_inches='tight')

# Manhattan Plot

In [None]:
def get_chromosome_sizes(chrom_sizes_filepath):
  """Returns the chromosome sizes.

  Args:
    chrom_sizes_filepath: The filepath or open file handle of a TSV file with
      two columns for chromosome names and chromosome sizes. A file handle is
      read but not closed.

  Returns:
    A dictionary from chromosomes to their size in the number of base-pairs.

  Raises:
    ValueError: If a non-empty row does not have exactly two columns or its
      size is not an integer.
  """

  def _parse(handle):
    """Parses the (chromosome, size) rows of <handle>, skipping empty rows."""
    rows = (row for row in csv.reader(handle, delimiter='\t') if row)
    return {chrom: int(size) for chrom, size in rows}

  # Anything that can be read is treated as an already opened file.
  if hasattr(chrom_sizes_filepath, 'read'):
    return _parse(chrom_sizes_filepath)
  with open(chrom_sizes_filepath) as f:
    return _parse(f)


def get_chromosome_offsets(chromosome_sizes):
  """Calculates the chromosome offsets needed for a Manhattan plot.

  Args:
    chromosome_sizes: A dictionary from chromosome to their sizes in base pairs.
      It must contain the keys '1' to '22'; other keys are ignored.

  Returns:
    chr_offsets: Maps a chromosome to the number of base-pairs in
      chromosomes before it. This is the offset that must be added to base-pair
      positions of this chromosome in a Manhattan plot. The last element of
      ordered dictionary corresponds to the total size of chromosomes with `$`
      as its key.

  Raises:
    KeyError: If the size of one of the chromosomes '1' to '22' is missing.
  """
  chr_offsets = collections.OrderedDict()
  offset = 0
  for chrom in (str(i) for i in range(1, 23)):
    chr_offsets[chrom] = offset
    offset += chromosome_sizes[chrom]
  chr_offsets['$'] = offset
  return chr_offsets


def plot_manhattan():
  """Plots the Manhattan plot for the VCDR GWAS.

  Only the SNPs with a P-value of at most `_MANHATTAN_CUTOFF` are drawn. SNPs
  that are hits of this GWAS are drawn in a separate color, which further
  depends on whether the SNP is listed in the 'A_SNP' column of the
  `_VCDR_CRAIG_OVERLAP` file.

  Returns:
    fig: The figure.
    df_hits: The hits joined with their loci, with the added 'SNP' and 'LOGP'
      columns.
  """
  # load chrom sizes and compute offsets
  # NOTE(review): `_CHROMSIZES_PATH` is not defined in the visible part of this
  # notebook - confirm it is set before this function is called.
  chrom_sizes = get_chromosome_sizes(_CHROMSIZES_PATH)
  chrom_offsets = get_chromosome_offsets(chrom_sizes)

  # plot ticks and tick labels: one tick at the middle of each chromosome.
  offsets = np.asarray(list(chrom_offsets.values()))
  xticks = (offsets[1:] + offsets[:-1]) / 2
  # The trailing '$' entry only marks the end of the last chromosome, so it
  # gets no tick label; the number of labels must match the number of ticks.
  xticklabs = [chrom for chrom in chrom_offsets if chrom != '$']

  df_gwas = pheno_utils.load_csv(_VCDR_GWAS, sep='\t', index_col=None)
  df_hits = pheno_utils.load_csv(_VCDR_HITS, sep='\t', index_col=None)
  df_loci = pheno_utils.load_csv(_VCDR_LOCI, sep='\t', index_col=None)

  df_hits['SNP'] = df_hits.apply(get_snp_id, axis='columns')
  df_hits = df_hits.assign(LOGP=-np.log10(df_hits['P']))

  df_hits = pd.merge(df_hits, df_loci, on='SNP', suffixes=('', '_LOCI'))

  # snp ids of genmed loci
  genmed_hits = set(df_hits['SNP'])

  # intersection of genmed and craig et al.
  df_intersection = pheno_utils.load_csv(
      _VCDR_CRAIG_OVERLAP, sep='\t', index_col=None)

  # snp ids of replicated hits
  common_hits = set(df_intersection['A_SNP'])

  # subset SNPs to those with a max p-value
  df = df_gwas.loc[df_gwas['P'] <= _MANHATTAN_CUTOFF]
  df = df.assign(LOGP=-np.log10(df['P']))

  # The background dots are rasterized (as vectors, the PDF would be too
  # large).
  colors = ['#c2a5cf', '#a6dba0']
  hit_colors = ['#4575b4', '#d73027']

  max_y = 135
  min_y = -5
  min_x = -1.5e8
  max_x = 3.025e9

  xs, ys, cs = [], [], []
  xhs, yhs, chs = [], [], []

  for row in df.itertuples():
    x = chrom_offsets[str(row.CHR)] + row.BP
    y = row.LOGP
    if row.SNP in genmed_hits:
      xhs.append(x)
      yhs.append(y)
      if row.SNP in common_hits:
        chs.append(hit_colors[0])
      else:
        chs.append(hit_colors[1])
    else:
      xs.append(x)
      ys.append(y)
      # Alternate the background color between consecutive chromosomes.
      cs.append(colors[row.CHR % len(colors)])

  fig, ax = plt.subplots(figsize=(7, 3.5), dpi=600)
  ax.scatter(xs, ys, c=cs, s=1, rasterized=True)
  ax.scatter(xhs, yhs, c=chs, s=1.5)

  # Genome-wide significance line.
  ax.axhline(y=-np.log10(_SIG_CUTOFF), linestyle='--', color='r', linewidth=0.5)

  ax.set_xlabel('Chromosomes', fontsize=6)
  ax.set_ylabel(r'$-\log_{10}(P)$', fontsize=6)

  ax.set_xticks(xticks)
  ax.set_xticklabels(xticklabs, fontsize=5)
  ax.set_yticks([0, 30, 60, 90, 120])
  ax.set_yticklabels([0, 30, 60, 90, 120], fontsize=5)

  ax.set_ylim([min_y, max_y])
  ax.set_xlim([min_x, max_x])

  return fig, df_hits

In [None]:
# `plot_manhattan` returns the figure together with the annotated hits.
fig, df_hits = plot_manhattan()

fig.savefig('manhattan.pdf', transparent=True, bbox_inches='tight', dpi=600)

# Craig *et al.* hits effect sizes in ML-Based GWAS

In [None]:
def plot_craig_genmed_effect_sizes():
  """Plots the effect sizes of Craig and Genmed SNP using Craig hits as ref.

  The Craig et al. hits are inner-joined with the ML-based GWAS on 'SNP'. The
  Craig et al. effect sizes are on the x axis and the ML-based effect sizes,
  scaled by `_VCDR_STD`, on the y axis, both with their standard errors. The
  Pearson correlation of the effect sizes and its P-value are printed.

  Returns:
    The figure.

  Raises:
    ValueError: If the effect alleles of a SNP differ between the two sets.
  """
  df_gwas = pheno_utils.load_csv(_VCDR_GWAS, sep='\t', index_col=None)
  df_craig = pheno_utils.load_csv(_CRAIG_HITS, index_col=None)
  df_merged = pd.merge(df_craig, df_gwas, how='inner', on='SNP')

  if (df_merged['EA'] != df_merged['EFF']).any():
    raise ValueError('Alleles between GenMed and Craig hits are different.')

  colors = ['#2166ac', '#b2182b']

  # _x: suffix for Craig et al.
  # _y: suffix for ML-based
  # NOTE(review): `_VCDR_STD` is not defined in the visible part of this
  # notebook - confirm it is set before this function is called.
  xs = df_merged['BETA_x'].to_numpy()
  ys = _VCDR_STD * df_merged['BETA_y'].to_numpy()
  ex = df_merged['SE_x'].to_numpy()
  ey = _VCDR_STD * df_merged['SE_y'].to_numpy()

  # color based on P-value: the first color where the Craig et al. P-value is
  # at least as large as the ML-based one, the second color otherwise.
  cs = np.where(df_merged['P_x'] >= df_merged['P_y'], colors[0],
                colors[1]).tolist()

  fig, ax = plt.subplots(figsize=(3, 2.25))

  rho, pval = ss.pearsonr(xs, ys)
  print(f'rho: {rho} - p-value: {pval}')

  ax.scatter(xs, ys, s=1.5, c=cs, alpha=0.75)

  # Horizontal and vertical error bars are drawn separately, without markers.
  ax.errorbar(
      xs,
      ys,
      xerr=ex,
      fmt='none',
      elinewidth=0.25,
      ecolor=cs,
      alpha=0.5,
      barsabove=False)
  ax.errorbar(
      xs,
      ys,
      yerr=ey,
      fmt='none',
      elinewidth=0.25,
      ecolor=cs,
      alpha=0.5,
      barsabove=False)

  ax.set_ylim([-0.042, 0.042])
  ax.set_xlim([-0.042, 0.042])

  ax.axhline(0, ls='--', lw=0.5)
  ax.axvline(0, ls='--', lw=0.5)

  # The data is passed by keyword, which every seaborn version accepts, and
  # the target axes are named explicitly.
  sns.regplot(
      x=xs, y=ys, scatter=False, truncate=False, line_kws={'lw': 0.5}, ax=ax)

  ticks = [-0.04, -0.02, 0, 0.02, 0.04]

  ax.set_yticks(ticks)
  ax.set_yticklabels(ticks, fontsize=5)

  ax.set_xticks(ticks)
  ax.set_xticklabels(ticks, fontsize=5)

  ax.set_aspect('equal')

  return fig

In [None]:
# Draw the effect-size comparison between Craig et al. and the ML-based GWAS
# and save it.
fig = plot_craig_genmed_effect_sizes()

fig.savefig(
    'craig_effect_size.pdf', transparent=True, bbox_inches='tight', dpi=300)

# Risk Loci Venn Diagram

In [None]:
def plot_venn_diagram(subsets,
                      set_colors=None,
                      output_path='mlbased_craig_loci_venn.pdf'):
  """Plots the two-set Venn diagram specified in `subsets` and saves it.

  Args:
    subsets: The region sizes, in any form accepted by `matplotlib_venn.venn2`.
    set_colors: The colors of the two sets. Defaults to
      ['#fbbc05', '#4285f4'].
    output_path: The path the figure is saved to.
  """
  if set_colors is None:
    set_colors = ['#fbbc05', '#4285f4']

  fig, ax = plt.subplots(figsize=(5, 5))

  venn_out = matplotlib_venn.venn2(
      subsets=subsets,
      set_labels=['', ''],
      set_colors=set_colors,
      alpha=1,
      ax=ax)

  for text in venn_out.subset_labels:
    # A region without a label is reported as None.
    if text is not None:
      text.set_fontsize(20)

  fig.savefig(output_path, dpi=300)

In [None]:
# Region sizes in the `matplotlib_venn` notation: '10' is the part of the
# first set only, '01' the part of the second set only and '11' the overlap.
# NOTE(review): presumably the first set is Craig et al. and the second one
# the ML-based GWAS - confirm.
subsets = {'10': 3, '01': 94, '11': 62}

plot_venn_diagram(subsets)

# PRS Results

In [None]:
# Note: These numbers are updated based on PLINK scores.
def plot_prs_with_cis():
  """Plots PRS correlation bar charts with confidence-interval error bars.

  Three figures are drawn from hard-coded summary tables. Each has four bars
  ordered Craig et al., ML-based, Craig et al., ML-based:
    * 'P+T': grouped by UKB and EPIC-Norfolk (the only one with a legend).
    * 'Elastic Net': grouped by UKB and EPIC-Norfolk.
    * 'Original': grouped by P+T and Elastic Net.

  Each bar shows `R` with an error bar spanning `low` to `high`, and the value
  of `R` (two decimals) is printed above the bar. `Figure.show()` is called on
  every figure before returning.

  Returns:
    A `(fig_pt, fig_en, fig_og)` tuple of the three matplotlib figures.
  """
  # Only the `R`, `low` and `high` columns are read below. The row order must
  # match the Craig/ML/Craig/ML bar order used when plotting.
  prun_thresh_bar_data = pd.DataFrame({
      'dataset': ['Craig et al', 'ML model', 'Craig et al', 'ML model'],
      'model': ['UKB', 'UKB', 'EPIC-Norfolk', 'Epic-Norfolk'],
      'R': [0.287, 0.366, 0.228, 0.310],
      'low': [0.247, 0.331, 0.203, 0.287],
      'high': [0.326, 0.401, 0.252, 0.333]
  })

  elastic_net_bar_data = pd.DataFrame({
      'dataset': ['Craig et al', 'ML model', 'Craig et al', 'ML model'],
      'model': ['UKB', 'UKB', 'EPIC-Norfolk', 'Epic-Norfolk'],
      'R': [0.313, 0.376, 0.272, 0.326],
      'low': [0.276, 0.341, 0.249, 0.303],
      'high': [0.349, 0.410, 0.296, 0.349]
  })

  orig_bar_data = pd.DataFrame({
      'dataset': ['Craig et al', 'ML model', 'Craig et al', 'ML model'],
      'model': ['P+T', 'P+T', 'Elastic Net', 'Elastic Net'],
      'R': [0.308, 0.359, 0.317, 0.376],
      'low': [0.269, 0.322, 0.280, 0.339],
      'high': [0.345, 0.395, 0.354, 0.410]
  })

  max_y = 0.50  # Upper y-limit; leaves headroom above the bars for labels.
  lab_y = 0.46  # Height at which each bar's R value is printed.
  width = 0.25

  # Two groups of two bars; each pair is centred on its group's x tick.
  x = [0, 0.8]
  x_bar = [
      x[0] - width / 2, x[0] + width / 2, x[1] - width / 2, x[1] + width / 2
  ]

  def _draw_bars(bar_data, labels, title, error_linewidth,
                 legend_handles=None):
    """Draws one four-bar figure for `bar_data` and returns the figure."""
    fig, ax = plt.subplots(figsize=(3, 2.25))

    ax.bar(
        x_bar,
        bar_data.R,
        # Asymmetric error bars: distance below and above the estimate.
        yerr=[
            bar_data.R - bar_data.low,
            bar_data.high - bar_data.R,
        ],
        width=width,
        color=[CRAIG_COLOR, ML_COLOR, CRAIG_COLOR, ML_COLOR],
        capsize=2,
        error_kw=dict(elinewidth=error_linewidth, capthick=error_linewidth))
    ax.set_ylabel('Correlation', fontsize=6)
    ax.set_yticks([0, 0.1, 0.2, 0.3, 0.4])
    ax.set_yticklabels([0, 0.1, 0.2, 0.3, 0.4], fontsize=5)
    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=6)
    ax.set_title(title, fontsize=6)
    ax.tick_params(
        axis='x',  # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        top=False)  # ticks along the top edge are off
    ax.set_ylim((0, max_y))
    if legend_handles is not None:
      ax.legend(handles=legend_handles, fontsize=6, fancybox=False)

    for x_text, r in zip(x_bar, bar_data.R):
      ax.text(
          x_text, lab_y, f'{r:.2f}', horizontalalignment='center', fontsize=6)

    fig.show()
    return fig

  ml_patch = patches.Patch(color=ML_COLOR, label='ML-based')
  cr_patch = patches.Patch(color=CRAIG_COLOR, label='Craig et al.')

  fig_pt = _draw_bars(
      prun_thresh_bar_data,
      labels=['UKB', 'EPIC-Norfolk'],
      title='P+T',
      error_linewidth=0.5,
      legend_handles=[ml_patch, cr_patch])

  fig_en = _draw_bars(
      elastic_net_bar_data,
      labels=['UKB', 'EPIC-Norfolk'],
      title='Elastic Net',
      error_linewidth=0.75)

  # Results of the original hits
  fig_og = _draw_bars(
      orig_bar_data,
      labels=['P+T', 'Elastic Net'],
      title='Original',
      error_linewidth=0.75)

  return fig_pt, fig_en, fig_og


# Build the three PRS bar charts (titled 'P+T', 'Elastic Net', 'Original').
fig_pt, fig_en, fig_og = plot_prs_with_cis()

# Save each figure as its own PDF with a transparent background.
fig_pt.savefig('pt_prs.pdf', transparent=True, bbox_inches='tight')

fig_en.savefig('en_prs.pdf', transparent=True, bbox_inches='tight')

fig_og.savefig('og_prs.pdf', transparent=True, bbox_inches='tight')

# EPIC PRS plots

In [None]:
def plot_epic_prs_deciles() -> Tuple[FIG, AX]:
  """Plots GenMed (ML-based) and Craig PRS decile estimates per subtype.

  Draws one panel each for POAG, HTG and NTG. Every panel shows, for VCDR PRS
  deciles 1-10, a point estimate with an asymmetric 95% CI error bar for the
  ML-based and the Craig et al. scores, side by side on a log-scaled y-axis
  with a dashed reference line at 1.

  Returns:
    The figure and the array of its three axes.
  """
  deciles = np.arange(1, 11)
  half_width = 0.25 / 2

  # The two series are nudged left/right so their markers do not overlap.
  craig_positions = deciles - half_width
  ml_positions = deciles + half_width

  # Each row is one decile: [estimate, CI lower bound, CI upper bound].
  # The first decile is [1, 1, 1] in every table (presumably the reference).
  genmed_rows = {
      'poag':
          np.asarray([
              [1, 1, 1],
              [3.161, 1.021, 9.788],
              [2.025, 0.604, 6.785],
              [3.453, 1.126, 10.592],
              [3.951, 1.308, 11.936],
              [6.011, 2.058, 17.560],
              [3.995, 1.323, 12.067],
              [5.510, 1.880, 16.152],
              [5.941, 2.040, 17.298],
              [9.705, 3.413, 27.597],
          ]),
      'htg':
          np.asarray([
              [1, 1, 1],
              [3.257, 0.888, 11.942],
              [2.041, 0.506, 8.227],
              [3.607, 0.997, 13.047],
              [2.977, 0.799, 11.094],
              [3.469, 0.946, 12.722],
              [3.022, 0.811, 11.260],
              [3.684, 1.018, 13.325],
              [2.942, 0.789, 10.968],
              [7.413, 2.181, 25.194],
          ]),
      'ntg':
          np.asarray([
              [1, 1, 1],
              [2.921, 0.302, 28.209],
              [2.03, 0.183, 22.477],
              [2.96, 0.307, 28.587],
              [6.978, 0.854, 56.999],
              [13.456, 1.751, 103.394],
              [6.976, 0.854, 56.976],
              [11.049, 1.419, 86.019],
              [14.822, 1.948, 112.789],
              [16.543, 2.174, 125.904],
          ]),
  }

  craig_rows = {
      'poag':
          np.asarray([
              [1, 1, 1],
              [0.756, 0.260, 2.202],
              [1.829, 0.766, 4.368],
              [2.008, 0.849, 4.751],
              [2.586, 1.131, 5.916],
              [3.014, 1.336, 6.797],
              [1.612, 0.660, 3.936],
              [2.143, 0.913, 5.030],
              [3.373, 1.507, 7.550],
              [3.859, 1.740, 8.557],
          ]),
      'htg':
          np.asarray([
              [1, 1, 1],
              [0.831, 0.251, 2.751],
              [1.951, 0.724, 5.261],
              [1.48, 0.521, 4.204],
              [2.115, 0.795, 5.629],
              [2.65, 1.025, 6.853],
              [0.82, 0.248, 2.712],
              [1.505, 0.530, 4.276],
              [2.073, 0.769, 5.588],
              [1.905, 0.697, 5.212],
          ]),
      'ntg':
          np.asarray([
              [1, 1, 1],
              [0.512, 0.046, 5.672],
              [1.482, 0.246, 8.921],
              [3.515, 0.725, 17.032],
              [3.975, 0.839, 18.844],
              [4.055, 0.856, 19.223],
              [3.992, 0.842, 18.927],
              [4.103, 0.866, 19.447],
              [7.257, 1.638, 32.149],
              [9.506, 2.190, 41.263],
          ]),
  }

  def estimate_and_errors(rows):
    """Splits a table into its estimates and (2, N) below/above distances."""
    estimate = rows[:, 0]
    below = estimate - rows[:, 1]
    above = rows[:, 2] - estimate
    return estimate, np.stack([below, above])

  y_tick_values = [0.5, 1, 2, 5, 10, 20, 50, 100]

  # One row of three panels: POAG, HTG, NTG.
  fig, panels = plt.subplots(nrows=1, ncols=3, figsize=(16, 3.5))

  for panel, subtype in zip(panels, ('poag', 'htg', 'ntg')):
    ml_estimate, ml_errors = estimate_and_errors(genmed_rows[subtype])
    craig_estimate, craig_errors = estimate_and_errors(craig_rows[subtype])

    panel.errorbar(
        ml_positions,
        ml_estimate,
        yerr=ml_errors,
        fmt='o',
        c=ML_COLOR,
        linewidth=2,
        label='ML-based')
    panel.errorbar(
        craig_positions,
        craig_estimate,
        yerr=craig_errors,
        fmt='o',
        c=CRAIG_COLOR,
        linewidth=2,
        label='Craig et al.')

    # Dashed reference line at 1 across the full x-range.
    panel.hlines(1, 0, 11, linestyle='--')
    panel.set_xlim([0, 11])

    legend = panel.legend(loc='upper left', fancybox=False)
    legend.get_frame().set_edgecolor('k')

    panel.set_xticks(deciles)
    panel.set_xlabel('VCDR PRS deciles', fontsize=12)
    panel.set_ylabel(f'{subtype.upper()} (95% CI)', fontsize=12)

    # Log axis with explicit, plainly-labelled major ticks and no minor ticks.
    panel.set_yscale('log')
    panel.set_yticks(y_tick_values)
    panel.set_yticklabels(y_tick_values)
    panel.set_ylim([0.25, 140])
    panel.minorticks_off()

  return fig, panels

# Build the three-panel (POAG, HTG, NTG) decile figure and save it as a PDF.
fig, ax = plot_epic_prs_deciles()
fig.savefig('epic_poag_prs.pdf', transparent=True, bbox_inches='tight')