```
Copyright 2021 Google LLC.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors
   may be used to endorse or promote products derived from this software
   without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```


## PRS Analyses

This notebook reports the final PRS analyses. The models are fit using the ML-based VCDR prediction on all European individuals in UKB not in the test set, and evaluated in the adjudicated set of images.



In [None]:
import collections
from concurrent import futures
import copy
import csv
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib import patches
import multiprocessing
import numpy as np
import pandas as pd
import os
import re
from scipy import stats
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.utils import resample
from typing import List, Sequence, Dict,Text

# Modules defined within this repository.
import perf_metrics
import pheno_utils

In [None]:
%matplotlib inline
# Figure rendering defaults for on-screen display.
mpl.rcParams['figure.dpi'] = 300
# Font type 42 (TrueType) for PostScript and PDF output.
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['pdf.fonttype'] = 42

# Defaults applied whenever a figure is saved to disk.
mpl.rcParams['savefig.transparent'] = True
mpl.rcParams['savefig.bbox'] = 'tight'
mpl.rcParams['savefig.dpi'] = 300

# Turn pandas chained-assignment warnings into errors.
pd.set_option('mode.chained_assignment', 'raise')
# Number of worker threads used by `bootstrap_ci` below.
NPROC = multiprocessing.cpu_count()

# Data input definitions

In [None]:
# Human-graded VCDR labels. This has fields "image_id,GLAUCOMA_GRADABILITY,
# GLAUCOMA_SUSPECT_RISK,VERTICAL_CD_VISIBILITY,VERTICAL_CUP_TO_DISC,
# VCDR_GRADERS,VCDR_CONFIDENCE,VCDR_STDEV,VCDR_MAX_DIFF".
# The image_id field is the full path to the file, which needs to be parsed to
# extract the EID.
# NOTE(review): the file is read with csv.DictReader's default (comma)
# delimiter even though the placeholder path ends in .tsv -- confirm.
VCDR_LABELS_FILE = '/path/to/labels/file.tsv'

# Craig VCDR GWAS variants. Downloaded directly from the paper.
# https://pubmed.ncbi.nlm.nih.gov/31959993/
# Fields SNP,Chr,Position,EA,NEA,Freq,BETA,SE,P,Nearest_gene
CRAIG_VCDR_SNPS = '/path/to/craig_et_al/hits.csv'

# The set of proxy SNPs for Craig et al. hits in EPIC-Norfolk.
CRAIG_VCDR_EPIC_SURROGATE_SNPS = '/path/to/craig_et_al/proxy_snps.tsv'

# The subset of the Craig et al. SNPs that are present in EPIC-Norfolk.
CRAIG_VCDR_EPIC_ONLY_SNPS = '/path/to/craig_et_al/snps_in_epic.tsv'

# ML-based GWAS variants (299 total hits).
# This is a tab-separated file with fields
# CHR, BP, REF, ALT, RS, AFFX, EFF, AAF, NUM_INDV, SRC, INFO_SCORE, P_HWE_COH,
# P_HWE_POP, P, BETA, SE, CLUSTER_LEFT, CLUSTER_RIGHT, CLUSTER_SIZE, CYTOBAND,
# GENE_CONTEXT, CLOSEST_GENES, GLAUCOMA_HITS, INTRAOCULAR_HITS,
# CUPDISCRATIO_HITS, CUPTODISCRATIO_HITS, OTHER_HITS
MLDERIVED_VCDR_SNPS = '/path/to/mlbased_hits.tsv'

# ML-based GWAS variants from Oct 2020 that are present on the EPIC-Norfolk
# array. This is 282 hits generated by subsetting variants to those present on
# the EPIC-Norfolk array and then re-calling hits (so the 282 are not a proper
# subset of the 299 total discovered in UKB). This is an 18-field tab-separated
# file:
# 1: CHR
# 2: BP
# 3: SNP
# 4: REF
# 5: ALT
# 6: EFF
# 7: AAF
# 8: NUM_INDV
# 9: SRC
# 10: INFO_SCORE
# 11: P
# 12: BETA
# 13: SE
# 14: P_HWE_POP
# 15: EPIC_SNP
# 16: CLUSTER_LEFT
# 17: CLUSTER_RIGHT
# 18: CLUSTER_SIZE
MLDERIVED_VCDR_EPIC_SURROGATE_SNPS = '/path/to/mlbased_epic_hits.tsv'

# Raw image model predictions.
RAW_IMAGE_PREDICTIONS = '/path/to/model_predictions.tsv'

# PLINK PRS predictions path.
PLINK_PRS_PREDS = '/path/to/plink_predictions/'

## Code to load VCDR inputs

In [None]:
def _get_eid(row):
  """Returns the integer EID from the CSV row."""
  key = 'image_id' if 'image_id' in row else ''
  bn = os.path.basename(row[key])
  return int(bn.split('_')[0])


def _read_all_vcdrs(filename, eids):
  """Returns a dict from EID -> list of (VCDR measurement, confidence, stdev) tuples.

  Rows are read with `csv.DictReader` using its default delimiter. A row is
  skipped when its EID is not in `eids`, or when its VCDR, confidence, or stdev
  cannot be parsed as a float, or when the VCDR lies outside [0, 1] (NaN
  included). A summary of kept and skipped rows is printed.

  Args:
    filename: Path of the labels file to read.
    eids: Container of EIDs that may be used; tested with `in`.

  Returns:
    A defaultdict(list) keyed by EID. Each value holds one
    (vcdr, confidence, stdev) tuple per retained row. A missing
    'VCDR_CONFIDENCE' column is treated as a confidence of 0.5.
  """
  missing_vcdr = 0
  invalid_eids = 0
  total = 0
  measurements = collections.defaultdict(list)
  with open(filename) as f:
    for row in csv.DictReader(f):
      total += 1
      eid = _get_eid(row)
      if eid not in eids:
        # We are not allowed to use this EID.
        invalid_eids += 1
        continue
      try:
        vcdr = float(row['VERTICAL_CUP_TO_DISC'])
        # Raise explicitly instead of asserting: asserts are stripped under
        # `python -O`, which would silently keep out-of-range measurements.
        if not (0 <= vcdr <= 1):
          raise ValueError(f'VCDR outside [0, 1]: {vcdr}')
        confidence = float(row.get('VCDR_CONFIDENCE', 0.5))
        stdev = float(row['VCDR_STDEV'])
      except ValueError:
        missing_vcdr += 1
      else:
        measurements[eid].append((vcdr, confidence, stdev))
  num_eids = len(measurements)
  kept_records = sum(len(v) for v in measurements.values())
  # Internal bookkeeping invariant: every row is counted exactly once.
  assert total == (invalid_eids + missing_vcdr + kept_records)
  print(
      f'{total} total VCDR rows: {invalid_eids} invalid eids, {missing_vcdr} missing VCDR, {kept_records} kept.'
  )
  print(f'  {num_eids} unique EIDs.')
  return measurements


def _create_single_eid_level_vcdr(vc_tuples):
  """Returns the single EID-level VCDR to use, and a tuple for ordering by confidence.

  Args:
    vc_tuples: A list of (vcdr, confidence, stdev) triplets representing
      attributes of a single image's VCDR grading by one or more individuals.

  Returns:
    A pair of results: the EID-level VCDR prediction and a tuple of
    (adjudication status, sum of stdevs of image-level predictions, stdev of
     images).
  """
  # These are set so that sorting puts adjudicated images first.
  ADJUDICATED = 0
  NOT_ADJUDICATED = 1

  if len(vc_tuples) == 1:
    val, conf, std = vc_tuples[0]
    if conf < 0.25:
      raise ValueError('Unexpected')
    elif conf < 0.75:
      ordering = (NOT_ADJUDICATED, -1, 0)
    else:
      ordering = (ADJUDICATED, std, 0)
    return val, ordering

  # There are multiple graded images for the individual.
  best_confidence = max(x[1] for x in vc_tuples)
  # Keep only images that have the best confidence. For individuals with
  # multiple graded images, that means that if they have both multiple graders
  # on at least one image and images with just a single grader, the
  # single-grader images are ignored.
  conf_tuples = [x for x in vc_tuples if x[1] == best_confidence]
  if best_confidence < 0.25:
    raise ValueError('Expected at least 0.5: {}'.format(vc_tuples))
  elif best_confidence < 0.75:
    # This is multiple images with a single grade each. Just average them.
    best_value = np.mean([val for val, _, _ in conf_tuples])
    ordering = (
        NOT_ADJUDICATED,
        -len(conf_tuples),  # More images is better than fewer.
        np.std([val for val, _, _ in conf_tuples]))
  else:
    # There are multiple graded images for this EID, all of which have multiple
    # graders. Use inverse variance weighting to determine the EID-level VCDR.
    numer = denom = 0
    for val, _, std in conf_tuples:
      invvar = 1e10 if std == 0 else 1. / (std * std)
      numer += val * invvar
      denom += invvar
    best_value = numer / denom
    ordering = (ADJUDICATED, sum(s for _, _, s in conf_tuples),
                np.std([val for val, _, _ in conf_tuples]))
  return best_value, ordering


def _eid_vcdr(all_vcdrs):
  """Builds the EID-level VCDR values and their confidence ordering.

  Args:
    all_vcdrs: Dict mapping from EID -> list of (vcdr, confidence, stdev)
      triplets, as generated by the `_read_all_vcdrs` function.

  Returns:
    A triplet of results:
      * dict mapping eid --> the single VCDR value to use for that EID;
      * list of those EIDs sorted by the ordering tuple from
        `_create_single_eid_level_vcdr`, most trusted first (ties are broken
        by the EID itself);
      * list of booleans parallel to the second result, True where the EID's
        result is adjudicated (first ordering element equal to 0).
  """
  eid_to_value = {}
  sort_records = []
  for eid, gradings in all_vcdrs.items():
    value, sort_key = _create_single_eid_level_vcdr(gradings)
    eid_to_value[eid] = value
    # The EID rides along as the final element so that it can be recovered
    # after sorting.
    sort_records.append((*sort_key, eid))

  sort_records.sort()
  eids_by_confidence = [record[-1] for record in sort_records]
  adjudicated_flags = [record[0] == 0 for record in sort_records]
  return eid_to_value, eids_by_confidence, adjudicated_flags


class VcdrValues(object):
  """Holds one VCDR value per EID together with a confidence ordering."""

  def __init__(self, filename, eids_to_retain):
    """Reads `filename`, keeping only rows whose EID is in `eids_to_retain`."""
    per_eid_gradings = _read_all_vcdrs(filename=filename, eids=eids_to_retain)
    values, ordering, adjudicated = _eid_vcdr(per_eid_gradings)
    self._eid_to_vcdr_value = values
    self._eid_ordering = ordering
    self._is_adjudicated = adjudicated
    # The ordering must list every EID exactly once.
    num_eids = len(self._eid_to_vcdr_value)
    assert num_eids == len(self._eid_ordering)
    assert len(self._eid_ordering) == len(set(self._eid_ordering))
    assert sorted(self._eid_to_vcdr_value.keys()) == sorted(self._eid_ordering)

  def as_dataframe(self):
    """Returns a pandas DataFrame indexed by 'eid' with column 'vcdr' containing the value."""
    records = [
        dict(eid=eid, vcdr=vcdr)
        for eid, vcdr in self._eid_to_vcdr_value.items()
    ]
    frame = pd.DataFrame(records)
    return frame.set_index('eid')

  def top_eids(self, n, valid_eids):
    """Returns a set of the `n` best ordered EIDs when restricted to those in `valid_eids`.

    Raises:
      ValueError: If the ordering is exhausted before `n` EIDs are collected.
    """
    selected = set()
    examined = 0
    for eid in self._eid_ordering:
      examined += 1
      if eid in valid_eids:
        selected.add(eid)
      if len(selected) == n:
        print('Returning top {} EIDs after examining {}'.format(n, examined))
        return selected
    raise ValueError('Unable to retrieve {} EIDs'.format(n))

  @property
  def eids(self):
    """Set of all EIDs that have a VCDR value."""
    return {*self._eid_ordering}

  @property
  def adjudicated_eids(self):
    """Set of EIDs whose VCDR value is flagged as adjudicated."""
    flagged = zip(self._eid_ordering, self._is_adjudicated)
    return set(eid for eid, adjudicated in flagged if adjudicated)


# Do some testing of the _create_single_eid_level_vcdr function.
def _test_create_single_eid_level_vcdr():
  """Checks the VCDR returned by `_create_single_eid_level_vcdr` on fixed inputs."""
  cases = [
      # Single VCDR values return exactly the same.
      ([(0.8, 0.5, 0.0)], 0.8),
      ([(0.9, 1.0, 0.6)], 0.9),
      # Multiple VCDR values for singly-graded images take the mean.
      ([(0.2, 0.5, 0.0), (0.4, 0.5, 0.0)], (0.2 + 0.4) / 2),
      ([(0.2, 0.5, 0.0), (0.4, 0.5, 0.0), (0.6, 0.5, 0.0)],
       (0.2 + 0.4 + 0.6) / 3),
      # Multiple VCDR values that include at least one multi-graded only use
      # those multigraded.
      ([(0.1, 0.5, 0.0), (0.1, 0.5, 0.0), (0.9, 1.0, 0.2)], 0.9),
      # Multiple adjudicated images use inverse variance weighting of those.
      ([(0.9, 1.0, 0.2), (0.8, 1.0, 0.5)], 0.8862068965517241),
      ([(0.9, 1.0, 0.0), (0.5, 1.0, 0.5)], 0.89999999984),
  ]
  for gradings, expected_vcdr in cases:
    actual_vcdr, _ = _create_single_eid_level_vcdr(gradings)
    assert expected_vcdr == actual_vcdr


_test_create_single_eid_level_vcdr()

## Code for loading all genotype data

In [None]:
# One GWAS hit. The readers below fill A1 with the effect allele and A0 with
# the other allele; `freq` and `beta` refer to A1.
Variant = collections.namedtuple(
    'Variant',
    ['chr', 'pos', 'A1', 'A0', 'name', 'freq', 'beta', 'pvalue'])

# Genotype data for one variant: a numpy array of hard genotypes, a numpy array
# of dosages, the string reference allele, and the string alternate allele.
# The 0/1/2 genotype coding counts the alternate alleles present.
VariantCall = collections.namedtuple(
    'VariantCall', ['geno', 'dosage', 'ref', 'alt'])


def _safename(s):
  return s.replace(':', '_').replace('-', '_')


def read_craig_snps(filename, thresh=5e-8):
  """Returns a list of Variant objects representing the CRAIG_VCDR_SNPS or CRAIG_META_SNPS file.

  Args:
    filename: Path of the file, read with csv.DictReader's default delimiter.
    thresh: Only variants with a p-value <= `thresh` are returned.

  Returns:
    List of Variant objects in file order.
  """

  def _row_to_variant(row):
    # Effect size and p-value come from 'BETA' / 'P' when those columns exist,
    # and from the '_(MTAG)' suffixed columns otherwise.
    beta_field = 'BETA' if 'BETA' in row else 'BETA_(MTAG)'
    p_field = 'P' if 'P' in row else 'P_(MTAG)'
    return Variant(
        chr=int(row['Chr']),
        pos=int(row['Position']),
        A1=row['EA'],
        A0=row['NEA'],
        name=_safename(row['SNP']),
        freq=float(row['Freq']),
        beta=float(row[beta_field]),
        pvalue=float(row[p_field]))

  with open(filename) as f:
    parsed = [_row_to_variant(row) for row in csv.DictReader(f)]
  return [variant for variant in parsed if variant.pvalue <= thresh]


def read_gwas_pipeline_results(filename, namecol='RS', thresh=5e-8):
  """Returns a list of Variant objects from the file.

  Args:
    filename: Path of a tab-delimited results file.
    namecol: Column holding the variant name. When that column is empty or
      '.', the name is built from the CHR, BP, REF and ALT columns instead.
    thresh: Only variants with a p-value <= `thresh` are returned.

  Returns:
    List of Variant objects in file order. A1 is the allele in the EFF column
    and `freq` is the frequency of that allele.
  """

  def _row_to_variant(row):
    label = row[namecol]
    if not label or label == '.':
      label = '{}_{}_{}_{}'.format(row['CHR'], row['BP'], row['REF'],
                                   row['ALT'])
    effect = row['EFF']
    ref = row['REF']
    alt = row['ALT']
    assert effect in (ref, alt)
    if effect == alt:
      other = ref
      effect_freq = float(row['AAF'])
    else:
      # AAF is the alternate-allele frequency, so flip it when the effect
      # allele is the reference allele.
      other = alt
      effect_freq = 1 - float(row['AAF'])
    assert 0 <= effect_freq <= 1
    return Variant(
        chr=int(row['CHR']),
        pos=int(row['BP']),
        A1=effect,
        A0=other,
        name=_safename(label),
        freq=effect_freq,
        beta=float(row['BETA']),
        pvalue=float(row['P']))

  with open(filename) as f:
    parsed = [_row_to_variant(row) for row in csv.DictReader(f, delimiter='\t')]
  return [variant for variant in parsed if variant.pvalue <= thresh]


def _load_variant_calls(variants, eids):
  """Returns a dictionary mapping Variant --> VariantCall."""
  # NOTE: This function must be implemented in order to run this colab. It
  # depends on the data storage and infrastructure used for variants. See the
  # definition of Variant and VariantCall objects above.
  return {}


class VariantValues(object):
  """Container for per-variant calls over a fixed collection of EIDs."""

  def __init__(self, eids, variant_calls):
    """Stores the EIDs and the Variant --> VariantCall mapping."""
    self._eids, self._variant_calls = eids, variant_calls

  @classmethod
  def from_variants(cls, variants, eids):
    """Builds an instance by loading calls via `_load_variant_calls`."""
    return cls(eids=eids, variant_calls=_load_variant_calls(variants, eids))

  def get_call(self, variant):
    """Returns the VariantCall object associated with this Variant."""
    return self._variant_calls[variant]

  def variant_df(self, variants, dosage=False):
    """Returns a pd.DataFrame indexed by eid with all requested variants.

    Variants are returned as columns with the name being v.name.

    Args:
      variants: list of Variant objects to extract.
      dosage: bool. If True, the dosage is used. Otherwise, the geno call is
        used.

    Returns:
      pd.DataFrame of variants.
    """
    field = 'dosage' if dosage else 'geno'
    columns = {}
    for variant in variants:
      columns[variant.name] = getattr(self.get_call(variant), field)
    # Fewer columns than variants means two variants shared a name.
    assert len(columns) == len(variants)
    assert 'eid' not in columns
    columns['eid'] = self._eids
    frame = pd.DataFrame(columns)
    return frame.set_index('eid')

  def prs(self, variants, name='prs'):
    """Returns a pd.DataFrame indexed by eid with the PRS and its PLINK repr.

    The score is the sum over variants of beta times the count of the A1
    allele, using hard genotype calls.
    """
    first_call = [*self._variant_calls.values()][0]
    score = np.zeros_like(first_call.dosage)
    plink_lines = []
    for variant in variants:
      call = self.get_call(variant)
      plink_lines.append(
          f'{variant.chr}:{variant.pos}_{call.ref}_{call.alt}\t{variant.A1}\t{variant.beta}\n'
      )
      assert (call.ref, call.alt) in ((variant.A0, variant.A1),
                                      (variant.A1, variant.A0))
      # Genotypes count alternate alleles, so flip them when A1 is the
      # reference allele.
      if call.alt == variant.A1:
        score += variant.beta * call.geno
      else:
        score += variant.beta * (2 - call.geno)

    frame = pd.DataFrame({'eid': self._eids, name: score})
    return frame.set_index('eid'), ''.join(plink_lines)

## Code to load covariate and ML-based data


In [None]:
# Load covariates.
def load_covariates(eids_to_retain, subset_to_vcdr=True):
  """Returns a pd.DataFrame indexed by eid containing all covariates and ML-based VCDR.

  Args:
    eids_to_retain: Collection of EIDs; rows for other EIDs are dropped.
    subset_to_vcdr: If True, additionally keep only rows with
      `vcdr_visit > -9`.

  Returns:
    DataFrame indexed by 'eid'. The 'sex' and 'genotyping_array' columns are
    shifted from [1, 2] to [0, 1], and the two POAG columns are renamed to
    'glaucoma' and 'glaucoma_no_fundus'.
  """
  requested_columns = COVARS + [
      'IID', 'vcdr_visit', 'has_touchscreen_plus_icd_poag',
      'has_touchscreen_plus_icd_poag_nofundus'
  ]
  raw_df = pheno_utils.load_csv(
      COVARIATES_FILE, index_col=None, delimiter='\t',
      usecols=requested_columns)
  raw_df.rename(columns={'IID': 'eid'}, inplace=True)
  total_count = len(raw_df)

  allowed_df = raw_df.loc[raw_df.eid.isin(eids_to_retain), :]
  allowed_count = len(allowed_df)
  if not subset_to_vcdr:
    retval = allowed_df.copy()
    print(f'From {total_count} total entries, reduced to {allowed_count}')
  else:
    retval = allowed_df.loc[allowed_df.vcdr_visit > -9, :].copy()
    print(f'From {total_count} total entries, reduced to {allowed_count} '
          f'from input EIDs, and finally {len(retval)} with vcdr_visit')

  # Transform the BOLT encoding of binary records [1, 2] to [0, 1]
  for binary_field in ('sex', 'genotyping_array'):
    assert sorted(retval[binary_field].unique()) == [1, 2]
    retval[binary_field] -= 1

  new_names = {
      'has_touchscreen_plus_icd_poag': 'glaucoma',
      'has_touchscreen_plus_icd_poag_nofundus': 'glaucoma_no_fundus'
  }
  retval.rename(columns=new_names, inplace=True)
  return retval.set_index('eid')

## Load all the data


In [None]:
# NOTE: `...` is a placeholder that must be replaced with real data before this
# cell can run.
valid_european_eids = ... # set of UKB EIDs consented for research and of genetically European ancestry.

# Covariates restricted to rows with vcdr_visit > -9.
covar_df = load_covariates(valid_european_eids, subset_to_vcdr=True)

# There are 165,457 inds with glaucoma status and all covariates, while only
# 67,300 of them have fundus images. We use the 165,457 - 67,300 for evaluating
# the relationship between glaucoma and PRS-predicted VCDR.
covar_all_df = load_covariates(valid_european_eids, subset_to_vcdr=False)

european_eids_with_covars = set(covar_df.index)
european_eids_with_covars_all = set(covar_all_df.index)

# Human-graded VCDR labels for the permitted EIDs.
vcdr_container = VcdrValues(
    filename=VCDR_LABELS_FILE, eids_to_retain=valid_european_eids)

# Genotypes are needed for everyone with covariates or with graded labels.
eids_to_load_genotypes = european_eids_with_covars_all.union(
    vcdr_container.eids)


# Variant lists; each reader keeps only variants with p-value <= 5e-8.
craig_snps = read_craig_snps(CRAIG_VCDR_SNPS)
craig_epic_surrogate_snps = read_craig_snps(CRAIG_VCDR_EPIC_SURROGATE_SNPS)
craig_epic_only_snps = read_craig_snps(CRAIG_VCDR_EPIC_ONLY_SNPS)

mlderived_snps = read_gwas_pipeline_results(filename=MLDERIVED_VCDR_SNPS,
                                            namecol='RS',
                                            thresh=5e-8)

# The EPIC-surrogate file names its variants in the 'SNP' column, not 'RS'.
mlderived_epic_surrogate_snps = read_gwas_pipeline_results(
    filename=MLDERIVED_VCDR_EPIC_SURROGATE_SNPS,
    namecol='SNP',
    thresh=5e-8
)

In [None]:
# Run the very slow variant loading in a separate cell so it can be skipped if
# not needed.
# Loads calls for the concatenation of all five variant lists, for every EID
# in `eids_to_load_genotypes`.
variant_container = VariantValues.from_variants(
    craig_snps + craig_epic_surrogate_snps + craig_epic_only_snps +
    mlderived_snps + mlderived_epic_surrogate_snps,
    eids_to_load_genotypes
)

## Save Genotypes in PLINK (.map + .ped) format
We first save the genotypes as .map + .ped text PLINK files and then manually 
convert them to .bed + .bim + .fam files.

In [None]:
def write_plink_map_ped_files(path_prefix, df_genotypes, variants_name_map):
  """Writes <df_genotypes> in PLINK's MAP and PED format.

  Two text files are written: `path_prefix + '.map'` with one tab-separated
  line per variant (chromosome, variant name, '0', position), and
  `path_prefix + '.ped'` with one tab-separated line per sample ('0', the
  sample's index label, four more '0' fields, then one allele pair per
  variant).

  Args:
    path_prefix: Path prefix to which '.map' and '.ped' are appended.
    df_genotypes: pd.DataFrame with one row per sample (index is the EID) and
      one column per variant. Values 0, 1 and 2 count alternate alleles.
      Integer and float columns are both accepted; any value that is not a
      whole number (NaN included) is written as the missing call '0 0', on
      the assumption that it was mean-imputed and so originally missing.
    variants_name_map: Dict from column name to a PLINK variant name of the
      form '<chrom>:<pos>_<ref>_<alt>'.

  Raises:
    KeyError: If a column is absent from `variants_name_map`, or a genotype is
      a whole number other than 0, 1 or 2.
  """
  missing_call = '0 0'

  def _get_allele_calls(ref, alt):
    """Returns the map from alternate-allele count to PED allele pair."""
    return {0: f'{ref} {ref}', 1: f'{ref} {alt}', 2: f'{alt} {alt}'}

  def _get_ped_call(genotype, allele_calls):
    """Returns the PED allele pair for one genotype value."""
    # Convert through float so that Python ints work too: `int.is_integer`
    # only exists from Python 3.12 onwards.
    value = float(genotype)
    if value.is_integer():
      return allele_calls[int(value)]
    return missing_call

  allele_calls_per_variant = []
  map_path = path_prefix + '.map'
  with open(map_path, 'w') as fw:
    for variant in df_genotypes.columns:
      prsname = variants_name_map[variant]
      chrom, pos_ref_alt = prsname.split(':')
      pos, ref, alt = pos_ref_alt.split('_')
      allele_calls_per_variant.append(_get_allele_calls(ref, alt))
      line = '\t'.join([chrom, prsname, '0', pos])
      fw.write(f'{line}\n')

  ped_path = path_prefix + '.ped'
  with open(ped_path, 'w') as fw:
    # itertuples walks the rows once, avoiding a label lookup per sample,
    # which is slow and fails when index labels repeat.
    for eid, *genotypes in df_genotypes.itertuples(index=True, name=None):
      calls = [
          _get_ped_call(genotype, allele_calls)
          for genotype, allele_calls in zip(genotypes,
                                            allele_calls_per_variant)
      ]
      line = '\t'.join(['0', str(eid), '0', '0', '0', '0'] + calls)
      fw.write(f'{line}\n')


all_raw_vars = (craig_snps + craig_epic_surrogate_snps + craig_epic_only_snps +
    mlderived_snps + mlderived_epic_surrogate_snps)
# NOTE(review): `_get_canonical_variant_map` is not defined in the visible part
# of this notebook -- confirm it is defined before this cell runs.
all_variants = set(_get_canonical_variant_map(all_raw_vars).values())
all_samples = variant_container.variant_df(all_variants)

# Map each variant's column name to the '<chr>:<pos>_<ref>_<alt>' name that
# `write_plink_map_ped_files` parses.
variants_name_map = dict()
for variant in all_variants:
  vcall = variant_container.get_call(variant)
  prsname = '{}:{}_{}_{}'.format(variant.chr, variant.pos, vcall.ref, vcall.alt)
  variants_name_map[variant.name] = prsname

# '.map' and '.ped' are appended directly to this prefix.
path_prefix = '/path/to/save/plink_data/'

write_plink_map_ped_files(path_prefix, all_samples, variants_name_map)

## Build the full datasets for training and evaluation

In [None]:
def load_dataset(v_df,
                 target_df,
                 eids=None,
                 ensure_variation='raise'):
  """Joins variants with the target and drops incomplete rows.

  Rows are removed when they contain a NaN or the value -9 in any column.

  Args:
    v_df: pd.DataFrame of variants to include.
    target_df: pd.DataFrame containing a single field that is the target label.
    eids: Iterable(int). eids to restrict to further.
    ensure_variation: str in ('quiet', 'warn', 'raise'). Indicates what to do if
      no variation exists in a column.

  Returns:
    pd.DataFrame containing requested data.

  Raises:
    ValueError: If a column has fewer than two distinct values and
      `ensure_variation` is neither 'quiet' nor 'warn', or if a NaN or -9
      remains in the result.
  """
  joined = v_df.join(target_df, how='inner')
  if eids is None:
    candidates = joined
  else:
    candidates = joined.loc[joined.index.intersection(set(eids))]
  complete = candidates.dropna(axis=0)

  has_sentinel = (complete == -9).sum(axis=1) != 0
  retval = complete.loc[~has_sentinel]

  print(f'{len(v_df)} with variants --> {len(joined)} with v and target.')
  print(f'{len(retval)} final.\n')

  if ensure_variation != 'quiet':
    for col in retval.columns:
      if len(retval[col].unique()) >= 2:
        continue
      msg = f'Column {col} has no variation in retval.'
      if ensure_variation == 'warn':
        print(msg)
      else:
        raise ValueError(msg)

  # Final guards: nothing filtered above may remain.
  if retval.isna().any().any():
    raise ValueError('Unexpected NaN present in return value.')
  if (retval == -9).any().any():
    raise ValueError('Unexpected -9 present in return value.')

  return retval


def create_design(split, variants):
  """Returns a design matrix with the given data split and variants.

  Args:
    split: 'train' or 'testadj'. 'train' uses `vcdr_visit` from `covar_df` as
      the target and excludes every EID that has a graded label in
      `vcdr_container`. 'testadj' uses the graded VCDR of adjudicated EIDs as
      the target.
    variants: list of Variant objects whose genotypes become the columns.

  Returns:
    pd.DataFrame from `load_dataset`, containing one column per variant plus
    'target'.

  Raises:
    ValueError: If `split` is not one of the two supported values.
  """
  v_df = variant_container.variant_df(variants)
  if split == 'train':
    target_df = covar_df[['vcdr_visit']].rename(
        columns={'vcdr_visit': 'target'})
    # Keep labelled individuals out of training so the splits never overlap.
    eids = set(covar_df.index) - vcdr_container.eids
  elif split == 'testadj':
    target_df = vcdr_container.as_dataframe().rename(columns={'vcdr': 'target'})
    target_df = target_df.loc[target_df.index.intersection(
        vcdr_container.adjudicated_eids)]
    eids = None
  else:
    raise ValueError(f'Invalid split: {split}')

  return load_dataset(
      v_df=v_df,
      target_df=target_df,
      eids=eids,
      ensure_variation='raise')


def get_X_y(df, y_name='target'):
  """Splits a design DataFrame into a feature matrix and a label vector.

  Args:
    df: pd.DataFrame holding the label column and the feature columns.
    y_name: Name of the label column.

  Returns:
    (X, y) numpy arrays. X holds every column except `y_name`, in their
    original order.

  Raises:
    ValueError: If `y_name` is not a column of `df`.
  """
  feature_names = list(df.columns)
  del feature_names[feature_names.index(y_name)]
  labels = df[y_name].to_numpy()
  features = df[feature_names].to_numpy()
  return features, labels


def _bootstrap_ci(truths, preds, metric_fn, preds2=None):
  """Evaluates `metric_fn` on one joint resample of the inputs.

  When `preds2` is given, the same resample is applied to both prediction
  sets and the difference metric(preds) - metric(preds2) is returned.
  """
  if preds2 is None:
    return metric_fn(*resample(truths, preds))
  boot_truths, boot_preds, boot_preds2 = resample(truths, preds, preds2)
  first_metric = metric_fn(boot_truths, boot_preds)
  second_metric = metric_fn(boot_truths, boot_preds2)
  return first_metric - second_metric


def bootstrap_ci(truths,
                 preds,
                 metric_fn,
                 iters=2000,
                 preds2=None,
                 ci_interval=97.5):
  """Returns bootstrap percentile bounds for a metric.

  Args:
    truths: True labels.
    preds: Predictions scored against `truths`.
    metric_fn: Callable (truths, preds) -> number.
    iters: Number of bootstrap resamples.
    preds2: Optional second set of predictions; when given, the interval is
      for metric(preds) - metric(preds2).
    ci_interval: Upper percentile of the interval. The lower percentile is
      `100 - ci_interval`, so the default of 97.5 gives a 95% interval.

  Returns:
    Array [lower, upper] of the requested percentiles of the bootstrap
    metrics.
  """
  with futures.ThreadPoolExecutor(max_workers=NPROC) as pool:
    pending = [
        pool.submit(_bootstrap_ci, truths, preds, metric_fn, preds2=preds2)
        for _ in range(iters)
    ]
  # Leaving the `with` block waits for every submitted job to finish.
  samples = [job.result() for job in pending]
  lower_percentile = 100 - ci_interval
  return np.percentile(samples, [lower_percentile, ci_interval])  # 95% CI

In [None]:
# Build a train design (ML-based VCDR target) and a 'testadj' design
# (adjudicated human-label target) for each of the five variant sets.
print('### Craig et al 76 SNPs.')
craig_train_design = create_design('train', craig_snps)
craig_test_design = create_design('testadj', craig_snps)

print('\n### Craig et al 76 EPIC-surrogate SNPs.')
craig_epic_surrogate_train_design = create_design('train', craig_epic_surrogate_snps)
craig_epic_surrogate_test_design = create_design('testadj', craig_epic_surrogate_snps)

print('\n### Craig et al 58 EPIC-present SNPs.')
craig_epic_only_train_design = create_design('train', craig_epic_only_snps)
craig_epic_only_test_design = create_design('testadj', craig_epic_only_snps)

print('\n### ML-derived 299 SNPs.')
mlderived_train_design = create_design('train', mlderived_snps)
mlderived_test_design = create_design('testadj', mlderived_snps)

print('\n### ML-derived 282 EPIC-surrogate SNPs.')
mlderived_epic_surrogate_train_design = create_design('train', mlderived_epic_surrogate_snps)
mlderived_epic_surrogate_test_design = create_design('testadj', mlderived_epic_surrogate_snps)

In [None]:
# Sanity check the design matrices.
def check_pair(df1, df2, same=('target',)):
  """Verifies that two design matrices describe the same individuals.

  The frames must have the same number of rows and the same set of index
  labels, and every column they share must hold identical values for each
  index label. Row order is allowed to differ between the two frames.

  Args:
    df1: First pd.DataFrame.
    df2: Second pd.DataFrame.
    same: Column name, or iterable of column names, that must be present in
      both frames.

  Raises:
    ValueError: If any of the checks above fails.
  """
  if isinstance(same, str):
    same = (same,)
  if len(df1) != len(df2):
    raise ValueError(
        f'Number of entries is different: {len(df1)} vs {len(df2)}')
  if set(df1.index) != set(df2.index):
    raise ValueError('Indexes are different.')

  shared_columns = sorted(set(df1.columns) & set(df2.columns))
  missing = [col for col in same if col not in shared_columns]
  if missing:
    raise ValueError(f'Columns {missing} are not present in both frames.')
  print(f'{len(shared_columns)} total shared columns.')

  left = df1[shared_columns]
  right = df2[shared_columns]
  if not left.index.equals(right.index):
    # Same labels in a different order: align rows first, because comparing
    # differently-ordered frames raises an unrelated pandas error.
    right = right.reindex(left.index)
  if (left != right).any().any():
    raise ValueError(f'Expected identical results for {shared_columns}')


# Craig and ML-derived designs for the original variant sets must agree.
check_pair(craig_train_design, mlderived_train_design)
check_pair(craig_test_design, mlderived_test_design)

# Likewise for the EPIC-surrogate variant sets.
check_pair(craig_epic_surrogate_train_design, mlderived_epic_surrogate_train_design)
check_pair(craig_epic_surrogate_test_design, mlderived_epic_surrogate_test_design)

# The EPIC-only Craig designs are compared against the original ML-derived
# designs.
check_pair(craig_epic_only_train_design, mlderived_train_design)
check_pair(craig_epic_only_test_design, mlderived_test_design)

# Train on ML-based VCDR predictions, evaluate on human labels

In [None]:
# Evaluation summary. r_low_95 and r_high_95 are None unless bootstrapping was
# requested.
Stats = collections.namedtuple(
    'Stats', ['mae', 'mse', 'r', 'r_low_95', 'r_high_95', 'pvalue', 'r2'])


def train_and_test(train_df,
                   test_df,
                   title='',
                   model='linear',
                   bootstrap=False):
  """Trains and evaluates a model. Returns (model, truth, preds, stats) tuple.

  Also shows a scatter plot of predictions against labels with the fitted
  regression line drawn over it.

  Args:
    train_df: Training design with a 'target' column.
    test_df: Test design with exactly the same columns as `train_df`.
    title: Prefix for the plot title.
    model: 'linear' for ordinary least squares or 'elastic' for a
      cross-validated elastic net.
    bootstrap: If True, bootstrap the interval for Pearson's R using 3000
      resamples.

  Raises:
    ValueError: If `model` is not a supported value.
  """
  assert list(train_df.columns) == list(test_df.columns)
  train_X, train_y = get_X_y(train_df)
  test_X, test_y = get_X_y(test_df)
  assert set(train_df.index).isdisjoint(set(test_df.index))

  if model not in ('linear', 'elastic'):
    raise ValueError('Unknown model: {}'.format(model))
  if model == 'elastic':
    # l1_ratio == 0 is Ridge regression; l1_ratio == 1 is LASSO.
    reg = linear_model.ElasticNetCV(
        cv=5, l1_ratio=[.1, .5, .7, .9, .95, .99, 1.], n_jobs=-1)
  else:
    reg = linear_model.LinearRegression()
  reg.fit(train_X, train_y)
  preds = reg.predict(test_X)

  pearson_r, pvalue = stats.pearsonr(test_y, preds)
  low_r = high_r = None
  if bootstrap:
    low_r, high_r = bootstrap_ci(
        test_y, preds, lambda x, y: stats.pearsonr(x, y)[0], iters=3000)
    r_str = 'R: {:.5f} ({:.5f}, {:.5f})'.format(pearson_r, low_r, high_r)
  else:
    r_str = 'R: {:.5f}'.format(pearson_r)

  stats_container = Stats(
      mae=metrics.mean_absolute_error(test_y, preds),
      mse=metrics.mean_squared_error(test_y, preds),
      r=pearson_r,
      r_low_95=low_r,
      r_high_95=high_r,
      pvalue=pvalue,
      r2=pearson_r**2)
  summary = '; '.join([
      'MAE: {:.5f}'.format(stats_container.mae),
      'MSE: {:.5f}'.format(stats_container.mse),
      r_str,
      'R^2: {:.5f}'.format(stats_container.r2),
      'P-value: {:.3e}'.format(stats_container.pvalue),
  ])

  # Cross-check the p-value against an independent regression fit.
  lr = stats.linregress(test_y, preds)
  assert abs(pvalue - lr.pvalue) < 1e-6, 'P-value estimates of {} vs {}'.format(
      pvalue, lr.pvalue)
  equation_str = 'y = {:.3f}x + {:.3f}'.format(lr.slope, lr.intercept)

  plt.scatter(test_y, preds, alpha=0.1)
  plt.xlabel('Labeled adjudicated VCDR')
  plt.ylabel('Predicted VCDR')
  line_x = [0.1, 0.9]
  line_y = [lr.intercept + lr.slope * 0.1, lr.intercept + lr.slope * 0.9]
  plt.plot(line_x, line_y)
  plt.title(title + f' ({model})\n' + summary + '\n' + equation_str)
  plt.show()
  return (reg, test_y, preds, stats_container)


def get_design_with_prs(initial_design_df, variants, prs_name):
  """Returns a design holding only 'target' and the PRS, plus the PLINK text.

  Args:
    initial_design_df: Design DataFrame providing the 'target' column and the
      rows to keep.
    variants: list of Variant objects that make up the PRS.
    prs_name: Name of the PRS column.

  Returns:
    (design DataFrame, PLINK score-file string).
  """
  prs_df, plink_text = variant_container.prs(variants, name=prs_name)
  target_only = initial_design_df[['target']]
  design = target_only.join(prs_df)
  assert len(design) == len(initial_design_df)
  return design, plink_text

In [None]:
def eval_all():
  """Trains and evaluates the elastic-net and P+T models for every variant set.

  Returns:
    Dict keyed by '<name> elastic' and '<name> P+T'. Elastic entries are the
    `train_and_test` result tuple; P+T entries are that tuple with the PLINK
    score-file string appended.
  """
  experiments = [
      ('Craig orig', craig_train_design, craig_test_design, craig_snps),
      ('Craig EPIC-only', craig_epic_only_train_design,
       craig_epic_only_test_design, craig_epic_only_snps),
      ('Craig EPIC-surrogate', craig_epic_surrogate_train_design,
       craig_epic_surrogate_test_design, craig_epic_surrogate_snps),
      ('ML-derived orig', mlderived_train_design, mlderived_test_design,
       mlderived_snps),
      ('ML-derived EPIC-surrogate', mlderived_epic_surrogate_train_design,
       mlderived_epic_surrogate_test_design, mlderived_epic_surrogate_snps),
  ]
  results = {}
  for name, train_df, test_df, variants in experiments:
    elastic_title = f'{name} ({train_df.shape[1] - 1}) - eval adjudicated'
    results[f'{name} elastic'] = train_and_test(
        train_df=train_df,
        test_df=test_df,
        title=elastic_title,
        model='elastic',
        bootstrap=True)

    pt_title = f'{name} ({train_df.shape[1] - 1}) - P+T eval adjudicated'
    prs_train_df, train_plink = get_design_with_prs(train_df, variants, 'prs')
    prs_test_df, test_plink = get_design_with_prs(test_df, variants, 'prs')
    # The PLINK text depends only on the variants, so both must match.
    assert train_plink == test_plink
    pt_result = train_and_test(
        train_df=prs_train_df,
        test_df=prs_test_df,
        title=pt_title,
        model='linear',
        bootstrap=True)
    results[f'{name} P+T'] = pt_result + (train_plink,)
  return results


def _one_elastic_prs_str(model, train_df, variants):
  """Returns the PLINK score-file text for a fitted elastic-net model.

  One line is produced per feature column of `train_df`:
  '<chr>:<pos>_<ref>_<alt>\\t<alt>\\t<coefficient>'.

  Args:
    model: Fitted model exposing `coef_`, one coefficient per feature column.
    train_df: Design the model was trained on; every column except 'target'
      is a feature.
    variants: list of Variant objects; each feature column must match exactly
      one of them by name.
  """
  feature_names = [col for col in train_df.columns if col != 'target']
  assert len(model.coef_) == len(feature_names) == len(variants)

  variants_by_name = collections.defaultdict(list)
  for candidate in variants:
    variants_by_name[candidate.name].append(candidate)
  feature_to_variant = {}
  for feature in feature_names:
    matches = variants_by_name.get(feature, [])
    assert len(matches) == 1
    feature_to_variant[feature] = matches[0]

  # The alternate allele is the scored allele.
  feature_to_prefix = {}
  for feature, variant in feature_to_variant.items():
    call = variant_container.get_call(variant)
    feature_to_prefix[feature] = (
        f'{variant.chr}:{variant.pos}_{call.ref}_{call.alt}\t{call.alt}')

  return ''.join(
      f'{feature_to_prefix[feature]}\t{beta}\n'
      for feature, beta in zip(feature_names, model.coef_))


def write_all_prs_outputs(results_dict, outdir='/path/to/output/directory'):
  """Writes the P+T and elastic-net PRS weight files into <outdir>.

  Args:
    results_dict: Mapping from '<name> P+T' / '<name> elastic' keys to result
      tuples. For P+T entries the last element is the PLINK-format string; for
      elastic entries the first element is the fitted model.
    outdir: Directory that receives the .tsv files. The default is a
      placeholder path.
  """
  # P+T models: the file contents were already rendered during evaluation.
  p_plus_t_outputs = (
      # Used in Fig 3, since we can test the equivalent in EPIC-Norfolk.
      ('craig_prs.epic_only_58.p_plus_t.tsv',
       results_dict['Craig EPIC-only P+T'][-1]),
      ('mlbased_prs.epic_surrogates_282.p_plus_t.tsv',
       results_dict['ML-derived EPIC-surrogate P+T'][-1]),
      # Used in suppl fig, since we can only evaluate in UKB.
      ('craig_prs.original_76.p_plus_t.tsv',
       results_dict['Craig orig P+T'][-1]),
      ('mlbased_prs.original_299.p_plus_t.tsv',
       results_dict['ML-derived orig P+T'][-1]),
  )
  for filename, contents in p_plus_t_outputs:
    with open(os.path.join(outdir, filename), 'w') as out_file:
      out_file.write(contents)

  # Elastic net models: weights are rendered from the fitted coefficients.
  elastic_outputs = (
      # Used in Fig 3.
      ('craig_prs.epic_surrogates_76.elastic.tsv',
       craig_epic_surrogate_train_design,
       craig_epic_surrogate_snps,
       results_dict['Craig EPIC-surrogate elastic'][0]),
      ('mlbased_prs.epic_surrogates_282.elastic.tsv',
       mlderived_epic_surrogate_train_design,
       mlderived_epic_surrogate_snps,
       results_dict['ML-derived EPIC-surrogate elastic'][0]),
      # Used in suppl fig, since we can only evaluate in UKB.
      ('craig_prs.epic_only_58.elastic.tsv',
       craig_epic_only_train_design,
       craig_epic_only_snps,
       results_dict['Craig EPIC-only elastic'][0]),
      ('craig_prs.original_76.elastic.tsv',
       craig_train_design,
       craig_snps,
       results_dict['Craig orig elastic'][0]),
      ('mlbased_prs.original_299.elastic.tsv',
       mlderived_train_design,
       mlderived_snps,
       results_dict['ML-derived orig elastic'][0]),
  )
  for filename, design, snps, fitted_model in elastic_outputs:
    with open(os.path.join(outdir, filename), 'w') as out_file:
      out_file.write(_one_elastic_prs_str(fitted_model, design, snps))
  

# Run all the model training and evaluations.
# `all_results` is keyed by '<name> elastic' and '<name> P+T'; the P+T entries
# carry the PLINK-format string as their last element.
all_results = eval_all()

# Set this to True if the PRSes should be written.
# NOTE: write_all_prs_outputs is called with its default `outdir`, which is a
# placeholder path; pass a real directory before enabling this on a new setup.
if True:
  write_all_prs_outputs(all_results)

# Evaluate models based on existing bootstraps and pairwise bootstrapping.

In [None]:
def compare_stats(label, craig_stats, ml_stats):
  """Prints a side-by-side comparison of the Craig and ML-based metrics.

  Args:
    label: Human-readable name of the comparison, used in the header line.
    craig_stats: Object exposing numeric `mae`, `mse`, `r` and `r2` attributes
      for the Craig model.
    ml_stats: Object exposing the same attributes for the ML-based model.
  """
  print(f'Results for the {label} data:')

  # Error metrics: lower is better, so the improvement is Craig minus ours.
  for title, attr in (('Mean absolute error', 'mae'),
                      ('Mean squared error', 'mse')):
    craig = getattr(craig_stats, attr)
    ours = getattr(ml_stats, attr)
    print(
        f'* {title}: Craig {craig:.5f} vs our {ours:.5f}, absolute improvement {craig - ours:.5f}, error reduction of {(1 - ours / craig) * 100:.1f}%'
    )

  # Correlation metrics: higher is better, so the improvement is ours minus
  # Craig.
  for title, attr in (('R', 'r'), ('R^2', 'r2')):
    craig = getattr(craig_stats, attr)
    ours = getattr(ml_stats, attr)
    print(
        f'* {title}: Craig {craig:.5f} vs our {ours:.5f}, absolute improvement {ours - craig:.5f}, prediction improvement of {100 * (ours - craig) / craig:.1f}%'
    )


def paired_bootstrap(results_dict, ds1: str, ds2: str) -> None:
  """Prints a paired-bootstrap CI for the Pearson R gain of <ds1> over <ds2>.

  Args:
    results_dict: Mapping from dataset key to a result tuple whose second and
      third elements are the ground-truth values and the predictions.
    ds1: Key of the model whose improvement is being measured.
    ds2: Key of the baseline model; must share its ground truth with <ds1>.
  """
  truth1, preds1 = results_dict[ds1][1:3]
  truth2, preds2 = results_dict[ds2][1:3]
  # A paired comparison is only meaningful on identical evaluation targets.
  assert (truth1 == truth2).all()

  def _pearson_r(x, y):
    return np.corrcoef(x, y)[0, 1]

  # NOTE(review): the printed label says "95% CI" while ci_interval=97.5 is
  # passed; confirm against bootstrap_ci whether 97.5 denotes the upper
  # percentile of a 95% interval.
  header = f'* Paired R 95% CI improvement of {ds1} over {ds2}: '
  interval = bootstrap_ci(
      truth1,
      preds1,
      metric_fn=_pearson_r,
      iters=3000,
      preds2=preds2,
      ci_interval=97.5)
  print(header, interval)

In [None]:
# For each Craig vs ML-based pairing, print the point-estimate comparison and
# the paired-bootstrap CI of the ML-based improvement over Craig.
for label, craig_key, ml_key in [
    # 58 Craig variants present in EPIC vs 282 ML EPIC surrogates.
    ('EPIC-possible P+T', 'Craig EPIC-only P+T', 'ML-derived EPIC-surrogate P+T'),
    # Original 76 Craig variants vs original 299 ML-based variants.
    ('Original P+T', 'Craig orig P+T', 'ML-derived orig P+T'),
    # 76 Craig surrogates vs 282 ML EPIC surrogates.
    ('EPIC-possible elastic', 'Craig EPIC-surrogate elastic', 'ML-derived EPIC-surrogate elastic'),
    # Original 76 Craig variants vs original 299 ML-based variants.
    ('Original elastic', 'Craig orig elastic', 'ML-derived orig elastic'),
]:
  # Element 3 of each result tuple is the stats object read by compare_stats.
  compare_stats(label=label,
                craig_stats=all_results[craig_key][3],
                ml_stats=all_results[ml_key][3])
  # The ML-based key goes first: the CI is for its improvement over Craig.
  paired_bootstrap(all_results, ml_key, craig_key)
  print()

# Analyze PLINK predictions

In [None]:
def pearson_corr(targets, predictions):
  """Returns the Pearson correlation of <targets> and <predictions>."""
  # pearsonr yields (correlation, p-value); only the correlation is needed.
  correlation, _ = stats.pearsonr(targets, predictions)
  return correlation


def spearman_corr(targets, predictions):
  """Returns the Spearman rank correlation of <targets> and <predictions>."""
  # spearmanr yields (correlation, p-value); only the correlation is needed.
  correlation, _ = stats.spearmanr(targets, predictions)
  return correlation


# Default random seed for the PLINK-prediction analyses in this cell; it is
# the default `seed` of analyze_one_plink_set and analyze_plink_pairs.
_SEED = 23

# Metrics computed for each set of PLINK predictions.
# NOTE(review): `binary_only=False` presumably marks a metric as valid for
# continuous targets; confirm against perf_metrics.Metric.
_METRICS = [
    # Sample count; depends only on the first argument.
    perf_metrics.Metric(
        'num', lambda y_true, y_pred: len(y_true), binary_only=False),
    perf_metrics.Metric('Pearson corr', pearson_corr, binary_only=False),
    perf_metrics.Metric('Spearman corr', spearman_corr, binary_only=False),
]


def load_plink_predictions(source, model):
  """Loads PLINK's score predictions for a (<source>, <model>) combination.

  Args:
    source: Variant-set identifier, e.g. 'craig_orig' or
      'genmed_epic_surrogates'.
    model: Model identifier, either 'p+t' or 'elastic'.

  Returns:
    A DataFrame indexed by 'eid' with a single 'prediction' column, read from
    the matching .profile file under PLINK_PRS_PREDS.

  Raises:
    ValueError: If no profile file is registered for (<source>, <model>).
  """
  profile_by_key = {
      ('craig_orig', 'p+t'): 'craig_prs.original_76.p_plus_t.profile',
      ('craig_orig', 'elastic'): 'craig_prs.original_76.elastic.profile',
      ('craig_epic_only', 'p+t'): 'craig_prs.epic_only_58.p_plus_t.profile',
      ('craig_epic_only', 'elastic'): 'craig_prs.epic_only_58.elastic.profile',
      ('craig_epic_surrogates', 'elastic'): 'craig_prs.epic_surrogates_76.elastic.profile',
      ('genmed_orig', 'p+t'): 'mlbased_prs.original_299.p_plus_t.profile',
      ('genmed_orig', 'elastic'): 'mlbased_prs.original_299.elastic.profile',
      ('genmed_epic_surrogates', 'p+t'): 'mlbased_prs.epic_surrogates_282.p_plus_t.profile',
      ('genmed_epic_surrogates', 'elastic'): 'mlbased_prs.epic_surrogates_282.elastic.profile',
  }
  key = (source, model)
  try:
    profile_name = profile_by_key[key]
  except KeyError:
    raise ValueError(
        f'Unknown source ({source}) and model ({model}) combination.')

  profile_path = os.path.join(PLINK_PRS_PREDS, profile_name)

  # Only the individual id and score columns of the profile file are read.
  scores = pheno_utils.load_csv(
      profile_path,
      index_col=None,
      delim_whitespace=True,
      usecols=['IID', 'SCORE'])
  scores.rename(columns={'IID': 'eid', 'SCORE': 'prediction'}, inplace=True)

  return scores.set_index('eid')


def analyze_one_plink_set(source, model, seed=_SEED):
  """Evaluates one set of PLINK predictions against the adjudicated VCDR.

  Args:
    source: Variant-set identifier accepted by load_plink_predictions.
    model: Model identifier accepted by load_plink_predictions.
    seed: Seed forwarded to the bootstrap in PerformanceMetrics.compute.

  Returns:
    Whatever PerformanceMetrics.compute returns for the _METRICS list, using
    2000 bootstrap samples.
  """
  predictions = load_plink_predictions(source, model)

  # Keep only the targets of individuals in the adjudicated set.
  all_targets = vcdr_container.as_dataframe()
  adjudicated_index = all_targets.index.intersection(
      vcdr_container.adjudicated_eids)
  adjudicated_targets = all_targets.loc[adjudicated_index]

  # The inner join keeps individuals with both a prediction and a target.
  joined = pd.merge(
      predictions,
      adjudicated_targets,
      left_index=True,
      right_index=True,
      how='inner')

  evaluator = perf_metrics.PerformanceMetrics(
      name=f'{source} - {model}', metrics=_METRICS)
  # NOTE(review): predictions are passed before the targets, while the metric
  # callables name their arguments (y_true, y_pred); the metrics used here give
  # the same value either way, but confirm compute's argument order.
  return evaluator.compute(
      joined['prediction'].to_numpy(copy=True),
      joined['vcdr'].to_numpy(copy=True),
      n_bootstrap=2000,
      seed=seed)


def get_permute_pvalue(df, col_1, col_2, ground_truth, n_permute=2000, seed=42):
  """Prints a one-sided permutation P for <col_1> preds beating <col_2>.

  The test statistic is corr(<col_1>, truth) - corr(<col_2>, truth). The null
  distribution is built by randomly swapping, per individual, which of the two
  predictions is attributed to which model. The observed statistic is included
  in the null distribution, so the reported value is
  (1 + #{permuted >= observed}) / (n_permute + 1) and can never be zero.

  Args:
    df: DataFrame holding both prediction columns and the ground truth.
    col_1: Column with the predictions hypothesised to be better.
    col_2: Column with the baseline predictions.
    ground_truth: Column with the target values.
    n_permute: Number of random permutations to draw.
    seed: Seed for the permutation random number generator.

  Returns:
    None; the p-value is printed. NaN is printed when the observed statistic is
    undefined (e.g. a constant column).
  """

  def _corr_diff(p1, p2, gt):
    """Computes difference in correlation."""
    corr_1 = np.corrcoef(p1, gt)[0, 1]
    corr_2 = np.corrcoef(p2, gt)[0, 1]
    return corr_1 - corr_2

  prng = np.random.RandomState(seed=seed)
  n = df.shape[0]
  p1 = df[col_1].to_numpy(copy=True)
  p2 = df[col_2].to_numpy(copy=True)
  gt = df[ground_truth].to_numpy(copy=True)

  # Get the observed corr diff.
  diff_obs = _corr_diff(p1, p2, gt)

  # One extra slot holds the observed corr diff so that P is never zero. It
  # needs its own slot: with only n_permute slots the last permutation would
  # overwrite it.
  diffs = np.empty(n_permute + 1)
  diffs[-1] = diff_obs

  # Concatenate the two preds: p[i] is p1[i] and p[i + n] is p2[i].
  p = np.hstack([p1, p2])
  base_idx = np.arange(n)

  for i in range(n_permute):
    # For each individual, randomly select either their `p1` or `p2` value
    # to be used in the permuted p1 vector, and use the other value in the
    # permuted p2 vector.
    rand_indicator = prng.choice([0, n], size=n)
    idx_1 = base_idx + rand_indicator
    idx_2 = base_idx - rand_indicator + n
    diffs[i] = _corr_diff(p[idx_1], p[idx_2], gt)

  # One-sided permutation p-value. `>=` counts ties (including the observed
  # value itself) as at least as extreme, so fully tied predictions give P = 1
  # instead of the smallest attainable value.
  if np.isnan(diff_obs):
    P = np.nan
  else:
    P = np.mean(diffs >= diff_obs)
  print(f'one-sided p-value for {col_1} > {col_2}')
  print('p-value <= {:.3e}\n'.format(P))


def get_corr_ci(df, col_1, col_2, gt, n_bootstrap=2000, seed=42):
  """Prints bootstrap summaries of two correlations and their difference.

  Rows of <df> are resampled with replacement <n_bootstrap> times. For each
  resample the correlation of each prediction column with the ground truth is
  computed, together with their difference (<col_1> minus <col_2>).

  Args:
    df: DataFrame holding both prediction columns and the ground truth.
    col_1: Name of the first prediction column.
    col_2: Name of the second prediction column.
    gt: Name of the ground-truth column.
    n_bootstrap: Number of bootstrap resamples.
    seed: Seed for the resampling random number generator.

  Returns:
    None; the mean, standard deviation and 2.5-97.5 percentile range of each
    quantity are printed.
  """
  rng = np.random.RandomState(seed=seed)
  num_rows = df.shape[0]

  preds_1 = df[col_1].to_numpy(copy=True)
  preds_2 = df[col_2].to_numpy(copy=True)
  truth = df[gt].to_numpy(copy=True)

  # One row per resample: [corr_1, corr_2, corr_1 - corr_2].
  samples = np.empty((n_bootstrap, 3))
  for row in samples:
    resampled = rng.randint(0, high=num_rows, size=num_rows)
    r_1 = np.corrcoef(truth[resampled], preds_1[resampled])[0, 1]
    r_2 = np.corrcoef(truth[resampled], preds_2[resampled])[0, 1]
    row[:] = (r_1, r_2, r_1 - r_2)

  print(f'CI for {col_1} > {col_2}')
  for name, column in zip((col_1, col_2, 'diff'), samples.T):
    lower, upper = np.percentile(column, [2.5, 97.5])
    print('{}\tm: {:.3f} s: {:.3f} cl: {:.3f}-{:.3f}'.format(
        name, np.mean(column), np.std(column), lower, upper))
  print()


def analyze_plink_pairs(comparison, n_bootstrap=2000, n_permute=10000, seed=_SEED):
  """Compares the ML-based ('genmed') and Craig PLINK predictions.

  Args:
    comparison: One of 'fig3 p+t', 'fig3 elastic', 'orig p+t', 'orig elastic'.
    n_bootstrap: Number of bootstrap resamples for the correlation CIs.
    n_permute: Number of permutations for the one-sided p-value.
    seed: Seed used for both the bootstrap and the permutation test.

  Returns:
    None; the CIs and the p-value are printed by the helpers.

  Raises:
    KeyError: If <comparison> is not one of the supported names.
  """
  all_targets = vcdr_container.as_dataframe()
  merged = all_targets.loc[all_targets.index.intersection(
      vcdr_container.adjudicated_eids)]

  # For every comparison the ML-based predictions come first and the Craig
  # baseline second, so the one-sided test asks whether ML-based is better.
  comparison_to_preds = {
      'fig3 p+t': [('genmed_epic_surrogates', 'p+t'), ('craig_epic_only', 'p+t')],
      'fig3 elastic': [('genmed_epic_surrogates', 'elastic'), ('craig_epic_surrogates', 'elastic')],
      'orig p+t': [('genmed_orig', 'p+t'), ('craig_orig', 'p+t')],
      'orig elastic': [('genmed_orig', 'elastic'), ('craig_orig', 'elastic')],
  }

  column_names = []
  for source, model in comparison_to_preds[comparison]:
    column = f'{source}_{model}'
    column_names.append(column)
    renamed = load_plink_predictions(source, model).rename(
        columns={'prediction': column})
    # Inner joins keep only individuals present in every prediction file.
    merged = pd.merge(
        merged, renamed, left_index=True, right_index=True, how='inner')

  ml_column, craig_column = column_names
  get_corr_ci(
      merged, ml_column, craig_column, 'vcdr', n_bootstrap=n_bootstrap,
      seed=seed)
  get_permute_pvalue(
      merged, ml_column, craig_column, 'vcdr', n_permute=n_permute, seed=seed)

In [None]:
# Individual metrics.
def compute_all_individual_metrics():
  """Computes and prints the metrics of every PLINK prediction set.

  Returns:
    A dict mapping each (source, model) pair to the result of
    analyze_one_plink_set for that pair.
  """
  source_model_pairs = (
      ('craig_orig', 'p+t'),
      ('craig_orig', 'elastic'),
      ('craig_epic_only', 'p+t'),
      ('craig_epic_only', 'elastic'),
      ('craig_epic_surrogates', 'elastic'),
      ('genmed_orig', 'p+t'),
      ('genmed_orig', 'elastic'),
      ('genmed_epic_surrogates', 'p+t'),
      ('genmed_epic_surrogates', 'elastic'),
  )
  results = {
      (source, model): analyze_one_plink_set(source, model)
      for source, model in source_model_pairs
  }

  # Report in sorted key order so the printout is stable.
  for (source, model), metrics_dict in sorted(results.items()):
    print(f'{source} {model}:')
    for metric_name, result in sorted(metrics_dict.items()):
      print(perf_metrics._build_metric_str(metric_name, result))

  return results


# Compute and print the per-model metrics; keyed by (source, model).
indiv_metrics = compute_all_individual_metrics()

In [None]:
# Pairwise comparisons.
# n_permute=100000 overrides the analyze_plink_pairs default of 10000, which
# lowers the smallest p-value the permutation test can report.
for comparison in ['fig3 p+t', 'fig3 elastic', 'orig p+t', 'orig elastic']:
  analyze_plink_pairs(comparison, n_bootstrap=2000, n_permute=100000)