```
Copyright 2021 Google LLC.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors
   may be used to endorse or promote products derived from this software
   without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```




## Phenotype Calling

In this notebook, we load the model predictions, process them, call the phenotypes, join them with covariates and then store the final phenotypes.

In [None]:
import datetime
import functools
import os
import csv

import numpy as np
import pandas as pd
import scipy as sp

# Module defined within this repository.
import pheno_utils

# Constants

In [None]:
# "age" is age at the first visit (visit 0)
# "age_1" is age at the second visit (visit 1)
# Base covariates followed by the genetic principal components pc1..pc15.
COVARS = ['age', 'age_1', 'sex', 'genotype_array_enum', 'refractive_error']
COVARS.extend('pc{}'.format(i) for i in range(1, 16))

# The prediction file should have the following columns. Categorical
# predictions are encoded as {outcome}:{category}; their values should be in
# [0, 1] and the values of all categories of an outcome should sum to 1.
# 'vertical_cup_to_disc:VERTICAL_CUP_TO_DISC' is a number in [0, 1].
PRED_COLS = ['eid', 'image_id'] + [
    '{}:{}'.format(outcome, category)
    for outcome, categories in (
        ('glaucoma_gradability',
         ('GRADABLE', 'UNGRADABLE', 'WITH_DIFFICULTY')),
        ('vertical_cup_to_disc', ('VERTICAL_CUP_TO_DISC',)),
        ('vertical_cd_visibility',
         ('SUFFICIENT', 'COMPROMISED', 'UNABLE_TO_ASSESS')),
        ('glaucoma_suspect_risk',
         ('HIGH_RISK', 'LIKELY', 'LOW_RISK', 'NON_GLAUCOMATOUS')),
    )
    for category in categories
]

# Covariates table: one row per <eid>, with the <COVARS> columns plus 'eid'.
COVARIATES_FILE = '/path/to/file'

# File listing the EIDs of European-ancestry individuals.
EUROPEAN_EID_FILE = '/path/to/file'

# Model predictions: one row per <image_id>, with the <PRED_COLS> columns.
PREDS_FILE = '/path/to/file'

# Destination of the final phenotype + covariate table.
OUTPUT_FILE = '/path/to/file'

# Visits considered when calling VCDR.
_VISIT_IDS = [0, 1]

# Eyes considered when calling VCDR.
# 1: left, 2: right
_EYES = [1, 2]

# Helper Functions

In [None]:
def _nan_date(datestr):
  """Parses a 'YYYY-MM-DD' string into a datetime, or NaN for non-strings.

  Args:
    datestr: The date string to parse. A non-string value (for example a
      float NaN standing in for a missing date) makes `strptime` raise
      TypeError, which is mapped to NaN here.

  Returns:
    A `datetime.datetime`, or `np.nan` when `datestr` is not a string.
    A string that does not match the format still raises ValueError.
  """
  try:
    parsed = datetime.datetime.strptime(datestr, '%Y-%m-%d')
  except TypeError:
    return np.nan
  return parsed


def compute_age_1(df):
  """Returns a series corresponding to age_1 (age at the second visit).

  Values already present in 'age_1_raw' are kept. Missing values are filled
  with 'age' plus a whole number of years: the gap between the two visit
  dates when both are known, otherwise the gap between the first visit date
  and a fixed reference date for the second visit.

  Args:
    df: DataFrame with the columns 'age', 'age_1_raw', 'date_visit_0' and
      'date_visit_1'. The date columns hold '%Y-%m-%d' strings (parsed with
      `_nan_date`).

  Returns:
    A copy of 'age_1_raw' with every missing entry filled in.

  Raises:
    AssertionError: If any entry is still missing after imputation.
  """
  retval = df['age_1_raw'].copy()
  visit0_date = df['date_visit_0'].apply(_nan_date)
  visit1_date = df['date_visit_1'].apply(_nan_date)
  # Pick the midpoint between start of 2012 and end of 2013.
  # Empirically, the mean date of the second visit is 2013/01/17,
  # so this is reasonable.
  visit1_mean_date = datetime.datetime(2013, 1, 1)
  # One mean Gregorian year (365.2425 days). This is the length that the
  # ambiguous `np.timedelta64(1, 'Y')` unit stood for; recent pandas versions
  # refuse that unit in timedelta arithmetic, so it is spelled out explicitly.
  one_year = pd.Timedelta(days=365, hours=5, minutes=49, seconds=12)
  known_date_delta = ((visit1_date - visit0_date) / one_year).round()
  mean_date_delta = ((visit1_mean_date - visit0_date) / one_year).round()

  needed_mask = df['age_1_raw'].isna()
  # Prefer the real gap between visits; fall back to the reference date only
  # where the real gap could not be computed.
  from_known_mask = needed_mask & known_date_delta.notna()
  from_imputed_mask = needed_mask & ~from_known_mask
  print('Inferring {} ages, {} from both visits and {} from first only.'.format(
      needed_mask.sum(), from_known_mask.sum(), from_imputed_mask.sum()))
  retval[from_known_mask] = (df['age'] + known_date_delta)[from_known_mask]
  retval[from_imputed_mask] = (df['age'] + mean_date_delta)[from_imputed_mask]
  assert retval.isna().sum() == 0
  return retval


def load_eid_file(filename):
  """Returns a set of EIDs from a CSV file with an 'eid' column.

  Args:
    filename: Path of a CSV file whose header contains an 'eid' column.

  Returns:
    The set of integer EIDs found in the file (duplicates collapse).

  Raises:
    ValueError: If a row has no 'eid' field or its value is not an integer.
  """
  retval = set()
  # The csv module expects files to be opened with newline=''.
  with open(filename, newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
      try:
        eid = int(row['eid'])
      except (KeyError, TypeError, ValueError) as err:
        # TypeError covers a short row, where DictReader fills the missing
        # field with None.
        raise ValueError(
            "Row must contain an integer 'eid' field: {}".format(row)) from err
      retval.add(eid)
  return retval


def call_linear_risk(df_all,
                     risk_col,
                     filter_col=None,
                     filter_threshold=0.7,
                     base_cols=None,
                     agg_op='max',
                     pheno_name=None):
  """Call linear risk for the given risk type, filtering if needed.

  Args:
    df_all: DataFrame of predictions. It is copied, not modified.
    risk_col: Name of the column holding the risk to aggregate.
    filter_col: Optional column used to filter rows before aggregation.
    filter_threshold: Lower bound passed to the filter for `filter_col`.
    base_cols: Columns kept next to `risk_col`. Defaults to
      ['eid', 'file_name'].
    agg_op: Aggregation to apply, either 'max' or 'avg'.
    pheno_name: Optional name to rename `risk_col` to in the result.

  Returns:
    The aggregated DataFrame, with `risk_col` renamed to `pheno_name` when
    the latter is given.

  Raises:
    ValueError: If `agg_op` is neither 'max' nor 'avg'.
  """
  name = pheno_name if pheno_name is not None else risk_col
  print('Calling pheno for: {}'.format(name), flush=True)

  # Fail fast on a bad aggregation name, before copying or filtering.
  if agg_op not in ('max', 'avg'):
    raise ValueError("agg_op can be either 'max' or 'avg'")

  df = df_all.copy()

  if base_cols is None:
    base_cols = ['eid', 'file_name']

  cols_to_keep = [risk_col] + base_cols

  if filter_col is not None:
    df = pheno_utils.drop_col_below_threshold(
        df, filter_col, lower_bound=filter_threshold)

  df = df[cols_to_keep]
  # NOTE(review): take_max / take_avg are not defined in this notebook; they
  # are assumed to live in pheno_utils next to drop_col_below_threshold --
  # confirm.
  if agg_op == 'max':
    df = pheno_utils.take_max(df, risk_col).drop(columns=['file_name'])
  else:  # agg_op == 'avg'
    df = pheno_utils.take_avg(df, risk_col).reset_index()

  if pheno_name is not None:
    df = df.rename(columns={risk_col: pheno_name})
  return df


def call_per_visit_phenotype(images, add_pheno):
  """Groups image predictions by visit and eye, then calls the phenotype.

  Args:
    images: A list of pairs, one per image. The first element is the image
      name, from which the eye and visit are extracted; the second element is
      the predicted phenotype for that image.
    add_pheno: A function that takes the {visit: {eye: [predictions]}}
      dictionary and returns a tuple of the phenotype and its covariates.

  Returns:
    Whatever `add_pheno` returns: a tuple of the phenotype and its covariates.
  """
  by_visit_and_eye = {}
  for image in images:
    _, eye, visit, _ = extract_attr(image[0])
    # Create the nested containers on first sight of a visit / eye.
    by_visit_and_eye.setdefault(visit, {}).setdefault(eye, []).append(image[1])

  return add_pheno(by_visit_and_eye)


def pheno_visit(phenos):
  """Aggregates the predictions of the first visit that has any.

  Visits are tried in `_VISIT_IDS` order, so visit 0 is preferred to visit 1.

  Args:
    phenos: Dictionary of per visit per eye predictions.

  Returns:
    A triple: 1) the mean over eyes (restricted to `_EYES`) of each eye's mean
      prediction, 2) the visit that was used, and 3) the number of eye entries
      recorded for that visit.

  Raises:
    ValueError: If none of the visits in `_VISIT_IDS` is present.
  """
  chosen_visit = next((v for v in _VISIT_IDS if v in phenos), None)
  if chosen_visit is None:
    raise ValueError('No data in any visit')

  per_eye = phenos[chosen_visit]
  eye_means = [np.mean(per_eye[eye]) for eye in _EYES if eye in per_eye]
  return np.mean(eye_means), chosen_visit, len(per_eye)


def pheno_visit_eye(phenos, visit, eye):
  """Aggregates the predictions of one specific visit and eye.

  Args:
    phenos: Dictionary of per visit per eye predictions.
    visit: The visit to aggregate.
    eye: The eye to aggregate.

  Returns:
    A pair: the mean prediction for that visit and eye, and the number of
      images that went into it. When there is no entry for the visit/eye
      combination the pair is (NaN, 0).
  """
  eye_predictions = phenos.get(visit, {}).get(eye)
  if eye_predictions is None:
    return np.nan, 0
  return np.mean(eye_predictions), len(eye_predictions)


def get_pheno_df(images_df, pheno, columns, add_pheno_func):
  """Builds the per-EID phenotype DataFrame from the per-image DataFrame.

  Args:
    images_df: The DataFrame of images; needs 'eid', 'file_name' and the
      `pheno` column. It is copied, not modified.
    pheno: The name of the phenotype column to aggregate.
    columns: Names for the phenotype and its covariates. They must line up
      with the tuple returned by `add_pheno_func`.
    add_pheno_func: A function that takes the dictionary of visits and eyes
      and returns a tuple of the phenotype and its covariates.

  Returns:
    The phenotype DataFrame to be merged on EID with other phenotypes.
  """
  images = images_df.copy()

  # Working on one column is much faster than working row by row, so pack
  # everything needed per image (file name, prediction) into one helper column
  # and derive the phenotype and covariates from it afterwards.
  images['filename_pheno'] = list(zip(images['file_name'], images[pheno]))

  # One row per EID, holding the list of that EID's (file name, prediction)
  # pairs.
  per_eid = images[['eid', 'filename_pheno']]
  per_eid = per_eid.groupby('eid').agg(list).reset_index()

  # Call the phenotype for each EID and spread the returned tuples over the
  # requested columns in a single assignment.
  call_pheno = functools.partial(
      call_per_visit_phenotype, add_pheno=add_pheno_func)
  called = per_eid['filename_pheno'].apply(call_pheno).tolist()
  per_eid[columns] = pd.DataFrame(called)

  # The helper column has served its purpose.
  return per_eid.drop(columns='filename_pheno')

# Extract Covariates

In [None]:
# Read the tab-separated covariates table.
df_covar_all = pheno_utils.load_csv(COVARIATES_FILE, sep='\t')

# Derive the age at visit 1 so that ages are available for both visits, then
# keep only 'eid' and the covariate columns.
df_covar_all['age_1'] = compute_age_1(df_covar_all)
df_covar = df_covar_all[['eid', *COVARS]]

# Sanity check: nobody is younger at the second visit than at the first.
assert df_covar_all['age_1'].ge(df_covar_all['age']).all()

# Set of European-ancestry EIDs.
euro_eids = load_eid_file(EUROPEAN_EID_FILE)

# Load and preprocess predictions

In [None]:
# Load model predictions for all UKB images. The path constant defined in the
# Constants cell is PREDS_FILE.
preds_all = pheno_utils.load_predictions(
    preds_csv=PREDS_FILE, cols_to_use=PRED_COLS)

# Return value is a quadruple of:
# EID, eye in [1, 2], visit in [0, 1], index of the image.
extract_attr = pheno_utils.extract_attributes_from_image_path

# Extract the EIDs encoded in the file names and check that they agree with
# the 'eid' column of the predictions.
image_eids = preds_all['file_name'].apply(
    lambda filename: extract_attr(filename)[0])
pd.testing.assert_series_equal(preds_all['eid'], image_eids, check_names=False)

# extract left/right eye
preds_all['eye'] = preds_all['file_name'].apply(
    lambda filename: extract_attr(filename)[1])

# extract visit id
preds_all['visit'] = preds_all['file_name'].apply(
    lambda filename: extract_attr(filename)[2])

In [None]:
# Mark every prediction row whose eid is in the European-ancestry set.
preds_all['is_euro'] = preds_all['eid'].isin(euro_eids)

# Drop ungradable images right away ('gradability' lower bound of 0.7).
preds_all = pheno_utils.drop_col_below_threshold(
    preds_all, 'gradability', lower_bound=0.7)

# Keep the European-ancestry rows; 'is_euro' is already a boolean mask.
preds_euro = preds_all[preds_all['is_euro']]

# Call Phenotypes

## Glaucoma liability phenotype

In [None]:
# Specification of the linear-risk phenotypes to call: output label, risk
# column, and the column / threshold used to filter images beforehand.
linear_risk_phenos = [
    dict(
        label='glaucoma_risk',
        risk_col='glaucoma_risk',
        filter_col='gradability',
        filter_threshold=0.7,
    ),
]

In [None]:
# 'Master' dataframe with one row per distinct EID; every phenotype called
# below is merged onto it.
phenos_euro = preds_euro[['eid']].drop_duplicates().copy()

In [None]:
# `import scipy as sp` alone does not guarantee that the `special` submodule
# is loaded on older SciPy releases, so import it explicitly before using
# `sp.special.logit` below.
import scipy.special

# Call linear risk phenos and merge each onto the master dataframe.
for pheno in linear_risk_phenos:
  pheno_df = call_linear_risk(
      preds_euro,
      pheno['risk_col'],
      filter_col=pheno['filter_col'],
      filter_threshold=pheno['filter_threshold'],
      # 'agg_op' is optional in the spec and defaults to 'max'.
      agg_op=pheno.get('agg_op', 'max'),
      pheno_name=pheno['label'])
  phenos_euro = phenos_euro.merge(pheno_df, on='eid', how='left')

# Compute glaucoma risk logit
phenos_euro['glaucoma_liability'] = sp.special.logit(
    phenos_euro['glaucoma_risk'])

## VCDR per-visit phenotype



In [None]:
# For VCDR, drop the images with low 'vertical_cd_visibility' (lower bound of
# 0.7). The helper is given a copy of `preds_euro` -- presumably so that the
# original frame is left untouched; confirm against pheno_utils.
preds_euro_vcdr = pheno_utils.drop_col_below_threshold(
    preds_euro.copy(), 'vertical_cd_visibility', lower_bound=0.7)

In [None]:
# Per-visit phenotypes to call: output label and the prediction column it is
# derived from.
# Note: gradability_visit is the covariate for the VCDR GWAS.
visit_phenos = [
    dict(label='vcdr_visit', pheno='vertical_cup_to_disc'),
    dict(label='gradability_visit', pheno='gradability'),
]

In [None]:
# Becomes True once the shared covariates ('visit', 'num_eyes' and the per-eye
# image counts) have been merged, so later phenotypes do not add them again.
visit_and_eyes_set = False

for visit_pheno in visit_phenos:
  label, pheno = visit_pheno['label'], visit_pheno['pheno']

  # Per-visit phenotype.
  print('Adding {} phenotype...'.format(label), flush=True)
  per_visit_columns = [label, 'visit', 'num_eyes']
  pheno_df = get_pheno_df(
      preds_euro_vcdr, pheno, per_visit_columns, pheno_visit)
  # The visit and num_eyes covariates are the same for all 'visit' phenotypes.
  if visit_and_eyes_set:
    pheno_df = pheno_df.drop(columns=['visit', 'num_eyes'])
  phenos_euro = phenos_euro.merge(pheno_df, on='eid', how='left')

  # Per-visit, per-eye phenotypes.
  for visit in _VISIT_IDS:
    for eye in _EYES:
      ve = '{}_eye{}'.format(visit, eye)
      # `label` already ends in '_visit', so `ve` is appended as is; the
      # covariate name gets an explicit 'visit' prefix instead.
      pheno_name = '{}{}'.format(label, ve)
      covariate_name = 'visit{}_num_images'.format(ve)

      print('Adding {} phenotype...'.format(pheno_name), flush=True)
      per_eye_func = functools.partial(pheno_visit_eye, visit=visit, eye=eye)
      pheno_df = get_pheno_df(
          preds_euro_vcdr, pheno, [pheno_name, covariate_name], per_eye_func)
      if visit_and_eyes_set:
        pheno_df = pheno_df.drop(columns=[covariate_name])
      phenos_euro = phenos_euro.merge(pheno_df, on='eid', how='left')

  # All covariates are in place after the first phenotype.
  visit_and_eyes_set = True

# Prepare and write pheno-covar file


In [None]:
# Inner-join the phenotypes with the covariates on EID. On a column-name
# clash the phenotype column keeps its name and the covariate column gets a
# '_covar' suffix.
phenos_euro = pd.merge(
    phenos_euro, df_covar, on='eid', how='inner', suffixes=('', '_covar'))

In [None]:
# New covariate: the age at the visit that was used for `vcdr_visit`.
# NOTE(review): rows whose 'visit' is missing fail the `== 0` test and so
# receive 'age_1' rather than NaN -- confirm this is intended.
used_first_visit = phenos_euro['visit'] == 0
phenos_euro['visit_age'] = np.where(used_first_visit, phenos_euro['age'],
                                    phenos_euro['age_1'])

has_vcdr_visit = phenos_euro['vcdr_visit'].notna()
has_visit_age = phenos_euro['visit_age'].notna()

print('Num individuals with `vcdr_visit`:', len(phenos_euro[has_vcdr_visit]))
print('Num individuals with `vcdr_visit` and `visit_age`:',
      len(phenos_euro[has_vcdr_visit & has_visit_age]))

In [None]:
# Write the final phenotype + covariate table as CSV, without the index.
# A file object handed to `to_csv` should be opened with newline='' so that
# the platform's newline translation does not alter the line endings pandas
# writes.
with open(OUTPUT_FILE, 'w', newline='') as fw:
  phenos_euro.to_csv(fw, index=False)