# Sample notebook demonstrating cluster generation, selection, and evaluation

This notebook demonstrates how to utilize the provided open-source code on sample data to generate cluster-derived machine learned features while controlling for baseline features, and how to evaluate the models derived from utilizing these features.

In [3]:
import os
import math

import numpy as np
import pandas as pd
import sklearn

import cluster_utils
import data_utils

# Constants

In [4]:
N_CASES = 1000
EMBEDDING_DIM = 4
PATCHES_PER_CASE = 1000
K = 200
CLUSTER_COLS = list(range(K))
N_TOP_CLUSTERS = 5
LABEL_COL = 'lnm'

# Data Generation

## Metadata

In [5]:
# Generate random metadata
rs = np.random.RandomState(0)

BASELINE_COLS = [
  'age_bins',
  'sex',
  't_stage',
  'grade',
  'venous_inv',
  'lymphovascular_inv'
]

df = pd.DataFrame({
    'case_id': [f'case_{i}' for i in range(N_CASES)],
    'split': rs.choice(['train', 'validation', 'test'], p=[0.6, 0.2, 0.2], size=N_CASES),
    'age': rs.randint(low=50, high=90, size=N_CASES),
    'sex': rs.choice(['M', 'W'], size=N_CASES),
    't_stage': rs.choice(['T3', 'T4'], size=N_CASES),
    'grade': rs.choice(['G1', 'G2', 'G3'], size=N_CASES),
    'venous_inv': rs.choice([0, 1], size=N_CASES),
    'lymphovascular_inv': rs.choice([0, 1], size=N_CASES),
})

df['age_bins'] = df['age'].apply(data_utils.bin_age)
df, BASELINE_COLS = data_utils.prep_features(df, BASELINE_COLS, ['case_id', 'split'])

In [6]:
df.head()

Unnamed: 0,case_id,split,venous_inv,lymphovascular_inv,age_bins_60-69,age_bins_70-79,age_bins_>=80,sex_W,t_stage_T4,grade_G2,grade_G3
0,case_0,train,1,0,1,0,0,0,0,0,1
1,case_1,validation,0,1,0,1,0,0,0,0,0
2,case_2,validation,1,0,0,0,0,1,1,0,1
3,case_3,train,1,1,0,1,0,0,0,1,0
4,case_4,train,1,0,1,0,0,0,1,0,1


## Embeddings

In [7]:
rs = np.random.RandomState(0)
embeddings = {}
for i in range(N_CASES):
  embeddings[f'case_{i}'] = rs.uniform(size=[PATCHES_PER_CASE, EMBEDDING_DIM])

## Cluster Quantitations

In [8]:
# Fit kmeans model on train embeddings
train_ids = set(df[df['split'] == 'train']['case_id'])
train_embeddings = {case_id: emb for case_id, emb in embeddings.items() if case_id in train_ids}
kmeans_model = cluster_utils.train_k_means_model(train_embeddings, K)

Embeddings shape: (610000, 4)


In [9]:
# Compute cluster quantitations
df_cq = cluster_utils.get_cluster_quantitation_df(embeddings, kmeans_model)

In [10]:
# Standardize cluster quantitations
df_cq_train = cluster_utils.get_cluster_quantitation_df(train_embeddings, kmeans_model)
scaler = sklearn.preprocessing.StandardScaler().fit(df_cq_train[CLUSTER_COLS])
df_cq[CLUSTER_COLS] = scaler.transform(df_cq[CLUSTER_COLS])

## Labels

In [11]:
# Simulate lables and associations with baseline features and cluster quantitations
def get_label(r):
  rs = np.random.RandomState(int(r['case_id'][-1]))

  logit = 0
  logit += 0.5 * r['venous_inv']
  logit += -0.5 * r['age_bins_60-69']
  logit += -1.0 * r['age_bins_70-79']
  logit += -1.0 * r['age_bins_>=80']
  logit += 0.1 * r['sex_W']
  logit += 0.5 * r['t_stage_T4']
  logit += 0.1 * r['grade_G2']
  logit += 0.5 * r['grade_G3']
  logit += 1.0 * r['lymphovascular_inv']

  # Clusters of interest
  logit += 1.0 * r[0]
  logit += -1.0 * r[1]
  logit += 1.0 * r[2]
  logit += -1.0 * r[3]
  logit += 1.0 * r[4]
  prob = 1 / (1 + math.exp(-logit))
  return rs.binomial(1, prob)

In [12]:
df = df.merge(df_cq, on='case_id')
df[LABEL_COL] = df.apply(get_label, axis=1)
df[LABEL_COL].value_counts()

1    575
0    425
Name: lnm, dtype: int64

In [13]:
# Cluster of interest
sklearn.metrics.roc_auc_score(df[LABEL_COL], df[0])

0.6263427109974424

In [14]:
# Non-informative cluster
sklearn.metrics.roc_auc_score(df[LABEL_COL], df[5])

0.5419846547314578

# Select top clusters

In [15]:
df_train = df.query("split=='train'")
df_valid = df.query("split=='validation'")
df_test = df.query("split=='test'")

In [16]:
df_cluster = cluster_utils.select_top_clusters(
    df_train=df_train,
    df_valid=df_valid,
    label_col=LABEL_COL,
    baseline_cols=BASELINE_COLS,
    cluster_cols=CLUSTER_COLS,
    n=N_TOP_CLUSTERS
)
df_cluster

Unnamed: 0,order,cluster_id,auc
0,0,4,0.682931
1,1,3,0.757471
2,2,1,0.802241
3,3,0,0.827816
4,4,2,0.841609


In [17]:
TOP_CLUSTERS = list(df_cluster['cluster_id'])
TOP_CLUSTERS

[4, 3, 1, 0, 2]

# Eval top clusters

### Analysis

#### Likelihood ratio test

In [18]:
p_value = cluster_utils.likelihood_ratio_test(
    df=df_test,
    label_col=LABEL_COL,
    baseline_cols=BASELINE_COLS,
    cluster_cols=TOP_CLUSTERS
)
p_value

3.326469144250546e-12

#### Evaluate multivariate odds ratios

In [19]:
odds_ratios = cluster_utils.get_odds_ratios_p_values(
    df=df_test,
    label_col=LABEL_COL,
    baseline_cols=BASELINE_COLS,
    cluster_cols=TOP_CLUSTERS
)
odds_ratios

Unnamed: 0,OR,p
venous_inv,"1.19 [0.59, 2.39]",0.622
lymphovascular_inv,"3.45 [1.69, 7.02]",<0.001
age_bins_60-69,"0.67 [0.27, 1.64]",0.379
age_bins_70-79,"0.42 [0.17, 1.06]",0.067
age_bins_>=80,"0.36 [0.13, 1.00]",0.049
sex_W,"0.85 [0.42, 1.71]",0.647
t_stage_T4,"0.94 [0.47, 1.88]",0.864
grade_G2,"1.42 [0.62, 3.26]",0.410
grade_G3,"2.07 [0.93, 4.62]",0.076
4,"2.42 [1.59, 3.68]",<0.001


### Evaluate predictive performance of model

In [20]:
aucs = cluster_utils.get_eval_aucs(
    df_train=df_train,
    df_valid=df_test,
    label_col=LABEL_COL,
    baseline_cols=BASELINE_COLS,
    cluster_cols=TOP_CLUSTERS
)
aucs

Unnamed: 0,Baseline features only,Cluster features only,Baseline + cluster features
AUC,0.618361,0.785176,0.813417
