In [90]:
%load_ext autoreload
%autoreload 2

import os
import pymc as pm
import arviz as az
import numpy as np
import pandas as pd
import pytensor
from pytensor import tensor as T
from sklearn.preprocessing import scale, StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import linkage, leaves_list
import itertools
import pickle
import dill
from pyprojroot.here import here

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Resolve a project-relative path with pyprojroot's `here`; the resolved path is
# displayed as the cell output (presumably a sanity check of the project root).
here('submission/draft/survival_clustering.ipynb')

PosixPath('/Users/alzhang/Documents/projects/tfri_halo/submission/draft/survival_clustering.ipynb')

In [3]:
def create_trace_table(trace, export_variables = ['props', 'beta_clust', 'beta_stage', 'beta_age', 'beta_chemo', 'beta_rt', 'beta_brachy', 'beta_histotype']):
    """Flatten selected posterior variables of the first chain into one wide table.

    Parameters
    ----------
    trace
        Object exposing ``trace.posterior[name]``; every entry is indexed with
        ``[0]`` to select the first chain (presumably the InferenceData
        returned by ``pm.sample`` -- confirm against the caller).
    export_variables : list of str
        Names of the posterior variables to export.
        NOTE(review): the default includes ``'beta_histotype'``, which
        ``fit_survcluster_model`` as written in this notebook does not define;
        pass an explicit list when exporting traces from that model.

    Returns
    -------
    pandas.DataFrame
        One row per draw. A per-draw scalar variable becomes a single column
        named after the variable; a per-draw vector becomes columns
        ``<variable>_0``, ``<variable>_1``, ...

    Raises
    ------
    ValueError
        If a variable has more than one non-draw dimension, which cannot be
        laid out as flat columns by this function.
    """
    # One DataFrame per exported variable, column-bound at the end
    export_dfs = []

    for variable in export_variables:
        print(variable)
        # First chain only: the remaining axes are (draw,) or (draw, component)
        posterior = trace.posterior[variable][0]

        if posterior.ndim == 2:
            column_names = [f'{variable}_{i}' for i in range(posterior.shape[1])]
        elif posterior.ndim == 1:
            column_names = [variable]
        else:
            # Previously this branch only printed a message and then appended a
            # stale (or unbound) frame; fail loudly instead.
            raise ValueError(
                f"Cannot tabulate '{variable}': expected 1 or 2 dimensions per chain, "
                f"got {posterior.ndim}."
            )

        export_dfs.append(pd.DataFrame(posterior, columns=column_names))

    # Concatenate the DataFrames horizontally (column bind)
    trace_table = pd.concat(export_dfs, axis=1)
    return trace_table

## Inputs

In [4]:
# Survival data, long format by outcome
clinical_long = pd.read_csv(
    here('results/survival_cluster/clinical_long.tsv'),
    sep='\t',
)

# Counts data, long format by TIL type and region
counts_long = pd.read_csv(
    here('results/survival_cluster/counts_final.tsv'),
    sep='\t',
)

In [5]:
# Keep only p53abn cases, then discard rows with a missing value in any column
clinical_long = clinical_long[clinical_long['eclass2_ngs'].eq('p53abn')].dropna()

# Integer-encode the molecular class and the stage
eclass_encoder, stage_encoder = LabelEncoder(), LabelEncoder()
clinical_long['eclass2_ngs_idx'] = eclass_encoder.fit_transform(clinical_long['eclass2_ngs'])
clinical_long['stage_idx'] = stage_encoder.fit_transform(clinical_long['stage_main'])

# Boolean flag for the carcinosarcoma histotype
clinical_long['carcinosarcoma'] = clinical_long['hist_rev'].eq('carcinosarcoma (MMMT)')

# Standardise age at diagnosis (centred, unit variance)
clinical_long['age_dx'] = scale(
    clinical_long['age_dx'],
    axis=0,
    with_mean=True,
    with_std=True,
    copy=True,
)

# Columns carried forward into the model inputs
clinical_vars = [
    "acc_num", "outcome", "time", "status",
    "chemo", "rt", "brachy",
    "eclass2_ngs_idx", "stage_idx", "age_dx", "carcinosarcoma",
]

clinical_selected = clinical_long.loc[:, clinical_vars].drop_duplicates()

In [6]:
# Composite "<variable>_<region>" key used as the wide-format column label
counts_long['variable_region'] = counts_long['variable'] + '_' + counts_long['region']

# One row per acc_num, one column per variable/region pair; rows with any
# missing cell are dropped (independently for counts and for areas)
counts_wide = counts_long.pivot(
    index='acc_num',
    columns='variable_region',
    values='value',
).dropna()
areas_wide = counts_long.pivot(
    index='acc_num',
    columns='variable_region',
    values='area_region_mm',
).dropna()

In [7]:
# Accepts counts and areas in wide format, clinical data in long format
# Returns clinical data, areas, and counts indexed identically
def get_inputs(counts, areas, clinical, outcome):
    """Align counts, areas and clinical rows for a single survival outcome.

    Parameters
    ----------
    counts, areas : pandas.DataFrame
        Wide tables indexed by sample identifier.
    clinical : pandas.DataFrame
        Long table with at least the columns ``'outcome'`` and ``'acc_num'``.
    outcome
        Value of ``clinical['outcome']`` to keep.

    Returns
    -------
    dict
        Keys ``'clinical'``, ``'areas'`` and ``'counts'``; all three frames are
        restricted to the samples present in every input and share the same
        (sorted) row order.

    Raises
    ------
    ValueError
        If the selected outcome has more than one clinical row for a sample,
        because the three tables could then not be aligned row-for-row.
    """
    clinical = clinical[clinical['outcome'] == outcome].set_index('acc_num')

    if not clinical.index.is_unique:
        duplicated = clinical.index[clinical.index.duplicated()].unique().tolist()
        raise ValueError(
            f"Clinical data for outcome {outcome!r} has duplicate acc_num values: {duplicated}"
        )

    # Intersect all three indexes. counts and areas may have dropped different
    # rows upstream, so a sample present in counts is not guaranteed to be in areas.
    common_samples = np.intersect1d(
        np.intersect1d(counts.index, areas.index),
        clinical.index,
    )

    counts = counts.loc[common_samples,:]
    areas = areas.loc[common_samples,:]
    clinical = clinical.loc[common_samples,:]

    return {'clinical': clinical, 'areas': areas, 'counts': counts}

In [8]:
# For each outcome, build an N x C counts table (N samples, C = region * cell type
# columns) together with the matching N x C area table and the clinical rows.
# NOTE: per the original author's note, this layout is DIFFERENT from the previous one.
os_inputs, pfs_inputs, dss_inputs = (
    get_inputs(counts_wide, areas_wide, clinical_selected, outcome = outcome_name)
    for outcome_name in ('os', 'pfs', 'dss')
)

In [44]:
def fit_survcluster_model(inputs, nclusts = 2, ncenters = 20, interval_length = 0.3, epsilon = 1e-6, ndraw = 1000, ntune=1000, chains = 1, random_seed = None):
    """Fit a joint mixture model of cell counts and interval-based survival.

    Every sample receives a latent cluster label ``clust``. The label selects
    the row of ``mu_clust`` used as the per-area mean of a negative-binomial
    count likelihood, and it also enters the survival linear predictor through
    ``beta_clust``. Survival is modelled with a Poisson likelihood over
    follow-up intervals of width ``interval_length`` (the Poisson formulation
    of a proportional-hazards model), with one baseline hazard ``lambda0`` per
    interval.

    Parameters
    ----------
    inputs : dict
        Mapping with keys ``'counts'`` and ``'areas'`` (array-likes of equal
        shape, samples x columns) and ``'clinical'`` (DataFrame with columns
        ``time``, ``status``, ``stage_idx``, ``age_dx``, ``chemo``, ``rt`` and
        ``brachy``), all row-aligned.
    nclusts : int
        Number of mixture components. The first component is the reference
        level (its ``beta_clust`` is fixed at 0).
    ncenters : int
        Number of RBF centers for the dispersion function; centers are evenly
        spaced from 0 to the largest observed count. Must be at least 2.
    interval_length : float
        Width of the follow-up intervals, in the units of ``time``.
    epsilon : float
        Small constant added to the areas and to the dispersion to keep them
        strictly positive.
    ndraw, ntune : int
        Posterior draws and tuning iterations passed to ``pm.sample``.
    chains : int
        Number of chains passed to ``pm.sample``. The default of 1 preserves
        the previous behaviour.
    random_seed : optional
        Forwarded to ``pm.sample`` so that a run can be reproduced; ``None``
        leaves seeding to PyMC.

    Returns
    -------
    dict
        ``{'trace': <result of pm.sample>, 'model': <pm.Model>}``.

    Raises
    ------
    ValueError
        If ``ncenters < 2`` or if any follow-up time is not strictly positive.
    """
    count_mat = np.array(inputs['counts'])
    area_mat = np.array(inputs['areas'])
    clinical_df = inputs['clinical']
    time = clinical_df['time'].values
    event = clinical_df['status'].values
    stage = clinical_df['stage_idx'].values
    age = clinical_df['age_dx'].values
    chemo = clinical_df['chemo'].values.astype(int)
    rt = clinical_df['rt'].values.astype(int)
    brachy = clinical_df['brachy'].values.astype(int)

    if ncenters < 2:
        # rbf_step below divides by (ncenters - 1)
        raise ValueError("ncenters must be at least 2.")
    if np.any(time <= 0):
        # A non-positive time would yield zero or negative exposure below
        raise ValueError("All follow-up times must be strictly positive.")

    # beta_stage is indexed by the stage codes themselves, so size it by the
    # largest code rather than by the number of distinct codes present: a code
    # missing from this subset must not push the indices out of bounds.
    nstages = int(np.max(stage)) + 1
    ncelltypes = count_mat.shape[1]
    nsamples = count_mat.shape[0]

    # Pooled density per column (total count / total area), repeated for every
    # cluster -> shape (nclusts, ncelltypes); used as the prior mean of mu_clust.
    mean_mu = np.sum(count_mat, axis = 0)/np.sum(area_mat, axis = 0)
    mean_mu = np.repeat(mean_mu[np.newaxis,:], nclusts, axis=0)

    area_nonzero = area_mat + epsilon

    # RBF centers evenly spaced over [0, max observed count]
    rbf_step = np.max(count_mat)/(ncenters-1.)
    centers = np.arange(ncenters) * rbf_step

    # intervals

    samples = np.arange(nsamples)
    interval_bounds = np.arange(0, time.max() + interval_length + 1, interval_length)

    nintervals = interval_bounds.size - 1
    intervals = np.arange(nintervals)

    # Index of the interval in which each sample's follow-up ends. The 0.01
    # offset places a time that falls exactly on a bound in the earlier
    # interval. Clamp at 0 so that times below 0.01 land in the first interval
    # instead of wrapping around to the last one through index -1.
    last_period = np.maximum(np.floor((time - 0.01) / interval_length).astype(int), 0)

    # death[i, j] carries the event indicator only in the sample's final interval
    death = np.zeros((nsamples, nintervals))
    death[samples, last_period] = event

    # exposure[i, j]: time at risk of sample i in interval j -- a full interval
    # for every interval started, and the partial remainder in the final one
    exposure = np.greater_equal.outer(time, interval_bounds[:-1]) * interval_length
    exposure[samples, last_period] = time - interval_bounds[last_period]

    coords = {"intervals": intervals}

    with pm.Model(coords = coords) as survival_mixture_model:
        # Priors for survival coefficients; the first cluster and the first
        # stage are reference levels with coefficient fixed at 0

        beta_clust0 = pm.Normal("beta_clust0", mu=0, sigma=5, shape=nclusts-1)
        beta_clust = pm.Deterministic("beta_clust", pm.math.concatenate([[0], beta_clust0]))
        beta_stage0 = pm.Normal("beta_stage0", mu=0, sigma=5, shape=nstages-1)
        beta_stage = pm.Deterministic("beta_stage", pm.math.concatenate([[0], beta_stage0]))
        beta_age = pm.Normal("beta_age", mu=0, sigma=5)
        beta_chemo = pm.Normal("beta_chemo", mu = 0, sigma = 5)
        beta_rt = pm.Normal("beta_rt", mu = 0, sigma = 5)
        beta_brachy = pm.Normal("beta_brachy", mu = 0, sigma = 5)

        # Priors for count coefficient (per-cluster, per-column density)
        mu_clust = pm.Gamma("mu_clust", mu = mean_mu, sigma = 100, shape = (nclusts, ncelltypes))

        # Latent categorical variable for 'clust'
        props = pm.Dirichlet('props', np.ones(nclusts))
        clust = pm.Categorical("clust", p=props, shape=nsamples)

        # NB distribution for count data using adjusted area and mu_clust
        count_mu = area_nonzero * mu_clust[clust]  # Use 'area' as multiplier

        # RBF based NB dispersion: weighted sum over the centers of
        # exp(-exp(theta_b) * (count_mu - center)^2) with weights exp(theta_a).
        # The repeat/reshape presumably lays count_mu out as
        # (nsamples, ncelltypes, ncenters) for broadcasting against the centers.
        theta_a = pm.Normal("theta_a", mu = 0, sigma=1, shape=ncenters)
        theta_b = pm.Normal("theta_b", mu = 0, sigma=1, shape=ncenters)
        count_disp = T.dot(T.exp(-T.exp(theta_b) * (T.reshape(T.repeat(count_mu, repeats=ncenters), newshape=(nsamples, ncelltypes, ncenters)) - centers[np.newaxis, np.newaxis, :])**2), T.exp(theta_a)) + epsilon

        # Likelihood for count data
        count_v = pm.NegativeBinomial("count_v", mu=count_mu, alpha=count_disp, observed=count_mat)

        # Baseline hazard, one value per interval (the trailing values are
        # presumably hyperparameter settings that were tried)
        lambda0 = pm.Gamma("lambda0", 0.1, 0.1, dims = "intervals") # 0.01, 0.05, 0.1

        # Linear predictor for Cox PH model
        linear_predictor = (
            beta_clust[clust]
            + beta_stage[stage]
            + beta_age * age
            + beta_chemo * chemo
            + beta_rt * rt
            + beta_brachy * brachy
        )

        # Hazard per sample and interval: exp(linear predictor) x baseline hazard
        lambda_ = pm.Deterministic("lambda_", T.outer(T.exp(linear_predictor), lambda0))

        mu = pm.Deterministic("mu", exposure * lambda_)

        # Poisson-Cox PH equivalence
        obs = pm.Poisson("obs", mu=mu, observed=death)

        # Prints the initial point and checks it before sampling
        survival_mixture_model.debug(verbose = True)

        trace = pm.sample(ndraw, tune=ntune, chains=chains, random_seed=random_seed, progressbar=True)

        return {'trace': trace, 'model': survival_mixture_model}

In [55]:
# Two-cluster fit on the 'os' outcome inputs
os_outputs = fit_survcluster_model(
    os_inputs,
    nclusts = 2,
    ncenters = 20,
    interval_length = 0.3,
    epsilon = 1e-6,
    ndraw = 1000,
    ntune = 1000,
)

point={'beta_clust0': array([0.]), 'beta_stage0': array([0., 0., 0.]), 'beta_age': array(0.), 'beta_chemo': array(0.), 'beta_rt': array(0.), 'beta_brachy': array(0.), 'mu_clust_log__': array([[5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5.61453408, 4.55177006, 2.46390665, 1.13416853,
        5.53723606, 3.99462307, 3.63119581, 2.00995528, 4.84529417,
        2.38146401, 5.03593986, 2.95538798, 4.79499169, 3.24137003,
        6.13982856, 4.427772  , 6.11182582, 3.85459295, 5.21345766,
        2.7528743 ],
       [5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5.61453408, 4.55177006, 2.46390665, 1.13416853,
        5.53723606, 3.99462307, 3.63119581, 2.00995528, 4.84529417,
        2.38146401, 5.03593986, 2.95538798, 4.79499169, 3.24137003,
        6.13982856, 4.427772  , 6.11182582, 3.85459295, 5.21345766,
        2.7528743 ]]), 'props_simplex__': array([0.]), 'clust': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

Sequential sampling (1 chains in 1 job)
CompoundStep
>NUTS: [beta_clust0, beta_stage0, beta_age, beta_chemo, beta_rt, beta_brachy, mu_clust, props, theta_a, theta_b, lambda0]
>BinaryGibbsMetropolis: [clust]


Sampling 1 chain for 1_000 tune and 1_000 draw iterations (1_000 + 1_000 draws total) took 622 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [56]:
# Posterior summary of the mixture proportions and survival coefficients ('os', 2 clusters)
pm.summary(
    os_outputs['trace'],
    var_names=[
        "props", "beta_clust", "beta_stage", "beta_age",
        "beta_chemo", "beta_rt", "beta_brachy",
    ],
)



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
props[0],0.509,0.038,0.439,0.577,0.004,0.003,91.0,541.0,
props[1],0.491,0.038,0.423,0.561,0.004,0.003,91.0,541.0,
beta_clust[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_clust[1],-0.447,0.253,-0.945,0.017,0.008,0.006,956.0,733.0,
beta_stage[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_stage[1],0.625,0.694,-0.609,1.938,0.024,0.018,956.0,613.0,
beta_stage[2],1.063,0.333,0.397,1.642,0.014,0.011,568.0,581.0,
beta_stage[3],2.894,0.44,2.069,3.745,0.023,0.017,360.0,446.0,
beta_age,0.205,0.127,-0.017,0.454,0.004,0.003,1042.0,588.0,
beta_chemo,-0.872,0.33,-1.458,-0.25,0.013,0.009,641.0,566.0,


In [51]:
# Two-cluster fit on the 'pfs' outcome inputs
pfs_outputs = fit_survcluster_model(
    pfs_inputs,
    nclusts = 2,
    ncenters = 20,
    interval_length = 0.3,
    epsilon = 1e-6,
    ndraw = 1000,
    ntune = 1000,
)

point={'beta_clust0': array([0.]), 'beta_stage0': array([0., 0., 0.]), 'beta_age': array(0.), 'beta_chemo': array(0.), 'beta_rt': array(0.), 'beta_brachy': array(0.), 'mu_clust_log__': array([[5.68196172, 2.24199047, 4.48661082, 1.18186252, 2.61167384,
        1.02248025, 5.59179312, 4.53887048, 2.45747758, 1.10592862,
        5.5459417 , 4.01413411, 3.62124397, 2.01490522, 4.83820893,
        2.37418687, 5.05412827, 2.97242243, 4.77933392, 3.23039384,
        6.15899309, 4.37556219, 6.12371957, 3.82459632, 5.22114322,
        2.75491914],
       [5.68196172, 2.24199047, 4.48661082, 1.18186252, 2.61167384,
        1.02248025, 5.59179312, 4.53887048, 2.45747758, 1.10592862,
        5.5459417 , 4.01413411, 3.62124397, 2.01490522, 4.83820893,
        2.37418687, 5.05412827, 2.97242243, 4.77933392, 3.23039384,
        6.15899309, 4.37556219, 6.12371957, 3.82459632, 5.22114322,
        2.75491914]]), 'props_simplex__': array([0.]), 'clust': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

Sequential sampling (1 chains in 1 job)
CompoundStep
>NUTS: [beta_clust0, beta_stage0, beta_age, beta_chemo, beta_rt, beta_brachy, mu_clust, props, theta_a, theta_b, lambda0]
>BinaryGibbsMetropolis: [clust]


Sampling 1 chain for 1_000 tune and 1_000 draw iterations (1_000 + 1_000 draws total) took 584 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [53]:
# Posterior summary of the mixture proportions and survival coefficients ('pfs', 2 clusters)
pm.summary(
    pfs_outputs['trace'],
    var_names=[
        "props", "beta_clust", "beta_stage", "beta_age",
        "beta_chemo", "beta_rt", "beta_brachy",
    ],
)



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
props[0],0.494,0.036,0.427,0.562,0.001,0.001,787.0,739.0,
props[1],0.506,0.036,0.438,0.573,0.001,0.001,787.0,739.0,
beta_clust[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_clust[1],-0.43,0.235,-0.887,0.004,0.008,0.006,877.0,771.0,
beta_stage[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_stage[1],-0.215,0.833,-1.823,1.182,0.027,0.028,1111.0,444.0,
beta_stage[2],1.509,0.314,0.944,2.091,0.012,0.009,749.0,601.0,
beta_stage[3],2.865,0.43,2.071,3.66,0.018,0.013,569.0,474.0,
beta_age,0.013,0.124,-0.236,0.219,0.004,0.004,1115.0,680.0,
beta_chemo,-0.47,0.346,-1.083,0.196,0.014,0.01,649.0,578.0,


In [59]:
# Two-cluster fit on the 'dss' outcome inputs
dss_outputs = fit_survcluster_model(
    dss_inputs,
    nclusts = 2,
    ncenters = 20,
    interval_length = 0.3,
    epsilon = 1e-6,
    ndraw = 1000,
    ntune = 1000,
)

point={'beta_clust0': array([0.]), 'beta_stage0': array([0., 0., 0.]), 'beta_age': array(0.), 'beta_chemo': array(0.), 'beta_rt': array(0.), 'beta_brachy': array(0.), 'mu_clust_log__': array([[5.75870179, 2.31838358, 4.46375287, 1.24355049, 2.60599007,
        1.03637965, 5.60291106, 4.55877398, 2.46794208, 1.15072592,
        5.54594424, 4.0232128 , 3.6484534 , 2.04259638, 4.85501928,
        2.39959025, 5.05330594, 2.98003355, 4.80937621, 3.27254467,
        6.14341007, 4.42421188, 6.11291037, 3.83479628, 5.2219894 ,
        2.75504639],
       [5.75870179, 2.31838358, 4.46375287, 1.24355049, 2.60599007,
        1.03637965, 5.60291106, 4.55877398, 2.46794208, 1.15072592,
        5.54594424, 4.0232128 , 3.6484534 , 2.04259638, 4.85501928,
        2.39959025, 5.05330594, 2.98003355, 4.80937621, 3.27254467,
        6.14341007, 4.42421188, 6.11291037, 3.83479628, 5.2219894 ,
        2.75504639]]), 'props_simplex__': array([0.]), 'clust': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

Sequential sampling (1 chains in 1 job)
CompoundStep
>NUTS: [beta_clust0, beta_stage0, beta_age, beta_chemo, beta_rt, beta_brachy, mu_clust, props, theta_a, theta_b, lambda0]
>BinaryGibbsMetropolis: [clust]


Sampling 1 chain for 1_000 tune and 1_000 draw iterations (1_000 + 1_000 draws total) took 755 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [60]:
# Posterior summary of the mixture proportions and survival coefficients ('dss', 2 clusters)
pm.summary(
    dss_outputs['trace'],
    var_names=[
        "props", "beta_clust", "beta_stage", "beta_age",
        "beta_chemo", "beta_rt", "beta_brachy",
    ],
)



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
props[0],0.513,0.038,0.44,0.58,0.002,0.001,551.0,559.0,
props[1],0.487,0.038,0.42,0.56,0.002,0.001,551.0,559.0,
beta_clust[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_clust[1],-0.481,0.287,-0.97,0.124,0.01,0.008,865.0,628.0,
beta_stage[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_stage[1],0.099,1.364,-2.207,2.542,0.052,0.051,806.0,476.0,
beta_stage[2],1.618,0.399,0.865,2.297,0.016,0.012,618.0,577.0,
beta_stage[3],3.449,0.54,2.494,4.444,0.024,0.017,521.0,641.0,
beta_age,0.093,0.14,-0.168,0.365,0.005,0.004,849.0,618.0,
beta_chemo,-0.991,0.402,-1.814,-0.305,0.015,0.011,739.0,569.0,


## Output traces

In [81]:
def create_trace_table(trace, export_variables = ['props', 'beta_clust', 'beta_stage', 'beta_age', 'beta_chemo', 'beta_rt', 'beta_brachy']):
    """Flatten selected posterior variables of the first chain into one wide table.

    Parameters
    ----------
    trace
        Object exposing ``trace.posterior[name]``; every entry is indexed with
        ``[0]`` to select the first chain (presumably the InferenceData
        returned by ``pm.sample`` -- confirm against the caller).
    export_variables : list of str
        Names of the posterior variables to export.

    Returns
    -------
    pandas.DataFrame
        One row per draw. A per-draw scalar variable becomes a single column
        named after the variable; a per-draw vector becomes columns
        ``<variable>_0``, ``<variable>_1``, ...

    Raises
    ------
    ValueError
        If a variable has more than one non-draw dimension, which cannot be
        laid out as flat columns by this function.
    """
    # One DataFrame per exported variable, column-bound at the end
    export_dfs = []

    for variable in export_variables:
        # First chain only: the remaining axes are (draw,) or (draw, component)
        posterior = trace.posterior[variable][0]

        if posterior.ndim == 2:
            column_names = [f'{variable}_{i}' for i in range(posterior.shape[1])]
        elif posterior.ndim == 1:
            column_names = [variable]
        else:
            # Previously this branch only printed a message and then appended a
            # stale (or unbound) frame; fail loudly instead.
            raise ValueError(
                f"Cannot tabulate '{variable}': expected 1 or 2 dimensions per chain, "
                f"got {posterior.ndim}."
            )

        export_dfs.append(pd.DataFrame(posterior, columns=column_names))

    # Concatenate the DataFrames horizontally (column bind)
    trace_table = pd.concat(export_dfs, axis=1)
    return trace_table

def extract_cluster_assignments(trace, sample_names):
    """Tabulate the first chain's draws of ``clust`` with one row per sample.

    ``trace.posterior['clust'][0]`` is transposed so that rows correspond to
    samples and columns to draws. The rows are labelled with ``sample_names``
    and the labels are then moved into a regular column by ``reset_index``.
    """
    draws_by_sample = trace.posterior['clust'][0].T
    return (
        pd.DataFrame(draws_by_sample)
        .set_axis(sample_names, axis=0)
        .reset_index()
    )

In [148]:
# Wide per-draw tables of the exported posterior variables, one per outcome
os_trace_table = create_trace_table(os_outputs['trace'])
pfs_trace_table = create_trace_table(pfs_outputs['trace'])
dss_trace_table = create_trace_table(dss_outputs['trace'])

# Per-sample cluster draws, labelled with the sample index of the matching inputs
os_cluster_assignments = extract_cluster_assignments(
    os_outputs['trace'],
    sample_names = os_inputs['counts'].index,
)
pfs_cluster_assignments = extract_cluster_assignments(
    pfs_outputs['trace'],
    sample_names = pfs_inputs['counts'].index,
)
dss_cluster_assignments = extract_cluster_assignments(
    dss_outputs['trace'],
    sample_names = dss_inputs['counts'].index,
)

In [149]:
trace_output_dir = here('results/survival_cluster/traces')

# Create the output directory up front so the writes below do not fail when it
# does not exist yet (e.g. on a fresh checkout)
os.makedirs(trace_output_dir, exist_ok=True)

os_trace_table.to_csv(os.path.join(trace_output_dir, 'os_trace_table.tsv'), sep='\t')
os_cluster_assignments.to_csv(os.path.join(trace_output_dir, 'os_cluster_assignments.tsv'), sep='\t')

pfs_trace_table.to_csv(os.path.join(trace_output_dir, 'pfs_trace_table.tsv'), sep='\t')
pfs_cluster_assignments.to_csv(os.path.join(trace_output_dir, 'pfs_cluster_assignments.tsv'), sep='\t')

dss_trace_table.to_csv(os.path.join(trace_output_dir, 'dss_trace_table.tsv'), sep='\t')
dss_cluster_assignments.to_csv(os.path.join(trace_output_dir, 'dss_cluster_assignments.tsv'), sep='\t')

# Output arviz inferencedata objects
os_outputs['trace'].to_netcdf(os.path.join(trace_output_dir, 'os_results.nc'))
pfs_outputs['trace'].to_netcdf(os.path.join(trace_output_dir, 'pfs_results.nc'))
dss_outputs['trace'].to_netcdf(os.path.join(trace_output_dir, 'dss_results.nc'))

'/Users/alzhang/Documents/projects/tfri_halo/results/survival_cluster/traces/dss_results.nc'

## 3 cluster analysis

Formalization of the 3-cluster analysis that was previously explored only briefly: what happens if the model is fit with `nclusts = 3`?

In [160]:
# Three-cluster fit on the 'os' outcome inputs (interval length 0.1)
os_outputs_3clust = fit_survcluster_model(
    os_inputs,
    nclusts = 3,
    ncenters = 20,
    interval_length = 0.1,
    epsilon = 1e-6,
    ndraw = 1000,
    ntune = 1000,
)

point={'beta_clust0': array([0., 0.]), 'beta_stage0': array([0., 0., 0.]), 'beta_age': array(0.), 'beta_chemo': array(0.), 'beta_rt': array(0.), 'beta_brachy': array(0.), 'mu_clust_log__': array([[5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5.61453408, 4.55177006, 2.46390665, 1.13416853,
        5.53723606, 3.99462307, 3.63119581, 2.00995528, 4.84529417,
        2.38146401, 5.03593986, 2.95538798, 4.79499169, 3.24137003,
        6.13982856, 4.427772  , 6.11182582, 3.85459295, 5.21345766,
        2.7528743 ],
       [5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5.61453408, 4.55177006, 2.46390665, 1.13416853,
        5.53723606, 3.99462307, 3.63119581, 2.00995528, 4.84529417,
        2.38146401, 5.03593986, 2.95538798, 4.79499169, 3.24137003,
        6.13982856, 4.427772  , 6.11182582, 3.85459295, 5.21345766,
        2.7528743 ],
       [5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5

Sequential sampling (1 chains in 1 job)
CompoundStep
>NUTS: [beta_clust0, beta_stage0, beta_age, beta_chemo, beta_rt, beta_brachy, mu_clust, props, theta_a, theta_b, lambda0]
>CategoricalGibbsMetropolis: [clust]


Sampling 1 chain for 1_000 tune and 1_000 draw iterations (1_000 + 1_000 draws total) took 1269 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [161]:
# Posterior summary of the mixture proportions and survival coefficients ('os', 3 clusters)
pm.summary(
    os_outputs_3clust['trace'],
    var_names=[
        "props", "beta_clust", "beta_stage", "beta_age",
        "beta_chemo", "beta_rt", "beta_brachy",
    ],
)



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
props[0],0.426,0.039,0.358,0.502,0.002,0.002,313.0,466.0,
props[1],0.24,0.04,0.167,0.311,0.009,0.007,19.0,138.0,
props[2],0.334,0.037,0.267,0.402,0.006,0.004,40.0,487.0,
beta_clust[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_clust[1],-0.92,0.335,-1.581,-0.32,0.014,0.01,607.0,755.0,
beta_clust[2],-0.6,0.289,-1.093,-0.018,0.015,0.011,380.0,740.0,
beta_stage[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_stage[1],0.508,0.69,-0.879,1.76,0.024,0.017,869.0,712.0,
beta_stage[2],1.054,0.36,0.396,1.773,0.016,0.011,505.0,531.0,
beta_stage[3],2.769,0.455,1.923,3.602,0.021,0.015,460.0,533.0,


In [156]:
# Three-cluster fit on the 'pfs' outcome inputs (interval length 0.1)
pfs_outputs_3clust = fit_survcluster_model(
    pfs_inputs,
    nclusts = 3,
    ncenters = 20,
    interval_length = 0.1,
    epsilon = 1e-6,
    ndraw = 1000,
    ntune = 1000,
)

point={'beta_clust0': array([0., 0.]), 'beta_stage0': array([0., 0., 0.]), 'beta_age': array(0.), 'beta_chemo': array(0.), 'beta_rt': array(0.), 'beta_brachy': array(0.), 'mu_clust_log__': array([[5.68196172, 2.24199047, 4.48661082, 1.18186252, 2.61167384,
        1.02248025, 5.59179312, 4.53887048, 2.45747758, 1.10592862,
        5.5459417 , 4.01413411, 3.62124397, 2.01490522, 4.83820893,
        2.37418687, 5.05412827, 2.97242243, 4.77933392, 3.23039384,
        6.15899309, 4.37556219, 6.12371957, 3.82459632, 5.22114322,
        2.75491914],
       [5.68196172, 2.24199047, 4.48661082, 1.18186252, 2.61167384,
        1.02248025, 5.59179312, 4.53887048, 2.45747758, 1.10592862,
        5.5459417 , 4.01413411, 3.62124397, 2.01490522, 4.83820893,
        2.37418687, 5.05412827, 2.97242243, 4.77933392, 3.23039384,
        6.15899309, 4.37556219, 6.12371957, 3.82459632, 5.22114322,
        2.75491914],
       [5.68196172, 2.24199047, 4.48661082, 1.18186252, 2.61167384,
        1.02248025, 5

Sequential sampling (1 chains in 1 job)
CompoundStep
>NUTS: [beta_clust0, beta_stage0, beta_age, beta_chemo, beta_rt, beta_brachy, mu_clust, props, theta_a, theta_b, lambda0]
>CategoricalGibbsMetropolis: [clust]


Sampling 1 chain for 1_000 tune and 1_000 draw iterations (1_000 + 1_000 draws total) took 1087 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [158]:
# Posterior summary of the mixture proportions and survival coefficients ('pfs', 3 clusters)
pm.summary(
    pfs_outputs_3clust['trace'],
    var_names=[
        "props", "beta_clust", "beta_stage", "beta_age",
        "beta_chemo", "beta_rt", "beta_brachy",
    ],
)



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
props[0],0.405,0.038,0.34,0.48,0.002,0.002,299.0,610.0,
props[1],0.277,0.035,0.215,0.34,0.004,0.003,58.0,338.0,
props[2],0.318,0.034,0.257,0.383,0.002,0.001,268.0,637.0,
beta_clust[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_clust[1],-0.667,0.323,-1.278,-0.094,0.019,0.014,296.0,524.0,
beta_clust[2],-0.291,0.286,-0.846,0.241,0.016,0.012,287.0,725.0,
beta_stage[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_stage[1],-0.282,0.821,-1.729,1.135,0.037,0.039,623.0,417.0,
beta_stage[2],1.469,0.289,0.899,1.957,0.011,0.008,657.0,592.0,
beta_stage[3],2.743,0.402,1.975,3.474,0.017,0.012,591.0,485.0,


In [157]:
# Three-cluster fit on the 'dss' outcome inputs (interval length 0.1)
dss_outputs_3clust = fit_survcluster_model(
    dss_inputs,
    nclusts = 3,
    ncenters = 20,
    interval_length = 0.1,
    epsilon = 1e-6,
    ndraw = 1000,
    ntune = 1000,
)

point={'beta_clust0': array([0., 0.]), 'beta_stage0': array([0., 0., 0.]), 'beta_age': array(0.), 'beta_chemo': array(0.), 'beta_rt': array(0.), 'beta_brachy': array(0.), 'mu_clust_log__': array([[5.75870179, 2.31838358, 4.46375287, 1.24355049, 2.60599007,
        1.03637965, 5.60291106, 4.55877398, 2.46794208, 1.15072592,
        5.54594424, 4.0232128 , 3.6484534 , 2.04259638, 4.85501928,
        2.39959025, 5.05330594, 2.98003355, 4.80937621, 3.27254467,
        6.14341007, 4.42421188, 6.11291037, 3.83479628, 5.2219894 ,
        2.75504639],
       [5.75870179, 2.31838358, 4.46375287, 1.24355049, 2.60599007,
        1.03637965, 5.60291106, 4.55877398, 2.46794208, 1.15072592,
        5.54594424, 4.0232128 , 3.6484534 , 2.04259638, 4.85501928,
        2.39959025, 5.05330594, 2.98003355, 4.80937621, 3.27254467,
        6.14341007, 4.42421188, 6.11291037, 3.83479628, 5.2219894 ,
        2.75504639],
       [5.75870179, 2.31838358, 4.46375287, 1.24355049, 2.60599007,
        1.03637965, 5

Sequential sampling (1 chains in 1 job)
CompoundStep
>NUTS: [beta_clust0, beta_stage0, beta_age, beta_chemo, beta_rt, beta_brachy, mu_clust, props, theta_a, theta_b, lambda0]
>CategoricalGibbsMetropolis: [clust]


Sampling 1 chain for 1_000 tune and 1_000 draw iterations (1_000 + 1_000 draws total) took 1128 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [159]:
# Posterior summary of the mixture proportions and survival coefficients ('dss', 3 clusters)
pm.summary(
    dss_outputs_3clust['trace'],
    var_names=[
        "props", "beta_clust", "beta_stage", "beta_age",
        "beta_chemo", "beta_rt", "beta_brachy",
    ],
)



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
props[0],0.421,0.037,0.354,0.489,0.002,0.001,395.0,569.0,
props[1],0.226,0.031,0.165,0.282,0.002,0.001,250.0,421.0,
props[2],0.353,0.035,0.286,0.42,0.002,0.001,288.0,525.0,
beta_clust[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_clust[1],-1.324,0.435,-2.135,-0.511,0.021,0.015,444.0,460.0,
beta_clust[2],-0.723,0.345,-1.349,-0.118,0.019,0.013,333.0,475.0,
beta_stage[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_stage[1],0.455,1.272,-1.988,2.705,0.054,0.038,599.0,641.0,
beta_stage[2],1.634,0.407,0.88,2.38,0.02,0.014,414.0,562.0,
beta_stage[3],3.186,0.535,2.276,4.291,0.029,0.021,336.0,651.0,


In [162]:
# Wide per-draw tables of the exported posterior variables (3-cluster fits)
os_3clust_trace_table = create_trace_table(os_outputs_3clust['trace'])
pfs_3clust_trace_table = create_trace_table(pfs_outputs_3clust['trace'])
dss_3clust_trace_table = create_trace_table(dss_outputs_3clust['trace'])

# Per-sample cluster draws, labelled with the sample index of the matching inputs
os_3clust_cluster_assignments = extract_cluster_assignments(
    os_outputs_3clust['trace'],
    sample_names = os_inputs['counts'].index,
)
pfs_3clust_cluster_assignments = extract_cluster_assignments(
    pfs_outputs_3clust['trace'],
    sample_names = pfs_inputs['counts'].index,
)
dss_3clust_cluster_assignments = extract_cluster_assignments(
    dss_outputs_3clust['trace'],
    sample_names = dss_inputs['counts'].index,
)

In [163]:
trace_3clust_output_dir = here('results/survival_cluster/traces_3clust')

# Create the output directory up front so the writes below do not fail when it
# does not exist yet (e.g. on a fresh checkout)
os.makedirs(trace_3clust_output_dir, exist_ok=True)

os_3clust_trace_table.to_csv(os.path.join(trace_3clust_output_dir, 'os_trace_table.tsv'), sep='\t')
os_3clust_cluster_assignments.to_csv(os.path.join(trace_3clust_output_dir, 'os_cluster_assignments.tsv'), sep='\t')

pfs_3clust_trace_table.to_csv(os.path.join(trace_3clust_output_dir, 'pfs_trace_table.tsv'), sep='\t')
pfs_3clust_cluster_assignments.to_csv(os.path.join(trace_3clust_output_dir, 'pfs_cluster_assignments.tsv'), sep='\t')

dss_3clust_trace_table.to_csv(os.path.join(trace_3clust_output_dir, 'dss_trace_table.tsv'), sep='\t')
dss_3clust_cluster_assignments.to_csv(os.path.join(trace_3clust_output_dir, 'dss_cluster_assignments.tsv'), sep='\t')

# Output arviz inferencedata objects
os_outputs_3clust['trace'].to_netcdf(os.path.join(trace_3clust_output_dir, 'os_results.nc'))
pfs_outputs_3clust['trace'].to_netcdf(os.path.join(trace_3clust_output_dir, 'pfs_results.nc'))
dss_outputs_3clust['trace'].to_netcdf(os.path.join(trace_3clust_output_dir, 'dss_results.nc'))

'/Users/alzhang/Documents/projects/tfri_halo/results/survival_cluster/traces_3clust/dss_results.nc'

## Exploratory

In [106]:
# Exploratory: two-cluster 'os' fit with twice the tuning and draw iterations
os_outputs2 = fit_survcluster_model(
    os_inputs,
    nclusts = 2,
    ncenters = 20,
    interval_length = 0.3,
    epsilon = 1e-6,
    ndraw = 2000,
    ntune = 2000,
)

point={'beta_clust0': array([0.]), 'beta_stage0': array([0., 0., 0.]), 'beta_age': array(0.), 'beta_chemo': array(0.), 'beta_rt': array(0.), 'beta_brachy': array(0.), 'mu_clust_log__': array([[5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5.61453408, 4.55177006, 2.46390665, 1.13416853,
        5.53723606, 3.99462307, 3.63119581, 2.00995528, 4.84529417,
        2.38146401, 5.03593986, 2.95538798, 4.79499169, 3.24137003,
        6.13982856, 4.427772  , 6.11182582, 3.85459295, 5.21345766,
        2.7528743 ],
       [5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5.61453408, 4.55177006, 2.46390665, 1.13416853,
        5.53723606, 3.99462307, 3.63119581, 2.00995528, 4.84529417,
        2.38146401, 5.03593986, 2.95538798, 4.79499169, 3.24137003,
        6.13982856, 4.427772  , 6.11182582, 3.85459295, 5.21345766,
        2.7528743 ]]), 'props_simplex__': array([0.]), 'clust': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

Sequential sampling (1 chains in 1 job)
CompoundStep
>NUTS: [beta_clust0, beta_stage0, beta_age, beta_chemo, beta_rt, beta_brachy, mu_clust, props, theta_a, theta_b, lambda0]
>BinaryGibbsMetropolis: [clust]


Sampling 1 chain for 2_000 tune and 2_000 draw iterations (2_000 + 2_000 draws total) took 1707 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [107]:
# Posterior summary for the longer two-cluster 'os' run
pm.summary(
    os_outputs2['trace'],
    var_names=[
        "props", "beta_clust", "beta_stage", "beta_age",
        "beta_chemo", "beta_rt", "beta_brachy",
    ],
)



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
props[0],0.512,0.036,0.445,0.582,0.002,0.001,361.0,1290.0,
props[1],0.488,0.036,0.418,0.555,0.002,0.001,361.0,1290.0,
beta_clust[0],0.0,0.0,0.0,0.0,0.0,0.0,2000.0,2000.0,
beta_clust[1],-0.443,0.253,-0.905,0.037,0.007,0.005,1527.0,1209.0,
beta_stage[0],0.0,0.0,0.0,0.0,0.0,0.0,2000.0,2000.0,
beta_stage[1],0.643,0.722,-0.811,1.863,0.018,0.013,1689.0,1259.0,
beta_stage[2],1.039,0.348,0.368,1.661,0.01,0.007,1152.0,938.0,
beta_stage[3],2.876,0.464,1.97,3.687,0.016,0.011,862.0,1187.0,
beta_age,0.21,0.122,-0.025,0.424,0.003,0.002,1708.0,1246.0,
beta_chemo,-0.851,0.345,-1.466,-0.169,0.011,0.008,1043.0,926.0,


In [108]:
# Exploratory: three-cluster 'os' fit, interval length 0.3, 2000 tune + 2000 draws
os_outputs3 = fit_survcluster_model(
    os_inputs,
    nclusts = 3,
    ncenters = 20,
    interval_length = 0.3,
    epsilon = 1e-6,
    ndraw = 2000,
    ntune = 2000,
)

point={'beta_clust0': array([0., 0.]), 'beta_stage0': array([0., 0., 0.]), 'beta_age': array(0.), 'beta_chemo': array(0.), 'beta_rt': array(0.), 'beta_brachy': array(0.), 'mu_clust_log__': array([[5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5.61453408, 4.55177006, 2.46390665, 1.13416853,
        5.53723606, 3.99462307, 3.63119581, 2.00995528, 4.84529417,
        2.38146401, 5.03593986, 2.95538798, 4.79499169, 3.24137003,
        6.13982856, 4.427772  , 6.11182582, 3.85459295, 5.21345766,
        2.7528743 ],
       [5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5.61453408, 4.55177006, 2.46390665, 1.13416853,
        5.53723606, 3.99462307, 3.63119581, 2.00995528, 4.84529417,
        2.38146401, 5.03593986, 2.95538798, 4.79499169, 3.24137003,
        6.13982856, 4.427772  , 6.11182582, 3.85459295, 5.21345766,
        2.7528743 ],
       [5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5

Sequential sampling (1 chains in 1 job)
CompoundStep
>NUTS: [beta_clust0, beta_stage0, beta_age, beta_chemo, beta_rt, beta_brachy, mu_clust, props, theta_a, theta_b, lambda0]
>CategoricalGibbsMetropolis: [clust]


Sampling 1 chain for 2_000 tune and 2_000 draw iterations (2_000 + 2_000 draws total) took 1799 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [109]:
# Posterior summary for the three-cluster 'os' run with interval length 0.3
pm.summary(
    os_outputs3['trace'],
    var_names=[
        "props", "beta_clust", "beta_stage", "beta_age",
        "beta_chemo", "beta_rt", "beta_brachy",
    ],
)



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
props[0],0.354,0.037,0.285,0.425,0.003,0.002,114.0,550.0,
props[1],0.219,0.033,0.157,0.278,0.006,0.005,26.0,128.0,
props[2],0.427,0.037,0.363,0.5,0.001,0.001,757.0,942.0,
beta_clust[0],0.0,0.0,0.0,0.0,0.0,0.0,2000.0,2000.0,
beta_clust[1],-0.621,0.37,-1.368,0.045,0.015,0.01,607.0,1418.0,
beta_clust[2],0.118,0.291,-0.413,0.694,0.014,0.01,449.0,734.0,
beta_stage[0],0.0,0.0,0.0,0.0,0.0,0.0,2000.0,2000.0,
beta_stage[1],0.705,0.713,-0.658,1.963,0.02,0.014,1390.0,1195.0,
beta_stage[2],1.102,0.344,0.418,1.695,0.01,0.007,1150.0,1114.0,
beta_stage[3],2.867,0.46,1.936,3.673,0.016,0.011,813.0,1142.0,


In [125]:
# Fit the survival-clustering model on `os_inputs` with 3 clusters
# (2000 tuning + 2000 posterior draws; the log reports a single chain).
# This call passes no seed and uses the same arguments as the `os_outputs5`
# fit, yet the two posterior summaries differ.
# NOTE(review): confirm whether fit_survcluster_model fixes a seed internally.
os_outputs4 = fit_survcluster_model(
    os_inputs,
    nclusts=3,
    ncenters=20,
    interval_length=0.2,
    epsilon=1e-6,
    ndraw=2000,
    ntune=2000,
)

point={'beta_clust0': array([0., 0.]), 'beta_stage0': array([0., 0., 0.]), 'beta_age': array(0.), 'beta_chemo': array(0.), 'beta_rt': array(0.), 'beta_brachy': array(0.), 'mu_clust_log__': array([[5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5.61453408, 4.55177006, 2.46390665, 1.13416853,
        5.53723606, 3.99462307, 3.63119581, 2.00995528, 4.84529417,
        2.38146401, 5.03593986, 2.95538798, 4.79499169, 3.24137003,
        6.13982856, 4.427772  , 6.11182582, 3.85459295, 5.21345766,
        2.7528743 ],
       [5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5.61453408, 4.55177006, 2.46390665, 1.13416853,
        5.53723606, 3.99462307, 3.63119581, 2.00995528, 4.84529417,
        2.38146401, 5.03593986, 2.95538798, 4.79499169, 3.24137003,
        6.13982856, 4.427772  , 6.11182582, 3.85459295, 5.21345766,
        2.7528743 ],
       [5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5

Sequential sampling (1 chains in 1 job)
CompoundStep
>NUTS: [beta_clust0, beta_stage0, beta_age, beta_chemo, beta_rt, beta_brachy, mu_clust, props, theta_a, theta_b, lambda0]
>CategoricalGibbsMetropolis: [clust]


Sampling 1 chain for 2_000 tune and 2_000 draw iterations (2_000 + 2_000 draws total) took 1707 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [126]:
# Posterior summary table for the `os_outputs4` trace, restricted to the
# variables listed in `var_names`.
pm.summary(
    os_outputs4['trace'],
    var_names=[
        "props",
        "beta_clust",
        "beta_stage",
        "beta_age",
        "beta_chemo",
        "beta_rt",
        "beta_brachy",
    ],
)



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
props[0],0.419,0.039,0.346,0.495,0.003,0.002,233.0,441.0,
props[1],0.252,0.038,0.183,0.321,0.007,0.005,25.0,293.0,
props[2],0.329,0.036,0.26,0.393,0.003,0.002,160.0,974.0,
beta_clust[0],0.0,0.0,0.0,0.0,0.0,0.0,2000.0,2000.0,
beta_clust[1],-0.847,0.338,-1.45,-0.194,0.011,0.008,873.0,993.0,
beta_clust[2],-0.478,0.296,-1.036,0.06,0.011,0.008,758.0,921.0,
beta_stage[0],0.0,0.0,0.0,0.0,0.0,0.0,2000.0,2000.0,
beta_stage[1],0.573,0.698,-0.715,1.804,0.023,0.017,952.0,1045.0,
beta_stage[2],1.115,0.343,0.474,1.771,0.011,0.008,972.0,1070.0,
beta_stage[3],2.869,0.446,2.106,3.769,0.016,0.011,812.0,1168.0,


In [136]:
# Repeat of the 3-cluster fit on `os_inputs` with the same arguments as the
# `os_outputs4` call (2000 tuning + 2000 posterior draws, single chain).
# No seed is passed, and the resulting summary differs from that earlier run.
# NOTE(review): confirm whether fit_survcluster_model fixes a seed internally.
os_outputs5 = fit_survcluster_model(
    os_inputs,
    nclusts=3,
    ncenters=20,
    interval_length=0.2,
    epsilon=1e-6,
    ndraw=2000,
    ntune=2000,
)

point={'beta_clust0': array([0., 0.]), 'beta_stage0': array([0., 0., 0.]), 'beta_age': array(0.), 'beta_chemo': array(0.), 'beta_rt': array(0.), 'beta_brachy': array(0.), 'mu_clust_log__': array([[5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5.61453408, 4.55177006, 2.46390665, 1.13416853,
        5.53723606, 3.99462307, 3.63119581, 2.00995528, 4.84529417,
        2.38146401, 5.03593986, 2.95538798, 4.79499169, 3.24137003,
        6.13982856, 4.427772  , 6.11182582, 3.85459295, 5.21345766,
        2.7528743 ],
       [5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5.61453408, 4.55177006, 2.46390665, 1.13416853,
        5.53723606, 3.99462307, 3.63119581, 2.00995528, 4.84529417,
        2.38146401, 5.03593986, 2.95538798, 4.79499169, 3.24137003,
        6.13982856, 4.427772  , 6.11182582, 3.85459295, 5.21345766,
        2.7528743 ],
       [5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5

Sequential sampling (1 chains in 1 job)
CompoundStep
>NUTS: [beta_clust0, beta_stage0, beta_age, beta_chemo, beta_rt, beta_brachy, mu_clust, props, theta_a, theta_b, lambda0]
>CategoricalGibbsMetropolis: [clust]


Sampling 1 chain for 2_000 tune and 2_000 draw iterations (2_000 + 2_000 draws total) took 2009 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [137]:
# Posterior summary table for the `os_outputs5` trace, restricted to the
# variables listed in `var_names`.
pm.summary(
    os_outputs5['trace'],
    var_names=[
        "props",
        "beta_clust",
        "beta_stage",
        "beta_age",
        "beta_chemo",
        "beta_rt",
        "beta_brachy",
    ],
)



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
props[0],0.218,0.031,0.162,0.276,0.002,0.002,188.0,877.0,
props[1],0.35,0.037,0.287,0.421,0.003,0.002,164.0,768.0,
props[2],0.433,0.036,0.368,0.502,0.001,0.001,943.0,1182.0,
beta_clust[0],0.0,0.0,0.0,0.0,0.0,0.0,2000.0,2000.0,
beta_clust[1],0.041,0.345,-0.595,0.675,0.019,0.014,314.0,586.0,
beta_clust[2],0.349,0.311,-0.2,0.992,0.018,0.014,306.0,403.0,
beta_stage[0],0.0,0.0,0.0,0.0,0.0,0.0,2000.0,2000.0,
beta_stage[1],0.55,0.683,-0.743,1.779,0.023,0.016,918.0,911.0,
beta_stage[2],1.06,0.341,0.436,1.689,0.013,0.009,715.0,763.0,
beta_stage[3],2.821,0.453,2.016,3.71,0.019,0.013,600.0,657.0,


In [141]:
# Fit the survival-clustering model on `os_inputs` with 2 clusters, a shorter
# interval_length (0.1) and 1000 tuning + 1000 posterior draws (single chain).
# No seed is passed in this call.
os_outputs6 = fit_survcluster_model(
    os_inputs,
    nclusts=2,
    ncenters=20,
    interval_length=0.1,
    epsilon=1e-6,
    ndraw=1000,
    ntune=1000,
)

point={'beta_clust0': array([0.]), 'beta_stage0': array([0., 0., 0.]), 'beta_age': array(0.), 'beta_chemo': array(0.), 'beta_rt': array(0.), 'beta_brachy': array(0.), 'mu_clust_log__': array([[5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5.61453408, 4.55177006, 2.46390665, 1.13416853,
        5.53723606, 3.99462307, 3.63119581, 2.00995528, 4.84529417,
        2.38146401, 5.03593986, 2.95538798, 4.79499169, 3.24137003,
        6.13982856, 4.427772  , 6.11182582, 3.85459295, 5.21345766,
        2.7528743 ],
       [5.74134442, 2.3838835 , 4.46843508, 1.23147393, 2.59370369,
        1.00939819, 5.61453408, 4.55177006, 2.46390665, 1.13416853,
        5.53723606, 3.99462307, 3.63119581, 2.00995528, 4.84529417,
        2.38146401, 5.03593986, 2.95538798, 4.79499169, 3.24137003,
        6.13982856, 4.427772  , 6.11182582, 3.85459295, 5.21345766,
        2.7528743 ]]), 'props_simplex__': array([0.]), 'clust': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

Sequential sampling (1 chains in 1 job)
CompoundStep
>NUTS: [beta_clust0, beta_stage0, beta_age, beta_chemo, beta_rt, beta_brachy, mu_clust, props, theta_a, theta_b, lambda0]
>BinaryGibbsMetropolis: [clust]


Sampling 1 chain for 1_000 tune and 1_000 draw iterations (1_000 + 1_000 draws total) took 776 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [142]:
# Posterior summary table for the 2-cluster `os_outputs6` trace, restricted
# to the variables listed in `var_names`.
pm.summary(
    os_outputs6['trace'],
    var_names=[
        "props",
        "beta_clust",
        "beta_stage",
        "beta_age",
        "beta_chemo",
        "beta_rt",
        "beta_brachy",
    ],
)



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
props[0],0.499,0.039,0.425,0.573,0.001,0.001,951.0,466.0,
props[1],0.501,0.039,0.427,0.575,0.001,0.001,951.0,466.0,
beta_clust[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_clust[1],-0.547,0.245,-0.986,-0.064,0.007,0.005,1169.0,662.0,
beta_stage[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_stage[1],0.696,0.661,-0.558,1.884,0.021,0.015,1031.0,738.0,
beta_stage[2],0.986,0.317,0.41,1.608,0.012,0.008,725.0,743.0,
beta_stage[3],2.818,0.431,1.995,3.507,0.018,0.013,593.0,645.0,
beta_age,0.165,0.119,-0.058,0.372,0.004,0.003,1056.0,628.0,
beta_chemo,-0.973,0.325,-1.573,-0.353,0.013,0.009,671.0,676.0,


In [132]:
# Fit the survival-clustering model on `pfs_inputs` with 3 clusters
# (2000 tuning + 2000 posterior draws; the log reports a single chain).
# No seed is passed in this call.
pfs_outputs2 = fit_survcluster_model(
    pfs_inputs,
    nclusts=3,
    ncenters=20,
    interval_length=0.2,
    epsilon=1e-6,
    ndraw=2000,
    ntune=2000,
)

point={'beta_clust0': array([0., 0.]), 'beta_stage0': array([0., 0., 0.]), 'beta_age': array(0.), 'beta_chemo': array(0.), 'beta_rt': array(0.), 'beta_brachy': array(0.), 'mu_clust_log__': array([[5.68196172, 2.24199047, 4.48661082, 1.18186252, 2.61167384,
        1.02248025, 5.59179312, 4.53887048, 2.45747758, 1.10592862,
        5.5459417 , 4.01413411, 3.62124397, 2.01490522, 4.83820893,
        2.37418687, 5.05412827, 2.97242243, 4.77933392, 3.23039384,
        6.15899309, 4.37556219, 6.12371957, 3.82459632, 5.22114322,
        2.75491914],
       [5.68196172, 2.24199047, 4.48661082, 1.18186252, 2.61167384,
        1.02248025, 5.59179312, 4.53887048, 2.45747758, 1.10592862,
        5.5459417 , 4.01413411, 3.62124397, 2.01490522, 4.83820893,
        2.37418687, 5.05412827, 2.97242243, 4.77933392, 3.23039384,
        6.15899309, 4.37556219, 6.12371957, 3.82459632, 5.22114322,
        2.75491914],
       [5.68196172, 2.24199047, 4.48661082, 1.18186252, 2.61167384,
        1.02248025, 5

Sequential sampling (1 chains in 1 job)
CompoundStep
>NUTS: [beta_clust0, beta_stage0, beta_age, beta_chemo, beta_rt, beta_brachy, mu_clust, props, theta_a, theta_b, lambda0]
>CategoricalGibbsMetropolis: [clust]


Sampling 1 chain for 2_000 tune and 2_000 draw iterations (2_000 + 2_000 draws total) took 1957 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [133]:
# Posterior summary table for the 3-cluster `pfs_outputs2` trace, restricted
# to the variables listed in `var_names`.
pm.summary(
    pfs_outputs2['trace'],
    var_names=[
        "props",
        "beta_clust",
        "beta_stage",
        "beta_age",
        "beta_chemo",
        "beta_rt",
        "beta_brachy",
    ],
)



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
props[0],0.262,0.036,0.2,0.334,0.003,0.002,140.0,499.0,
props[1],0.325,0.036,0.262,0.395,0.002,0.001,299.0,892.0,
props[2],0.413,0.037,0.347,0.484,0.002,0.001,543.0,872.0,
beta_clust[0],0.0,0.0,0.0,0.0,0.0,0.0,2000.0,2000.0,
beta_clust[1],0.013,0.303,-0.525,0.609,0.012,0.009,604.0,1395.0,
beta_clust[2],-0.131,0.297,-0.678,0.442,0.013,0.009,524.0,645.0,
beta_stage[0],0.0,0.0,0.0,0.0,0.0,0.0,2000.0,2000.0,
beta_stage[1],-0.347,0.843,-1.909,1.091,0.023,0.019,1511.0,1240.0,
beta_stage[2],1.47,0.306,0.914,2.041,0.009,0.007,1107.0,1114.0,
beta_stage[3],2.688,0.425,1.903,3.499,0.017,0.012,652.0,796.0,


In [143]:
# Fit the survival-clustering model on `pfs_inputs` with 2 clusters, a shorter
# interval_length (0.1) and 1000 tuning + 1000 posterior draws (single chain).
# No seed is passed in this call.
pfs_outputs3 = fit_survcluster_model(
    pfs_inputs,
    nclusts=2,
    ncenters=20,
    interval_length=0.1,
    epsilon=1e-6,
    ndraw=1000,
    ntune=1000,
)

point={'beta_clust0': array([0.]), 'beta_stage0': array([0., 0., 0.]), 'beta_age': array(0.), 'beta_chemo': array(0.), 'beta_rt': array(0.), 'beta_brachy': array(0.), 'mu_clust_log__': array([[5.68196172, 2.24199047, 4.48661082, 1.18186252, 2.61167384,
        1.02248025, 5.59179312, 4.53887048, 2.45747758, 1.10592862,
        5.5459417 , 4.01413411, 3.62124397, 2.01490522, 4.83820893,
        2.37418687, 5.05412827, 2.97242243, 4.77933392, 3.23039384,
        6.15899309, 4.37556219, 6.12371957, 3.82459632, 5.22114322,
        2.75491914],
       [5.68196172, 2.24199047, 4.48661082, 1.18186252, 2.61167384,
        1.02248025, 5.59179312, 4.53887048, 2.45747758, 1.10592862,
        5.5459417 , 4.01413411, 3.62124397, 2.01490522, 4.83820893,
        2.37418687, 5.05412827, 2.97242243, 4.77933392, 3.23039384,
        6.15899309, 4.37556219, 6.12371957, 3.82459632, 5.22114322,
        2.75491914]]), 'props_simplex__': array([0.]), 'clust': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

Sequential sampling (1 chains in 1 job)
CompoundStep
>NUTS: [beta_clust0, beta_stage0, beta_age, beta_chemo, beta_rt, beta_brachy, mu_clust, props, theta_a, theta_b, lambda0]
>BinaryGibbsMetropolis: [clust]


Sampling 1 chain for 1_000 tune and 1_000 draw iterations (1_000 + 1_000 draws total) took 670 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [144]:
# Posterior summary table for the 2-cluster `pfs_outputs3` trace, restricted
# to the variables listed in `var_names`.
pm.summary(
    pfs_outputs3['trace'],
    var_names=[
        "props",
        "beta_clust",
        "beta_stage",
        "beta_age",
        "beta_chemo",
        "beta_rt",
        "beta_brachy",
    ],
)



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
props[0],0.492,0.037,0.419,0.556,0.002,0.002,231.0,365.0,
props[1],0.508,0.037,0.444,0.581,0.002,0.002,231.0,365.0,
beta_clust[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_clust[1],-0.586,0.245,-1.081,-0.166,0.013,0.01,336.0,432.0,
beta_stage[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_stage[1],-0.22,0.782,-1.684,1.226,0.033,0.027,569.0,496.0,
beta_stage[2],1.361,0.292,0.821,1.919,0.015,0.011,363.0,389.0,
beta_stage[3],2.782,0.418,1.979,3.567,0.025,0.018,269.0,257.0,
beta_age,-0.01,0.115,-0.212,0.21,0.006,0.005,409.0,478.0,
beta_chemo,-0.635,0.315,-1.271,-0.078,0.022,0.016,208.0,187.0,


In [134]:
# Fit the survival-clustering model on `dss_inputs` with 3 clusters
# (2000 tuning + 2000 posterior draws; the log reports a single chain).
# No seed is passed in this call.
dss_outputs2 = fit_survcluster_model(
    dss_inputs,
    nclusts=3,
    ncenters=20,
    interval_length=0.2,
    epsilon=1e-6,
    ndraw=2000,
    ntune=2000,
)

point={'beta_clust0': array([0., 0.]), 'beta_stage0': array([0., 0., 0.]), 'beta_age': array(0.), 'beta_chemo': array(0.), 'beta_rt': array(0.), 'beta_brachy': array(0.), 'mu_clust_log__': array([[5.75870179, 2.31838358, 4.46375287, 1.24355049, 2.60599007,
        1.03637965, 5.60291106, 4.55877398, 2.46794208, 1.15072592,
        5.54594424, 4.0232128 , 3.6484534 , 2.04259638, 4.85501928,
        2.39959025, 5.05330594, 2.98003355, 4.80937621, 3.27254467,
        6.14341007, 4.42421188, 6.11291037, 3.83479628, 5.2219894 ,
        2.75504639],
       [5.75870179, 2.31838358, 4.46375287, 1.24355049, 2.60599007,
        1.03637965, 5.60291106, 4.55877398, 2.46794208, 1.15072592,
        5.54594424, 4.0232128 , 3.6484534 , 2.04259638, 4.85501928,
        2.39959025, 5.05330594, 2.98003355, 4.80937621, 3.27254467,
        6.14341007, 4.42421188, 6.11291037, 3.83479628, 5.2219894 ,
        2.75504639],
       [5.75870179, 2.31838358, 4.46375287, 1.24355049, 2.60599007,
        1.03637965, 5

Sequential sampling (1 chains in 1 job)
CompoundStep
>NUTS: [beta_clust0, beta_stage0, beta_age, beta_chemo, beta_rt, beta_brachy, mu_clust, props, theta_a, theta_b, lambda0]
>CategoricalGibbsMetropolis: [clust]


Sampling 1 chain for 2_000 tune and 2_000 draw iterations (2_000 + 2_000 draws total) took 1915 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [135]:
# Posterior summary table for the 3-cluster `dss_outputs2` trace, restricted
# to the variables listed in `var_names`.
pm.summary(
    dss_outputs2['trace'],
    var_names=[
        "props",
        "beta_clust",
        "beta_stage",
        "beta_age",
        "beta_chemo",
        "beta_rt",
        "beta_brachy",
    ],
)



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
props[0],0.428,0.036,0.361,0.496,0.001,0.001,880.0,1352.0,
props[1],0.352,0.034,0.286,0.414,0.002,0.001,272.0,1203.0,
props[2],0.22,0.032,0.163,0.282,0.002,0.001,428.0,1111.0,
beta_clust[0],0.0,0.0,0.0,0.0,0.0,0.0,2000.0,2000.0,
beta_clust[1],-0.439,0.362,-1.153,0.213,0.011,0.008,1096.0,1424.0,
beta_clust[2],-1.128,0.459,-2.017,-0.317,0.013,0.009,1216.0,1174.0,
beta_stage[0],0.0,0.0,0.0,0.0,0.0,0.0,2000.0,2000.0,
beta_stage[1],0.444,1.287,-1.971,2.65,0.031,0.027,1983.0,1359.0,
beta_stage[2],1.694,0.419,0.956,2.528,0.011,0.008,1604.0,1161.0,
beta_stage[3],3.35,0.54,2.323,4.326,0.016,0.012,1094.0,1140.0,


In [145]:
# Fit the survival-clustering model on `dss_inputs` with 2 clusters, a shorter
# interval_length (0.1) and 1000 tuning + 1000 posterior draws (single chain).
# No seed is passed in this call.
dss_outputs3 = fit_survcluster_model(
    dss_inputs,
    nclusts=2,
    ncenters=20,
    interval_length=0.1,
    epsilon=1e-6,
    ndraw=1000,
    ntune=1000,
)

point={'beta_clust0': array([0.]), 'beta_stage0': array([0., 0., 0.]), 'beta_age': array(0.), 'beta_chemo': array(0.), 'beta_rt': array(0.), 'beta_brachy': array(0.), 'mu_clust_log__': array([[5.75870179, 2.31838358, 4.46375287, 1.24355049, 2.60599007,
        1.03637965, 5.60291106, 4.55877398, 2.46794208, 1.15072592,
        5.54594424, 4.0232128 , 3.6484534 , 2.04259638, 4.85501928,
        2.39959025, 5.05330594, 2.98003355, 4.80937621, 3.27254467,
        6.14341007, 4.42421188, 6.11291037, 3.83479628, 5.2219894 ,
        2.75504639],
       [5.75870179, 2.31838358, 4.46375287, 1.24355049, 2.60599007,
        1.03637965, 5.60291106, 4.55877398, 2.46794208, 1.15072592,
        5.54594424, 4.0232128 , 3.6484534 , 2.04259638, 4.85501928,
        2.39959025, 5.05330594, 2.98003355, 4.80937621, 3.27254467,
        6.14341007, 4.42421188, 6.11291037, 3.83479628, 5.2219894 ,
        2.75504639]]), 'props_simplex__': array([0.]), 'clust': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

Sequential sampling (1 chains in 1 job)
CompoundStep
>NUTS: [beta_clust0, beta_stage0, beta_age, beta_chemo, beta_rt, beta_brachy, mu_clust, props, theta_a, theta_b, lambda0]
>BinaryGibbsMetropolis: [clust]


Sampling 1 chain for 1_000 tune and 1_000 draw iterations (1_000 + 1_000 draws total) took 954 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [146]:
# Posterior summary table for the 2-cluster `dss_outputs3` trace, restricted
# to the variables listed in `var_names`.
pm.summary(
    dss_outputs3['trace'],
    var_names=[
        "props",
        "beta_clust",
        "beta_stage",
        "beta_age",
        "beta_chemo",
        "beta_rt",
        "beta_brachy",
    ],
)



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
props[0],0.521,0.036,0.449,0.587,0.002,0.001,564.0,741.0,
props[1],0.479,0.036,0.413,0.551,0.002,0.001,564.0,741.0,
beta_clust[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_clust[1],-0.644,0.288,-1.211,-0.08,0.012,0.009,589.0,504.0,
beta_stage[0],0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,
beta_stage[1],0.022,1.305,-2.074,2.271,0.072,0.068,464.0,356.0,
beta_stage[2],1.439,0.386,0.79,2.21,0.023,0.017,309.0,547.0,
beta_stage[3],3.31,0.533,2.376,4.321,0.034,0.025,264.0,395.0,
beta_age,0.034,0.14,-0.219,0.298,0.005,0.004,823.0,633.0,
beta_chemo,-1.258,0.433,-2.07,-0.395,0.022,0.016,384.0,458.0,


In [138]:
# Build the two export tables from the 3-cluster DSS fit (`dss_outputs2`):
# a table of posterior variables and a table of cluster assignments labelled
# with the row index of the counts input.
# NOTE(review): create_trace_table is called with its default
# `export_variables`; the definition at the top of the notebook includes
# 'beta_histotype' in that default — verify that variable exists in this trace.
test_trace_table = create_trace_table(dss_outputs2['trace'])
test_cluster_assignments = extract_cluster_assignments(
    dss_outputs2['trace'],
    sample_names=dss_inputs['counts'].index,
)

In [139]:
# Display the cluster assignment table: one row per `acc_num`, with integer
# columns 0..1999 (presumably one per posterior draw — TODO confirm against
# extract_cluster_assignments). pandas abbreviates the middle rows/columns.
test_cluster_assignments

Unnamed: 0,acc_num,0,1,2,3,4,5,6,7,8,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,02S-2772,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1621020349,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,1621020869,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
3,1621021457,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
4,16RS-25610,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,VS17-3680,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,2
202,VS17-3718,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
203,VS17-4669,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
204,VS17-57,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


In [140]:
# Write both test tables as tab-separated files under `trace_output_dir`.
# Create the directory first: `to_csv` does not create missing parent
# directories and would raise on a fresh checkout. `exist_ok=True` makes the
# cell safe to re-run.
os.makedirs(trace_output_dir, exist_ok=True)

# The DataFrame index is written as the first (unnamed) column in both files,
# since `index` is left at its default.
test_trace_table.to_csv(
    os.path.join(trace_output_dir, 'test_trace_table.tsv'),
    sep='\t',
)
test_cluster_assignments.to_csv(
    os.path.join(trace_output_dir, 'test_cluster_assignments.tsv'),
    sep='\t',
)

In [147]:
# Alias the 2-cluster fits (nclusts=2, interval_length=0.1) for each outcome
# under un-numbered names.
os_outputs, pfs_outputs, dss_outputs = os_outputs6, pfs_outputs3, dss_outputs3