# Model diagnostics

## Setup

### Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from time import time

import pandas as pd

In [3]:
from speclet.analysis.sublineage_model_analysis import load_sublineage_model_posteriors
from speclet.managers.posterior_data_manager import PosteriorDataManager as PostDataMan

In [4]:
# Notebook execution timer: record the wall-clock start time so that the final
# timing cell can report how long the whole notebook took to run.
notebook_tic = time()

### Data

#### Model posteriors

In [5]:
# Load the posteriors of the sublineage models; the individual posterior data
# managers are reached through the returned object's `posteriors` attribute.
postmen = load_sublineage_model_posteriors()

In [6]:
# Number of entries in the loaded collection of model posteriors.
len(postmen)

43

## Analysis

In [7]:
def posterior_dims_to_dataframe(pm: PostDataMan) -> pd.DataFrame:
    """Summarize the dimension sizes of one model's posterior as a one-row table.

    Args:
        pm (PostDataMan): Posterior data manager; its `trace.posterior` and `id`
            attributes are read.

    Returns:
        pd.DataFrame: A single row (index 0) with one column per posterior dimension
        holding that dimension's length, plus a `lineage_subtype` column set to
        `pm.id`.
    """
    # NOTE(review): assumes `pm.trace.posterior` is an xarray Dataset (ArviZ
    # InferenceData group). `Dataset.sizes` is the stable name -> length mapping,
    # whereas `Dataset.dims` is deprecated for this use and slated to return only
    # the dimension names in future xarray releases.
    dim_sizes = dict(pm.trace.posterior.sizes)
    return pd.DataFrame(dim_sizes, index=[0]).assign(lineage_subtype=pm.id)

In [8]:
# Build the one-row dimension table of every model, then stack them into a single
# data frame whose rows are numbered consecutively from zero.
posterior_dim_frames = [posterior_dims_to_dataframe(pm) for pm in postmen.posteriors]
posterior_dims = pd.concat(posterior_dim_frames, ignore_index=True)

In [9]:
# Summary statistics of each dimension's size across all models, rounded to one
# decimal place.
posterior_dims.describe().round(1)

Unnamed: 0,chain,draw,delta_genes_dim_0,delta_genes_dim_1,sgrna,delta_cells_dim_0,delta_cells_dim_1,cell_chrom,genes_chol_cov_dim_0,cells_chol_cov_dim_0,genes_chol_cov_corr_dim_0,genes_chol_cov_corr_dim_1,genes_chol_cov_stds_dim_0,gene,cancer_gene,cells_chol_cov_corr_dim_0,cells_chol_cov_corr_dim_1,cells_chol_cov_stds_dim_0,cell_line
count,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,19.0,43.0,43.0,43.0,43.0
mean,4.0,1000.0,6.3,18119.0,71062.0,2.0,19.4,446.1,29.0,3.0,6.3,6.3,6.3,18119.0,5.1,2.0,2.0,2.0,19.4
std,0.0,0.0,3.6,0.0,0.0,0.0,17.5,403.5,34.2,0.0,3.6,3.6,3.6,0.0,3.8,0.0,0.0,0.0,17.5
min,4.0,1000.0,4.0,18119.0,71062.0,2.0,4.0,92.0,10.0,3.0,4.0,4.0,4.0,18119.0,1.0,2.0,2.0,2.0,4.0
25%,4.0,1000.0,4.0,18119.0,71062.0,2.0,5.0,115.0,10.0,3.0,4.0,4.0,4.0,18119.0,1.5,2.0,2.0,2.0,5.0
50%,4.0,1000.0,4.0,18119.0,71062.0,2.0,15.0,345.0,10.0,3.0,4.0,4.0,4.0,18119.0,5.0,2.0,2.0,2.0,15.0
75%,4.0,1000.0,8.0,18119.0,71062.0,2.0,26.5,609.5,36.5,3.0,8.0,8.0,8.0,18119.0,8.0,2.0,2.0,2.0,26.5
max,4.0,1000.0,16.0,18119.0,71062.0,2.0,83.0,1909.0,136.0,3.0,16.0,16.0,16.0,18119.0,12.0,2.0,2.0,2.0,83.0


In [10]:
# Back-of-the-envelope count of model variables using fixed, round dimension sizes.
n_cell_lines, n_chromosomes, n_cancer_genes = 20, 23, 5
n_sgrnas, n_genes = 72_000, 18_000

# Two variables per cell line plus two per (cell line, chromosome) pair.
n_cell_vars = 2 * n_cell_lines * (1 + n_chromosomes)
# Four variables per gene plus one per sgRNA.
n_gene_vars = 4 * n_genes + n_sgrnas
# One variable per (gene, cancer gene) pair.
n_comut_vars = n_cancer_genes * n_genes
total = sum((n_cell_vars, n_gene_vars, n_comut_vars))

n_cell_vars, n_gene_vars, n_comut_vars, total

(960, 144000, 90000, 234960)

---

In [11]:
# Stop the notebook timer and report the elapsed wall-clock time since
# `notebook_tic` was recorded, converted from seconds to minutes.
notebook_toc = time()
print(f"execution time: {(notebook_toc - notebook_tic) / 60:.2f} minutes")

execution time: 0.39 minutes


In [12]:
# Record the execution environment for reproducibility: date of last update, Python
# and IPython versions, versions of the imported packages, Git branch, hostname,
# and machine information.
%load_ext watermark
%watermark -d -u -v -iv -b -h -m

Last updated: 2022-10-18

Python implementation: CPython
Python version       : 3.10.6
IPython version      : 8.5.0

Compiler    : GCC 10.4.0
OS          : Linux
Release     : 3.10.0-1160.76.1.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 28
Architecture: 64bit

Hostname: compute-e-16-231.o2.rc.hms.harvard.edu

Git branch: figures

pandas: 1.4.4

