# Miscellaneous helpers for analyzing the lineage models

## Setup

### Imports

In [1]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported before each cell executes (no kernel restart needed).
%load_ext autoreload
%autoreload 2

In [2]:
import re
from time import time

import numpy as np
import seaborn as sns

In [3]:
from speclet.io import project_root
from speclet.managers.posterior_data_manager import PosteriorDataManagers as PostDataMen
from speclet.model_configuration import read_model_configurations
from speclet.plot import set_speclet_theme
from speclet.project_configuration import get_model_configuration_file
from speclet.project_enums import ModelFitMethod

In [4]:
# Notebook execution timer: start timestamp, compared against a second
# `time()` reading in the final timing cell.
notebook_tic = time()

# Plotting setup: apply the project's plotting theme and request "retina"
# figure output from the inline backend.
set_speclet_theme()
%config InlineBackend.figure_format = "retina"

# Constants
# Seed NumPy's global random state once so random draws are repeatable.
RANDOM_SEED = 709
np.random.seed(RANDOM_SEED)

# File paths
# Model configuration file, joined onto the value returned by `project_root()`.
config_path = project_root() / get_model_configuration_file()

### Data

#### Model posteriors

In [5]:
# Read the model configurations (active ones only) from the config file and
# report how many were found.
model_configs = read_model_configurations(config_path, active_only=True)
n_configs = len(model_configs.configurations)
f"Number of configurations: {n_configs}"

'Number of configurations: 43'

In [6]:
# Alphabetically sorted names of all loaded model configurations.
model_names = sorted(config.name for config in model_configs.configurations)

# The sub-lineage label is whatever follows the "hnb-single-lineage-" prefix
# of the model name, with underscores shown as spaces.
pattern = r"(?<=hnb-single-lineage-).*$"
sublineage_names = [
    re.findall(pattern, name)[0].replace("_", " ") for name in model_names
]

# One posterior data manager per model, keyed by its sub-lineage label.
postmen = PostDataMen(
    names=model_names,
    fit_methods=ModelFitMethod.PYMC_NUMPYRO,
    config_paths=config_path,
    keys=sublineage_names,
)

In [7]:
# Show the keys held by the posterior data managers (the sub-lineage labels
# passed as `keys` when `postmen` was created).
postmen.keys

['bile duct (cholangiocarcinoma)',
 'bile duct (gallbladder adenocarcinoma)',
 'blood (ALL)',
 'blood (AML)',
 'blood (CLL)',
 'blood (CML)',
 'bone (Ewing sarcoma)',
 'bone (chordoma)',
 'bone (osteosarcoma)',
 'breast',
 'central nervous system (glioma)',
 'central nervous system (medulloblastoma)',
 'cervix (cervical carcinoma)',
 'cervix (cervical squamous)',
 'colorectal',
 'esophagus (esophagus adenocarcinoma)',
 'esophagus (esophagus squamous)',
 'eye (uveal melanoma)',
 'gastric (gastric adenocarcinoma)',
 'kidney (renal cell carcinoma)',
 'liver (hepatocellular carcinoma)',
 'lung (NSCLC)',
 'lung (SCLC)',
 'lung (mesothelioma)',
 'lymphocyte (hodgkin lymphoma)',
 'lymphocyte (lymphoma unspecified)',
 'lymphocyte (non hodgkin lymphoma)',
 'ovary (ovary adenocarcinoma)',
 'pancreas',
 'peripheral nervous system (neuroblastoma)',
 'plasma cell (multiple myeloma)',
 'prostate',
 'skin (melanoma)',
 'skin (skin squamous)',
 'soft tissue (ATRT)',
 'soft tissue (liposarcoma)',
 'sof

In [8]:
# Number of posterior data managers; compare with the configuration count
# reported when the model configurations were loaded.
len(postmen)

43

In [9]:
# Map every sub-lineage ID to its parent lineage, i.e. the text in front of
# the first " (" (IDs without a parenthetical map to themselves).
sub_to_lineage = {
    post.id: post.id.partition(" (")[0] for post in postmen.posteriors
}

# Unique parent lineages in alphabetical order.
lineages = sorted(set(sub_to_lineage.values()))

### Generate color palettes

In [10]:
# Sub-lineage palette: one "Spectral" color per posterior, assigned to the
# sub-lineage names in their existing (sorted) order.
sublineage_cmap = sns.color_palette("Spectral", n_colors=len(postmen), as_cmap=False)
sublineage_pal = {
    name: sublineage_cmap[idx] for idx, name in enumerate(sublineage_names)
}

# Lineage palette: one "terrain" color per parent lineage, same scheme.
lineage_cmap = sns.color_palette("terrain", n_colors=len(lineages), as_cmap=False)
lineage_pal = {name: lineage_cmap[idx] for idx, name in enumerate(lineages)}

In [11]:
# Inspect the sub-lineage palette (sub-lineage name -> color tuple).
sublineage_pal

{'bile duct (cholangiocarcinoma)': (0.6618992695117263,
  0.050826605151864664,
  0.26881968473663975),
 'bile duct (gallbladder adenocarcinoma)': (0.7126489811610919,
  0.10711264898116109,
  0.28081507112648985),
 'blood (ALL)': (0.7633986928104575, 0.1633986928104575, 0.2928104575163399),
 'blood (AML)': (0.8141484044598232, 0.2196847366397539, 0.3048058439061899),
 'blood (CLL)': (0.8519800076893502, 0.26843521722414454, 0.3033448673587082),
 'blood (CML)': (0.8758169934640523, 0.3045751633986928, 0.29411764705882354),
 'bone (Ewing sarcoma)': (0.9044213763936948,
  0.34794309880815066,
  0.28304498269896194),
 'bone (chordoma)': (0.9330257593233372,
  0.3913110342176086,
  0.27197231833910035),
 'bone (osteosarcoma)': (0.958246828143022,
  0.43744713571703187,
  0.267358708189158),
 'breast': (0.9665513264129182, 0.49742406766628217, 0.295040369088812),
 'central nervous system (glioma)': (0.9748558246828143,
  0.5574009996155325,
  0.32272202998846594),
 'central nervous system (

In [12]:
# Inspect the lineage palette (lineage name -> color tuple).
lineage_pal

{'bile duct': (0.15294117647058825, 0.29411764705882354, 0.6941176470588235),
 'blood': (0.10065359477124183, 0.39869281045751637, 0.7986928104575164),
 'bone': (0.04836601307189542, 0.5032679738562091, 0.9032679738562092),
 'breast': (0.0, 0.6058823529411764, 0.9823529411764705),
 'central nervous system': (0.0, 0.6843137254901961, 0.747058823529412),
 'cervix': (0.0, 0.7627450980392158, 0.5117647058823529),
 'colorectal': (0.06666666666666667, 0.8133333333333334, 0.41333333333333333),
 'esophagus': (0.2235294117647059, 0.8447058823529412, 0.4447058823529412),
 'eye': (0.3803921568627451, 0.876078431372549, 0.476078431372549),
 'gastric': (0.5372549019607841, 0.9074509803921569, 0.5074509803921569),
 'kidney': (0.6941176470588235, 0.9388235294117647, 0.5388235294117647),
 'liver': (0.8509803921568627, 0.9701960784313726, 0.5701960784313725),
 'lung': (0.996078431372549, 0.9949803921568627, 0.5978823529411764),
 'lymphocyte': (0.9254901960784314, 0.9046274509803922, 0.5597647058823529)

---

In [13]:
# Stop the notebook timer and report the wall-clock run time in minutes.
notebook_toc = time()
elapsed_minutes = (notebook_toc - notebook_tic) / 60
print(f"execution time: {elapsed_minutes:.2f} minutes")

execution time: 0.09 minutes


In [14]:
# Record provenance for this run: last-updated date, Python/IPython versions,
# machine details, hostname, git branch, and versions of imported packages.
%load_ext watermark
%watermark -d -u -v -iv -b -h -m

Last updated: 2022-09-18

Python implementation: CPython
Python version       : 3.10.6
IPython version      : 8.5.0

Compiler    : GCC 10.4.0
OS          : Linux
Release     : 3.10.0-1160.76.1.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 28
Architecture: 64bit

Hostname: compute-e-16-231.o2.rc.hms.harvard.edu

Git branch: figures

seaborn: 0.11.2
numpy  : 1.23.3
re     : 2.2.1

