# Comparing results to the Broad's data

## Setup

### Imports

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import re
from pathlib import Path
from time import time

import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotnine as gg

In [3]:
from speclet.io import DataFile, data_path, project_root
from speclet.managers.posterior_data_manager import (
    PosteriorDataManager,
    PosteriorDataManagers,
)
from speclet.model_configuration import read_model_configurations
from speclet.plot import set_speclet_theme
from speclet.project_configuration import arviz_config, get_model_configuration_file
from speclet.project_enums import ModelFitMethod

In [4]:
# Notebook execution timer: start timestamp, later subtracted from the end
# timestamp to report the total run time.
notebook_tic = time()

# Plotting setup: apply the project's plotting theme and render inline
# figures in "retina" format.
set_speclet_theme()
%config InlineBackend.figure_format = "retina"

# Constants and global configuration.
RANDOM_SEED = 709
np.random.seed(RANDOM_SEED)  # seeds NumPy's legacy global random state
arviz_config()  # project helper; presumably applies ArviZ settings — TODO confirm

# File paths: model configuration file, resolved relative to the project root.
config_path = project_root() / get_model_configuration_file()

### Data

#### Model posteriors

In [5]:
# Read the model configurations from the configuration file, requesting only
# the active ones, and display how many were found.
model_configs = read_model_configurations(config_path, active_only=True)
n_configs = len(model_configs.configurations)
f"Number of configurations: {n_configs}"

'Number of configurations: 5'

In [6]:
# Temporary subset of models to work with while the pipeline is running.
model_names = [
    "hnb-single-lineage-prostate-009",
    "hnb-single-lineage-colorectal-009",
]

# Capture everything that follows the "hnb-single-lineage-" prefix. Note that
# the captured text keeps the trailing model number (e.g. "prostate-009").
pattern = r"(?<=hnb-single-lineage-).*$"

# Take the first regex hit for each model name; a name without the expected
# prefix yields no hits and raises an IndexError here.
lineage_names = [
    hits[0] for hits in (re.findall(pattern, name) for name in model_names)
]

# Collection of posterior data managers for the selected models. Each model
# name is paired with the key derived from it; the same fit method and
# configuration file are passed for all of them.
postmen = PosteriorDataManagers(
    names=model_names,
    keys=lineage_names,
    fit_methods=ModelFitMethod.PYMC_NUMPYRO,
    config_paths=config_path,
)

#### CERES and Chronos data

In [7]:
# TODO: fix the DataFile.ACHILLES_GENE_EFFECT to point to real file.
# broad_crispr_effect = pd.read_csv(data_path(DataFile.ACHILLES_GENE_EFFECT))

# Until the TODO above is resolved, locate the gene effect table by file name
# through the project's data-path helper.
broad_crispr_file = data_path("crispr_gene_effect.csv")

# `low_memory=False` makes pandas parse the file in one pass rather than in
# chunks, which avoids mixed-dtype columns.
broad_crispr_effect = pd.read_csv(broad_crispr_file, low_memory=False)
broad_crispr_effect.head()

Unnamed: 0,depmap_id,hugo_symbol,chronos_gene_effect,ceres_gene_effect
0,ACH-000001,A1BG,-0.147896,-0.338801
1,ACH-000001,A1CF,0.054635,-0.061894
2,ACH-000001,A2M,0.00142,-0.025479
3,ACH-000001,A2ML1,-0.01339,-0.029948
4,ACH-000001,A3GALT2,-0.103499,-0.131452


## ...

---

In [8]:
# Stop the notebook timer and report the elapsed time in minutes, measured
# from the `notebook_tic` timestamp recorded during setup.
notebook_toc = time()
elapsed_minutes = (notebook_toc - notebook_tic) / 60
print(f"execution time: {elapsed_minutes:.2f} minutes")

execution time: 0.18 minutes


In [9]:
# Record the execution environment for reproducibility: last-updated date,
# Python/IPython versions, machine details, hostname, git branch, and the
# versions of the imported packages.
# NOTE(review): the `-h` flag writes the machine's hostname into the saved
# output — confirm that is acceptable before sharing the notebook.
%load_ext watermark
%watermark -d -u -v -iv -b -h -m

Last updated: 2022-08-12

Python implementation: CPython
Python version       : 3.10.5
IPython version      : 8.4.0

Compiler    : GCC 10.3.0
OS          : Linux
Release     : 3.10.0-1160.45.1.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 28
Architecture: 64bit

Hostname: compute-e-16-233.o2.rc.hms.harvard.edu

Git branch: expand-lineages

matplotlib: 3.5.2
re        : 2.2.1
pandas    : 1.4.3
arviz     : 0.12.1
numpy     : 1.23.1
plotnine  : 0.0.0

