# Exploring lineages and sublineages

## Setup

### Imports

In [1]:
# Reload edited modules automatically before each cell executes (autoreload mode 2).
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from time import time

import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Markdown, display

In [3]:
from speclet.io import DataFile, data_path, notebook_table_dir
from speclet.plot import set_speclet_theme

In [4]:
# Notebook execution timer: start time, read back in the "Session info" section
# to report the total run time.
notebook_tic = time()

# Plotting setup: presumably applies the speclet plotting theme (helper is defined
# in speclet.plot — confirm), then renders inline figures in "retina" format.
set_speclet_theme()
%config InlineBackend.figure_format = "retina"

# Constants
# Fixed seed for NumPy's global random number generator.
RANDOM_SEED = 847
np.random.seed(RANDOM_SEED)

In [5]:
# Directory used for the CSV table this notebook writes in the Analysis section.
# NOTE(review): presumably `notebook_table_dir` derives the location from the
# notebook name passed here — confirm in speclet.io.
output_dir = notebook_table_dir("001_020_lineage-exploration")

### Data

In [6]:
# Resolve the path of the DepMap modeling data file and display it.
modeling_data_file = data_path(DataFile.DEPMAP_DATA)
modeling_data_file

PosixPath('/n/data1/hms/dbmi/park/Cook/speclet/modeling_data/depmap-modeling-data.csv')

In [7]:
# Open the modeling data as a Dask dataframe. `age` is pinned to float64 instead
# of being left to dtype inference (presumably because the column contains
# missing values — TODO confirm). `low_memory=False` is forwarded to the pandas
# CSV parser.
modeling_data_dd = dd.read_csv(
    modeling_data_file, low_memory=False, dtype={"age": "float64"}
)
# Preview the first rows.
modeling_data_dd.head()

Unnamed: 0,sgrna,replicate_id,lfc,p_dna_batch,genome_alignment,hugo_symbol,screen,multiple_hits_on_gene,sgrna_target_chr,sgrna_target_pos,...,any_deleterious,any_tcga_hotspot,any_cosmic_hotspot,is_mutated,copy_number,lineage,lineage_subtype,primary_or_metastasis,is_male,age
0,AAACCTGCGGCGGTCGCCA,OVR3_c905R1,-0.299958,CRISPR_C6596666.sample,chr8_66505451_-,VXN,sanger,True,8,66505451,...,,,,False,1.139595,ovary,ovary_adenocarcinoma,metastasis,False,60.0
1,AACAGCACACCGGCCCCGT,OVR3_c905R1,0.267092,CRISPR_C6596666.sample,chrX_156009834_-,IL9R,sanger,True,X,156009834,...,,,,False,0.656377,ovary,ovary_adenocarcinoma,metastasis,False,60.0
2,AACCTCCGGACTCCTCAGC,OVR3_c905R1,0.550477,CRISPR_C6596666.sample,chr7_39609658_-,YAE1,sanger,True,7,39609658,...,,,,False,0.923715,ovary,ovary_adenocarcinoma,metastasis,False,60.0
3,AACTCAAACTGACGCCGAA,OVR3_c905R1,-0.391922,CRISPR_C6596666.sample,chr1_117623388_-,TENT5C,sanger,True,1,117623388,...,,,,False,1.352975,ovary,ovary_adenocarcinoma,metastasis,False,60.0
4,AACTGACCTTGAAACGCTG,OVR3_c905R1,-1.562577,CRISPR_C6596666.sample,chr16_66933623_+,CIAO2B,sanger,True,16,66933623,...,,,,False,1.157211,ovary,ovary_adenocarcinoma,metastasis,False,60.0


## Analysis

In [8]:
# One row per distinct (cell line, lineage, subtype) combination among the rows
# whose `screen` is "broad". `.compute()` materializes the Dask result as an
# in-memory pandas dataframe.
lineage_columns = ["depmap_id", "lineage", "lineage_subtype"]
is_broad_screen = modeling_data_dd["screen"] == "broad"
broad_lineages_dd = modeling_data_dd[is_broad_screen][lineage_columns]
lineage_data = broad_lineages_dd.drop_duplicates().compute()
lineage_data

Unnamed: 0,depmap_id,lineage,lineage_subtype
86746,ACH-000004,blood,AML
157808,ACH-000005,blood,AML
228871,ACH-000007,colorectal,colorectal_adenocarcinoma
117813,ACH-000009,colorectal,colorectal_adenocarcinoma
25860,ACH-000011,urinary_tract,bladder_carcinoma
...,...,...,...
36493,ACH-002460,skin,melanoma
107555,ACH-002508,skin,melanoma
178617,ACH-002510,skin,melanoma
249679,ACH-002512,skin,melanoma


In [9]:
# Number of cell lines per lineage that have no subtype annotation, shown as a
# one-column table indexed by lineage.
(
    lineage_data["lineage_subtype"]
    .isna()
    .groupby(lineage_data["lineage"])
    .sum()
    .to_frame("missing_subtype")
)

Unnamed: 0_level_0,missing_subtype
lineage,Unnamed: 1_level_1
bile_duct,0
blood,0
bone,0
breast,1
central_nervous_system,0
cervix,0
colorectal,0
epidermoid_carcinoma,0
esophagus,0
eye,0


In [10]:
# Count cell lines per (lineage, subtype) pair. Missing subtypes are labelled
# "NA" before grouping because `groupby` drops NaN keys by default, which would
# otherwise remove those cell lines from the counts.
# NOTE(review): "NA" is one of the strings pandas' `read_csv` parses as missing
# by default, so this label reads back as NaN from the CSV written from
# `lineage_counts` unless the reader passes `keep_default_na=False` — confirm
# that downstream readers expect that.
lineage_counts = (
    lineage_data.fillna({"lineage_subtype": "NA"})
    .groupby(["lineage", "lineage_subtype"])["depmap_id"]
    .count()
    .reset_index()
    # Alphabetical by lineage, then most frequent subtype first.
    .sort_values(["lineage", "depmap_id"], ascending=[True, False])
)

# Show one small table per lineage, headed by that lineage's total count.
for lineage, subtype_counts in lineage_counts.groupby("lineage"):
    total = subtype_counts["depmap_id"].sum()
    noun = "cell line" if total == 1 else "cell lines"  # avoid "1 cell lines"
    display(Markdown(f"**{lineage}** ({total} {noun})"))
    display(subtype_counts.reset_index(drop=True))
    display(Markdown("---"))

**bile_duct** (37 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,bile_duct,cholangiocarcinoma,31
1,bile_duct,gallbladder_adenocarcinoma,6


---

**blood** (55 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,blood,AML,26
1,blood,ALL,15
2,blood,CML,7
3,blood,CLL,4
4,blood,unspecified_leukemia,3


---

**bone** (30 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,bone,Ewing_sarcoma,16
1,bone,osteosarcoma,9
2,bone,chordoma,4
3,bone,chondrosarcoma,1


---

**breast** (40 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,breast,breast_carcinoma,19
1,breast,breast_ductal_carcinoma,19
2,breast,,1
3,breast,breast_adenocarcinoma,1


---

**central_nervous_system** (62 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,central_nervous_system,glioma,52
1,central_nervous_system,medulloblastoma,7
2,central_nervous_system,meningioma,2
3,central_nervous_system,PNET,1


---

**cervix** (14 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,cervix,cervical_carcinoma,5
1,cervix,cervical_squamous,5
2,cervix,cervical_adenocarcinoma,3
3,cervix,glassy_cell_carcinoma,1


---

**colorectal** (40 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,colorectal,colorectal_adenocarcinoma,40


---

**epidermoid_carcinoma** (1 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,epidermoid_carcinoma,skin_squamous,1


---

**esophagus** (25 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,esophagus,esophagus_squamous,20
1,esophagus,esophagus_adenocarcinoma,5


---

**eye** (7 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,eye,uveal_melanoma,5
1,eye,retinoblastoma,2


---

**fibroblast** (1 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,fibroblast,fibroblast_soft_tissue,1


---

**gastric** (28 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,gastric,gastric_adenocarcinoma,28


---

**kidney** (28 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,kidney,renal_cell_carcinoma,24
1,kidney,,2
2,kidney,malignant_rhabdoid_tumor,2


---

**liver** (22 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,liver,hepatocellular_carcinoma,20
1,liver,,1
2,liver,hepatoblastoma,1


---

**lung** (116 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,lung,NSCLC,83
1,lung,SCLC,19
2,lung,mesothelioma,13
3,lung,lung_carcinoid,1


---

**lymphocyte** (30 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,lymphocyte,non_hodgkin_lymphoma,20
1,lymphocyte,lymphoma_unspecified,5
2,lymphocyte,hodgkin_lymphoma,4
3,lymphocyte,ATL,1


---

**ovary** (47 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,ovary,ovary_adenocarcinoma,42
1,ovary,SCCOHT,2
2,ovary,brenner_tumor,1
3,ovary,mixed_germ_cell,1
4,ovary,ovary_carcinoma,1


---

**pancreas** (38 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,pancreas,exocrine,38


---

**peripheral_nervous_system** (21 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,peripheral_nervous_system,neuroblastoma,20
1,peripheral_nervous_system,PNET,1


---

**plasma_cell** (21 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,plasma_cell,multiple_myeloma,21


---

**prostate** (5 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,prostate,prostate_adenocarcinoma,4
1,prostate,prostate_hyperplasia,1


---

**skin** (65 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,skin,melanoma,57
1,skin,skin_squamous,4
2,skin,merkel_cell_carcinoma,3
3,skin,,1


---

**soft_tissue** (44 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,soft_tissue,rhabdomyosarcoma,10
1,soft_tissue,liposarcoma,8
2,soft_tissue,malignant_rhabdoid_tumor,8
3,soft_tissue,ATRT,5
4,soft_tissue,synovial_sarcoma,5
5,soft_tissue,epithelioid_sarcoma,2
6,soft_tissue,leiomyosarcoma,2
7,soft_tissue,fibrosarcoma,1
8,soft_tissue,pleomorphic_sarcoma,1
9,soft_tissue,thyroid_sarcoma,1


---

**thyroid** (11 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,thyroid,thyroid_carcinoma,9
1,thyroid,thyroid_squamous,2


---

**upper_aerodigestive** (46 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,upper_aerodigestive,upper_aerodigestive_squamous,46


---

**urinary_tract** (30 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,urinary_tract,bladder_carcinoma,30


---

**uterus** (31 cell lines)

Unnamed: 0,lineage,lineage_subtype,depmap_id
0,uterus,endometrial_adenocarcinoma,18
1,uterus,choriocarcinoma,3
2,uterus,endometrial_squamous,3
3,uterus,MMMT,2
4,uterus,clear_cell_carcinoma,1
5,uterus,endometrial_adenosquamous,1
6,uterus,endometrial_stromal_sarcoma,1
7,uterus,mullerian_carcinoma,1
8,uterus,uterine_carcinosarcoma,1


---

In [11]:
# Save the per-lineage subtype counts; `index=False` omits the row index column.
lineage_counts.to_csv(output_dir / "broad-lineage-subtype-counts.csv", index=False)

---

## Session info

In [12]:
# Report the time elapsed since `notebook_tic` was set in the setup cell,
# converted from seconds to minutes.
notebook_toc = time()
print(f"execution time: {(notebook_toc - notebook_tic) / 60:.2f} minutes")

execution time: 7.08 minutes


In [13]:
# Record the execution environment: last-updated date, Python/IPython versions,
# versions of imported packages, git branch, hostname, and machine details.
%load_ext watermark
%watermark -d -u -v -iv -b -h -m

Last updated: 2022-08-23

Python implementation: CPython
Python version       : 3.10.5
IPython version      : 8.4.0

Compiler    : GCC 10.3.0
OS          : Linux
Release     : 3.10.0-1160.45.1.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 28
Architecture: 64bit

Hostname: compute-e-16-231.o2.rc.hms.harvard.edu

Git branch: expand-lineages

dask      : 2022.7.1
matplotlib: 3.5.2
seaborn   : 0.11.2
pandas    : 1.4.3
numpy     : 1.23.1

