# Prepare data for modeling

In [1]:
raise "TODO: make faster!"

TypeError: exceptions must derive from BaseException

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import janitor
from pathlib import Path
import re
import plotnine as gg

In [2]:
gg.theme_set(gg.theme_minimal())

In [3]:
data_dir = Path("../data")
save_dir = Path("../modeling_data")

### Setup dask

In [4]:
import dask
import dask.dataframe as dd

In [5]:
from dask.distributed import Client, progress

client = Client(n_workers=2, threads_per_worker=2, memory_limit="20GB")
client

0,1
Client  Scheduler: tcp://127.0.0.1:41948  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 4  Memory: 40.00 GB


## Cell line information

In [6]:
def show_counts(df, col):
    if type(col) != list:
        col = [col]

    return (
        df[col + ["depmap_id"]]
        .drop_duplicates()
        .groupby(col)
        .count()
        .sort_values("depmap_id", ascending=False)
    )

In [7]:
sample_info = pd.read_csv(save_dir / "sample_info.csv")
show_counts(sample_info, "lineage")

Unnamed: 0_level_0,depmap_id
lineage,Unnamed: 1_level_1
lung,273
blood,132
skin,113
lymphocyte,109
central_nervous_system,107
colorectal,83
breast,82
upper_aerodigestive,76
bone,75
ovary,74


In [8]:
noncancerous_lineages = ["unknown", "embryo"]
engineered_lineages = sample_info[
    sample_info.lineage.str.contains("engineer")
].lineage.to_list()

ignore_lineages = engineered_lineages + noncancerous_lineages
sample_info = sample_info[~sample_info.lineage.isin(ignore_lineages)]

sample_info_columns = [
    "depmap_id",
    "primary_or_metastasis",
    "lineage",
    "lineage_subtype",
]
sample_info = sample_info[sample_info_columns].drop_duplicates()

## *KRAS* mutations

In [9]:
# Remove all cell lines with no mutation data.
all_samples_with_mutation_data = pd.read_csv(
    save_dir / "ccle_mutations.csv", low_memory=False
).depmap_id.unique()

sample_info = sample_info.pipe(
    lambda x: x[x.depmap_id.isin(all_samples_with_mutation_data)]
)

In [10]:
kras_mutations = pd.read_csv(save_dir / "kras_mutations.csv")
kras_mutations = kras_mutations[["depmap_id", "kras_mutation"]]

sample_info = sample_info.merge(kras_mutations, on="depmap_id", how="left").assign(
    kras_mutation=lambda x: x.kras_mutation.fillna("WT")
)

In [11]:
sample_info.head()

Unnamed: 0,depmap_id,primary_or_metastasis,lineage,lineage_subtype,kras_mutation
0,ACH-000001,Metastasis,ovary,ovary_adenocarcinoma,WT
1,ACH-000002,Primary,blood,AML,WT
2,ACH-000003,,colorectal,colorectal_adenocarcinoma,WT
3,ACH-000004,,blood,AML,WT
4,ACH-000005,,blood,AML,WT


## Screen data

In [12]:
achilles_lfc = dd.read_csv(save_dir / "achilles_logfold_change.csv").compute()

In [13]:
achilles_guide_map = dd.read_csv(save_dir / "achilles_guide_map.csv").compute()

In [14]:
modeling_data = pd.merge(
    left=achilles_lfc, right=sample_info, how="inner", on=["depmap_id"]
)

modeling_data = pd.merge(
    left=modeling_data, right=achilles_guide_map, how="inner", on=["sgrna"]
)

modeling_data.head()

Unnamed: 0,sgrna,replicate_id,lfc,depmap_id,pdna_batch,passes_qc,primary_or_metastasis,lineage,lineage_subtype,kras_mutation,genome_alignment,n_alignments,hugo_symbol
0,AAAAAAATCCAGCAATGCAG,143b-311cas9_repa_p6_batch3,0.289694,ACH-001001,3,True,Primary,bone,osteosarcoma,G12S,chr10_110964620_+,1,SHOC2
1,AAAAAAATCCAGCAATGCAG,2313287-311cas9_repa_p5_batch3,0.171917,ACH-000948,3,True,Primary,gastric,gastric_adenocarcinoma,WT,chr10_110964620_+,1,SHOC2
2,AAAAAAATCCAGCAATGCAG,2313287-311cas9_repb_p5_batch3,-0.522717,ACH-000948,3,True,Primary,gastric,gastric_adenocarcinoma,WT,chr10_110964620_+,1,SHOC2
3,AAAAAAATCCAGCAATGCAG,253j-311cas9_repa_p5_batch3,-0.21169,ACH-000011,3,True,Metastasis,urinary_tract,bladder_carcinoma,WT,chr10_110964620_+,1,SHOC2
4,AAAAAAATCCAGCAATGCAG,42-mg-ba-311cas9_repa_p6_batch3,-1.067942,ACH-000323,3,True,Primary,central_nervous_system,glioma,WT,chr10_110964620_+,1,SHOC2


## Copy number at each guide target

In [None]:
def get_segment_mean(cn_data, depmap_id, chromosome, pos):
    d = cn_data[(cn_data.depmap_id == depmap_id) & (cn_data.chromosome == chromosome)]
    d = d[(d.start <= pos) & (pos <= d.end)]
    if len(d) == 0:
        return None
    elif len(d) > 1:
        raise Exception(f"Data contains more than 1 row of data: {len(d)}")
    else:
        return d.segment_mean.to_list()[0]


def segmentmean_to_copynumber(seg_mean):
    if seg_mean == None:
        return None
    return 2.0 ** seg_mean


def parse_genome_location(gloc):
    d = gloc.split("_")
    d[0] = d[0].replace("chr", "")
    return (d[0], int(d[1]))


cn_segments = dd.read_csv(save_dir / "ccle_semgent_cn.csv").compute()

for i in range(0, len(modeling_data)):
    genome_loc = modeling_data.loc[i, "genome_alignment"]
    depmap_id = modeling_data.loc[i, "depmap_id"]
    g_chr, g_pos = parse_genome_location(genome_loc)

    seg_mean = get_segment_mean(
        cn_data=cn_segments, depmap_id=depmap_id, chromosome=g_chr, pos=g_pos
    )

    modeling_data.at[i, "chromosome"] = g_chr
    modeling_data.at[i, "chr_position"] = g_pos
    modeling_data.at[i, "copy_number"] = segmentmean_to_copynumber(seg_mean)

modeling_data.head(10)

In [None]:
eg_cellline = "ACH-001001"

chromosome_order = [str(a) for a in range(1, 23)] + ["X"]

d = modeling_data[modeling_data.depmap_id == eg_cellline]
d["chromosome"] = pd.Categorical(d.chromosome, categories=chromosome_order)

gg.options.set_option("figure_size", (12, 8))
gg.options.set_option("dpi", 400)

(
    gg.ggplot(d, gg.aes("chr_position", "copy_number", color="chromosome"))
    + gg.facet_wrap("chromosome", ncol=4, scales="free")
    + gg.geom_line(alpha=0.5, size=0.5)
    + gg.geom_point(alpha=0.5, size=0.5)
    + gg.theme_minimal()
    + gg.theme(
        axis_text_x=gg.element_blank(),
        legend_position="none",
        subplots_adjust={"hspace": 0.25, "wspace": 0.25},
        axis_text_y=gg.element_text(hjust=2),
    )
    + gg.labs(
        x="chromosome position",
        y="copy number",
        title=f"Example of copy number variation for a single cell line: {eg_cellline}",
    )
)

## Mutation data of each gene

In [None]:
full_mutations_df = pd.read_csv(
    save_dir / "ccle_mutations.csv", dtype={"chromosome": str}, low_memory=False
)

mutations_data_columns = [
    "depmap_id",
    "hugo_symbol",
    "variant_classification",
    "variant_type",
    "isdeleterious",
    "istcgahotspot",
    "iscosmichotspot",
]


def mod_variant_classifications(d):
    d.variant_classification = d.variant_classification.fillna("unknown")
    d.variant_classification = d.variant_classification.astype("str")
    d.variant_classification = [a.lower() for a in d.variant_classification]
    return d


mutations_df = (
    full_mutations_df[mutations_data_columns]
    .rename(
        columns={
            "isdeleterious": "is_deleterious",
            "istcgahotspot": "is_tcga_hotspot",
            "iscosmichotspot": "is_cosmic_hotspot",
        }
    )
    .pipe(mod_variant_classifications)
)


def any_mutations_true(mut_data, col_name):
    return [any([a[col_name] for a in m]) for m in mut_data]


mutations_df = (
    pd.DataFrame(
        mutations_df.set_index(["depmap_id", "hugo_symbol"])
        .apply(lambda d: d.to_dict(), axis=1)
        .groupby(["depmap_id", "hugo_symbol"])
        .agg(mutation_data=lambda x: list(x))
    )
    .assign(n_muts=lambda d: [len(a) for a in d.mutation_data])
    .reset_index()
    .assign(
        any_deleterious=lambda df: any_mutations_true(
            df.mutation_data, "is_deleterious"
        ),
        any_tcga_hotspot=lambda df: any_mutations_true(
            df.mutation_data, "is_tcga_hotspot"
        ),
        any_cosmic_hotspot=lambda df: any_mutations_true(
            df.mutation_data, "is_cosmic_hotspot"
        ),
    )
)

mutations_df.head()

Check that there is one row per `depmap_id` x `hugo_symbol` pair.
An error will be raised if this is not true.

In [None]:
x = (
    mutations_df.groupby(["depmap_id", "hugo_symbol"])
    .count()
    .query("mutation_data > 1")
)
if x.shape[0] > 0:
    raise "More than one row per cell line x gene pair"

In [None]:
modeling_data = pd.merge(
    left=modeling_data, right=mutations_df, how="left", on=["depmap_id", "hugo_symbol"]
).fillna(
    value={
        "n_muts": 0,
        "any_deleterious": False,
        "any_tcga_hotspot": False,
        "any_cosmic_hotspot": False,
    }
)

In [None]:
modeling_data.head()

## A column to indicate if there is a mutation at the guide target location

In [None]:
modeling_data["mutated_at_target"] = False

for i in range(modeling_data.shape[0]):
    g_chr = modeling_data.chromosome[i]
    g_pos = modeling_data.chr_position[i]
    g_depmapid = modeling_data.depmap_id[i]

    mut_d = (
        full_mutations_df.query(f"depmap_id == '{g_depmapid}'")
        .query(f"chromosome == '{g_chr}'")
        .query(f"start_position <= {g_pos} <= end_position")
    )

    if mut_d.shape[0] >= 1:
        modeling_data.loc[i, "mutated_at_target"] = True

In [None]:
modeling_data.groupby("mutated_at_target").count()

## RNA expression of the target gene

In [None]:
rna_df = (
    dd.read_csv(save_dir / "ccle_expression.csv")
    .rename(columns={"dep_map_id": "depmap_id"})
    .compute()
)
rna_df.head()

In [None]:
modeling_data = pd.merge(
    left=modeling_data, right=rna_df, how="left", on=["depmap_id", "hugo_symbol"]
)

---

## Final data frame

In [None]:
modeling_data.head()

In [None]:
modeling_data.to_csv(save_dir / "depmap_modeling_dataframe.csv")