# Check of modeling data

In [None]:
from pathlib import Path

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.distributed import Client, progress

In [None]:
# Start a Dask distributed client for the computations below; the trailing
# bare name makes the notebook display the client as the cell output.
client = Client(
    n_workers=4,
    threads_per_worker=2,
    memory_limit="32GB",
)
client

Papermill parameters:

`DEPMAP_MODELING_DF`: The path to the full DepMap modeling data set (a CSV file).

In [None]:
# Papermill parameter: path to the modeling data CSV. The empty default is a
# placeholder that the validation in the next cell rejects.
DEPMAP_MODELING_DF: str = ""

In [None]:
# The parameter must be injected with a real path; an empty value means it was
# never set. Raise explicitly instead of using `assert` so the validation still
# runs when Python is started with `-O` (which strips assert statements).
if not DEPMAP_MODELING_DF:
    raise ValueError("No path provided for the modeling data.")

In [None]:
# Turn the parameter into a `Path` and stop early if nothing exists there,
# before any of the data-loading cells run.
depmap_modeling_df_path = Path(DEPMAP_MODELING_DF)

if not depmap_modeling_df_path.exists():
    missing_path_msg = f"Could not find '{str(depmap_modeling_df_path)}'"
    raise FileNotFoundError(missing_path_msg)

In [None]:
# Quick look at the first 200 rows using plain pandas (displayed as the cell output).
pd.read_csv(depmap_modeling_df_path, nrows=200, low_memory=False)

In [None]:
# Column dtypes given explicitly to the reader rather than left to inference.
column_dtypes = {
    "age": "float64",
    "p_dna_batch": "object",
    "primary_or_metastasis": "object",
    "counts_final": "float64",
    "counts_initial": "float64",
}

# Lazily load the full data set as a Dask data frame.
depmap_modeling_df = dd.read_csv(
    depmap_modeling_df_path, dtype=column_dtypes, low_memory=False
)

In [None]:
# Display the first rows of the Dask data frame as the cell output.
depmap_modeling_df.head()

In [None]:
# Display the column names of the loaded data frame.
depmap_modeling_df.columns

## Basic checks

In [None]:
# Running count of failed checks; each check below adds to it, and the final
# cell raises if it is greater than zero.
FAILED_CHECKS = 0

Check that specific columns exist (prevents some really bonehead discoveries later on...).

In [None]:
# Columns that must be present in the modeling data.
cols_that_should_exist = [
    "depmap_id",
    "sgrna",
    "hugo_symbol",
    "lfc",
    "screen",
    "num_mutations",
    "is_mutated",
    "lineage",
    "counts_final",
    "p_dna_batch",
    "primary_or_metastasis",
    # Required by the uniqueness check further down, which groups by it.
    "replicate_id",
]

# Build the set once so each membership test is O(1); the comprehension keeps
# the order of `cols_that_should_exist` for a stable report.
existing_cols = set(depmap_modeling_df.columns)
missing_cols = [col for col in cols_that_should_exist if col not in existing_cols]

if missing_cols:
    print(f"Some columns ({len(missing_cols)}) that should be present are not 😦")
    print("  missing columns: " + ", ".join(missing_cols))
    FAILED_CHECKS += 1

Check that specific columns have no missing (`NA`) values.

In [None]:
# Columns in which no missing values are tolerated.
cols_without_na = [
    "depmap_id",
    "sgrna",
    "hugo_symbol",
    "lfc",
    "screen",
    "num_mutations",
    "is_mutated",
    "lineage",
]

# Select the columns of interest *before* calling `isna()` so the NA mask is
# only built for them rather than for every column of the data set.
# The result is a boolean Series indexed by column name (True = has NA).
na_checks = depmap_modeling_df[cols_without_na].isna().any().compute()

# Convert the NumPy integer to a built-in int so `FAILED_CHECKS` stays a plain int.
num_missed_checks = int(na_checks.sum())

if num_missed_checks > 0:
    # Each offending column counts as one failed check.
    FAILED_CHECKS += num_missed_checks
    print(na_checks[na_checks])

In [None]:
# Display the per-column result of the missing-value check (True = column contains NA).
na_checks

Check that all combinations of cell line, sgRNA, and experimental replicate only appear once.

In [None]:
# Each (cell line, sgRNA, replicate) combination should appear in exactly one row.
grp_cols = ["depmap_id", "sgrna", "replicate_id"]

# Count the rows in every group and keep only the groups seen more than once.
ct_df = (
    depmap_modeling_df.assign(n=1)[grp_cols + ["n"]]
    .groupby(grp_cols)
    .count()
    .query("n > 1")
    .compute()
)

if len(ct_df) > 0:
    # Report what this check actually tests: duplicated combinations.
    print("Some cell line / sgRNA / replicate combinations appear more than once.")
    print(ct_df.head(20))
    FAILED_CHECKS += 1

In [None]:
# Stop the notebook run with an error if any check above recorded a failure.
any_failed = FAILED_CHECKS > 0
if any_failed:
    failure_summary = f"There were {FAILED_CHECKS} failed checks."
    raise Exception(failure_summary)

---

In [None]:
# Load the `watermark` IPython extension and print its report for this run.
%load_ext watermark
%watermark -d -u -v -iv -b -h -m