# Check of modeling data

In [1]:
from pathlib import Path

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.distributed import Client, progress

In [2]:
# Start a local Dask cluster for the checks below. `memory_limit` applies to
# each worker separately, so the four workers together may use ~128 GB.
client = Client(
    n_workers=4,
    threads_per_worker=2,
    memory_limit="32GB",
)
# Show the cluster summary (dashboard link, workers, threads, memory).
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 119.21 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:39738,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 119.21 GiB

0,1
Comm: tcp://127.0.0.1:36028,Total threads: 2
Dashboard: http://127.0.0.1:37602/status,Memory: 29.80 GiB
Nanny: tcp://127.0.0.1:46828,
Local directory: /n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-yh7xum9x,Local directory: /n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-yh7xum9x

0,1
Comm: tcp://127.0.0.1:39969,Total threads: 2
Dashboard: http://127.0.0.1:34959/status,Memory: 29.80 GiB
Nanny: tcp://127.0.0.1:42595,
Local directory: /n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-_oq0d05k,Local directory: /n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-_oq0d05k

0,1
Comm: tcp://127.0.0.1:34284,Total threads: 2
Dashboard: http://127.0.0.1:33028/status,Memory: 29.80 GiB
Nanny: tcp://127.0.0.1:38905,
Local directory: /n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-45idy0hx,Local directory: /n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-45idy0hx

0,1
Comm: tcp://127.0.0.1:38011,Total threads: 2
Dashboard: http://127.0.0.1:35466/status,Memory: 29.80 GiB
Nanny: tcp://127.0.0.1:33823,
Local directory: /n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-t03_79ea,Local directory: /n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-t03_79ea


Papermill parameters:

`DEPMAP_MODELING_DF`: The path to the full DepMap modeling data set (a CSV file).

In [3]:
# Default value of the papermill parameter. It is deliberately empty so that a
# run without a supplied path is caught by the check further down.
DEPMAP_MODELING_DF: str = ""

In [4]:
# Parameters
# NOTE(review): presumably the cell papermill injected for this run; it
# overrides the empty default assigned above. Confirm before editing by hand.
DEPMAP_MODELING_DF = "../modeling_data/depmap-modeling-data.csv"

In [5]:
# Stop immediately when the parameter still holds its empty default, i.e. no
# path to the modeling data was supplied.
if DEPMAP_MODELING_DF == "":
    raise AssertionError("No path provided for the modeling data.")

In [6]:
# Path object for the modeling data; used by every reader below.
depmap_modeling_df_path = Path(DEPMAP_MODELING_DF)

# Require a regular file (not merely an existing path): a directory would pass
# an `exists()` test and only fail later inside the CSV readers.
if not depmap_modeling_df_path.is_file():
    raise FileNotFoundError(f"Could not find '{depmap_modeling_df_path}'")

In [7]:
# Preview the first 200 rows with plain pandas before handing the file to Dask.
pd.read_csv(depmap_modeling_df_path, low_memory=False, nrows=200)

Unnamed: 0,sgrna,replicate_id,lfc,p_dna_batch,genome_alignment,hugo_symbol,screen,multiple_hits_on_gene,sgrna_target_chr,sgrna_target_pos,...,any_deleterious,any_tcga_hotspot,any_cosmic_hotspot,is_mutated,copy_number,lineage,lineage_subtype,primary_or_metastasis,is_male,age
0,AAACCTGCGGCGGTCGCCA,OVR3_c905R1,-0.299958,CRISPR_C6596666.sample,chr8_66505451_-,VXN,sanger,True,8,66505451,...,,,,False,1.139595,ovary,ovary_adenocarcinoma,metastasis,False,60
1,AACAGCACACCGGCCCCGT,OVR3_c905R1,0.267092,CRISPR_C6596666.sample,chrX_156009834_-,IL9R,sanger,True,X,156009834,...,,,,False,0.656377,ovary,ovary_adenocarcinoma,metastasis,False,60
2,AACCTCCGGACTCCTCAGC,OVR3_c905R1,0.550477,CRISPR_C6596666.sample,chr7_39609658_-,YAE1,sanger,True,7,39609658,...,,,,False,0.923715,ovary,ovary_adenocarcinoma,metastasis,False,60
3,AACTCAAACTGACGCCGAA,OVR3_c905R1,-0.391922,CRISPR_C6596666.sample,chr1_117623388_-,TENT5C,sanger,True,1,117623388,...,,,,False,1.352975,ovary,ovary_adenocarcinoma,metastasis,False,60
4,AACTGACCTTGAAACGCTG,OVR3_c905R1,-1.562577,CRISPR_C6596666.sample,chr16_66933623_+,CIAO2B,sanger,True,16,66933623,...,,,,False,1.157211,ovary,ovary_adenocarcinoma,metastasis,False,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,TGAGCTGGCAATGCTAGAT,OVR3_c905R1,0.565344,CRISPR_C6596666.sample,chrX_155774116_-,SPRY3,sanger,True,X,155774116,...,,,,False,0.656377,ovary,ovary_adenocarcinoma,metastasis,False,60
196,TGATGGAGCGAATCAGATG,OVR3_c905R1,-0.204959,CRISPR_C6596666.sample,chr16_66934065_+,CIAO2B,sanger,True,16,66934065,...,,,,False,1.157211,ovary,ovary_adenocarcinoma,metastasis,False,60
197,TGCACTTATGTGTGCCGCC,OVR3_c905R1,0.650650,CRISPR_C6596666.sample,chrX_156003692_-,IL9R,sanger,True,X,156003692,...,,,,False,0.656377,ovary,ovary_adenocarcinoma,metastasis,False,60
198,TGCTAGGACCCAACTGAGC,OVR3_c905R1,-0.517796,CRISPR_C6596666.sample,chr10_46580364_+,SYT15,sanger,True,10,46580364,...,,,,False,0.752471,ovary,ovary_adenocarcinoma,metastasis,False,60


In [8]:
# Lazily read the full data set with Dask. The dtypes of a few columns are
# pinned explicitly instead of being inferred (e.g. `age` is read as a float
# here, whereas the 200-row pandas preview shows it as an integer).
depmap_modeling_df = dd.read_csv(
    depmap_modeling_df_path,
    low_memory=False,
    dtype=dict(
        # Numeric columns forced to floating point.
        age="float64",
        counts_final="float64",
        counts_initial="float64",
        # Columns kept as generic Python objects.
        p_dna_batch="object",
        primary_or_metastasis="object",
    ),
)

In [9]:
# Show the first rows of the Dask data frame to confirm the file parses with
# the dtypes requested in the previous cell.
depmap_modeling_df.head()

Unnamed: 0,sgrna,replicate_id,lfc,p_dna_batch,genome_alignment,hugo_symbol,screen,multiple_hits_on_gene,sgrna_target_chr,sgrna_target_pos,...,any_deleterious,any_tcga_hotspot,any_cosmic_hotspot,is_mutated,copy_number,lineage,lineage_subtype,primary_or_metastasis,is_male,age
0,AAACCTGCGGCGGTCGCCA,OVR3_c905R1,-0.299958,CRISPR_C6596666.sample,chr8_66505451_-,VXN,sanger,True,8,66505451,...,,,,False,1.139595,ovary,ovary_adenocarcinoma,metastasis,False,60.0
1,AACAGCACACCGGCCCCGT,OVR3_c905R1,0.267092,CRISPR_C6596666.sample,chrX_156009834_-,IL9R,sanger,True,X,156009834,...,,,,False,0.656377,ovary,ovary_adenocarcinoma,metastasis,False,60.0
2,AACCTCCGGACTCCTCAGC,OVR3_c905R1,0.550477,CRISPR_C6596666.sample,chr7_39609658_-,YAE1,sanger,True,7,39609658,...,,,,False,0.923715,ovary,ovary_adenocarcinoma,metastasis,False,60.0
3,AACTCAAACTGACGCCGAA,OVR3_c905R1,-0.391922,CRISPR_C6596666.sample,chr1_117623388_-,TENT5C,sanger,True,1,117623388,...,,,,False,1.352975,ovary,ovary_adenocarcinoma,metastasis,False,60.0
4,AACTGACCTTGAAACGCTG,OVR3_c905R1,-1.562577,CRISPR_C6596666.sample,chr16_66933623_+,CIAO2B,sanger,True,16,66933623,...,,,,False,1.157211,ovary,ovary_adenocarcinoma,metastasis,False,60.0


In [10]:
# List the column names available in the Dask data frame.
depmap_modeling_df.columns

Index(['sgrna', 'replicate_id', 'lfc', 'p_dna_batch', 'genome_alignment',
       'hugo_symbol', 'screen', 'multiple_hits_on_gene', 'sgrna_target_chr',
       'sgrna_target_pos', 'depmap_id', 'counts_final', 'counts_initial',
       'rna_expr', 'num_mutations', 'any_deleterious', 'any_tcga_hotspot',
       'any_cosmic_hotspot', 'is_mutated', 'copy_number', 'lineage',
       'lineage_subtype', 'primary_or_metastasis', 'is_male', 'age'],
      dtype='object')

## Basic checks

In [11]:
# Running tally of failed checks. Each check below adds to it, and the notebook
# raises at the end if it is greater than zero.
FAILED_CHECKS = 0

Check that specific columns exist (prevents some really bonehead discoveries later on...).

In [12]:
# Columns the rest of the pipeline relies on being present.
cols_that_should_exist = [
    "depmap_id",
    "sgrna",
    "hugo_symbol",
    "lfc",
    "screen",
    "num_mutations",
    "is_mutated",
    "lineage",
    "counts_final",
    "p_dna_batch",
    "primary_or_metastasis",
]

# Required columns absent from the data frame, in the order listed above.
missing_cols = [
    col
    for col in cols_that_should_exist
    if col not in depmap_modeling_df.columns
]

# A non-empty list means at least one required column is absent; report the
# names and record a single failed check.
if missing_cols:
    print(f"Some columns ({len(missing_cols)}) that should be present are not 😦")
    print("  missing columns: " + ", ".join(missing_cols))
    FAILED_CHECKS += 1

Check that specific columns have no missing (`NA`) values.

In [13]:
# Columns that are expected to contain no missing values at all.
cols_without_na = [
    "depmap_id",
    "sgrna",
    "hugo_symbol",
    "lfc",
    "screen",
    "num_mutations",
    "is_mutated",
    "lineage",
]

# Select the columns of interest *before* building the NA mask so that `isna()`
# is only evaluated for these columns rather than for every column of the data
# set. The result holds one boolean per column: True if that column contains at
# least one missing value.
na_checks = depmap_modeling_df[cols_without_na].isna().any().compute()
# Convert to a built-in int so FAILED_CHECKS does not turn into a NumPy scalar.
num_missed_checks = int(na_checks.sum())

if num_missed_checks > 0:
    # Every column with missing values counts as one failed check.
    FAILED_CHECKS += num_missed_checks
    # Print only the offending columns.
    print(na_checks[na_checks])

In [14]:
# Display the per-column result: True marks a column with at least one NA.
na_checks

depmap_id        False
sgrna            False
hugo_symbol      False
lfc              False
screen           False
num_mutations    False
is_mutated       False
lineage          False
dtype: bool

Check that all combinations of cell line, sgRNA, and experimental replicate only appear once.

In [15]:
# Each (cell line, sgRNA, replicate) combination should occur in exactly one
# row of the data set.
grp_cols = ["depmap_id", "sgrna", "replicate_id"]
# Keep only the key columns before adding the counter column so the remaining
# columns are not carried through the group-by.
# NOTE(review): group-by presumably drops rows whose key is NA by default, and
# `replicate_id` is not among the columns checked for NA earlier — confirm.
ct_df = (
    depmap_modeling_df[grp_cols]
    .assign(n=1)
    .groupby(grp_cols)
    .count()
    .query("n > 1")
    .compute()
)

# Any remaining row is a combination that appears more than once.
if len(ct_df) > 0:
    print("Some combinations of cell line, sgRNA, and replicate appear more than once.")
    print(ct_df.head(20))
    FAILED_CHECKS += 1

In [16]:
# Fail the notebook run if any of the checks above recorded a failure. Note
# that the tally mixes units: the NA check adds one per offending column, the
# other checks add one each.
if FAILED_CHECKS > 0:
    raise Exception(f"There were {FAILED_CHECKS} failed checks.")

---

In [17]:
# Record the execution environment (date, Python/IPython versions, machine
# details, hostname, git branch, and versions of the imported packages).
%load_ext watermark
%watermark -d -u -v -iv -b -h -m

Last updated: 2022-05-15

Python implementation: CPython
Python version       : 3.10.4
IPython version      : 8.3.0

Compiler    : GCC 10.3.0
OS          : Linux
Release     : 3.10.0-1160.45.1.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 28
Architecture: 64bit

Hostname: compute-e-16-229.o2.rc.hms.harvard.edu

Git branch: crc-panc-eso

numpy : 1.22.3
dask  : 2022.5.0
pandas: 1.4.2

