# Check of modeling data

In [1]:
from pathlib import Path
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotnine as gg
import seaborn as sns

In [2]:
import dask.dataframe as dd
from dask.distributed import Client, progress

# Start a local Dask cluster: 4 workers with 2 threads each, and a memory cap
# of 16GB per worker. The trailing expression displays the client summary.
client = Client(
    n_workers=4,
    threads_per_worker=2,
    memory_limit="16GB",
)
client

0,1
Client  Scheduler: tcp://127.0.0.1:36904  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 59.60 GiB


In [3]:
# Location of the modeling data CSV, relative to this notebook.
depmap_modeling_df_path = Path("../modeling_data/depmap_modeling_dataframe.csv")

# Fail fast with a clear message instead of letting the lazy Dask read fail later.
if not depmap_modeling_df_path.exists():
    missing_file = depmap_modeling_df_path.as_posix()
    raise FileNotFoundError(f"Could not find '{missing_file}'")

In [4]:
# Dask infers column dtypes from a sample at the start of the file, so columns
# whose type cannot be reliably determined from that sample are pinned here:
#   - "age": read as float64 (presumably because it can contain missing
#     values -- TODO confirm).
#   - "sgrna_target_chr": contains both numeric ("8") and non-numeric ("X")
#     chromosome names; reading it as `object` keeps every value a string so
#     that no partition is parsed as integers.
depmap_modeling_df = dd.read_csv(
    depmap_modeling_df_path,
    dtype={"age": "float64", "sgrna_target_chr": "object"},
    low_memory=False,
)

In [5]:
# Preview the first rows to confirm the file parses into the expected columns.
depmap_modeling_df.head()

Unnamed: 0,sgrna,replicate_id,lfc,p_dna_batch,genome_alignment,hugo_symbol,screen,multiple_hits_on_gene,sgrna_target_chr,sgrna_target_pos,...,num_mutations,any_deleterious,any_tcga_hotspot,any_cosmic_hotspot,is_mutated,copy_number,lineage,primary_or_metastasis,is_male,age
0,AAACCTGCGGCGGTCGCCA,OVR3_c905R1,-0.299958,CRISPR_C6596666.sample,chr8_66505451_-,VXN,sanger,True,8,66505451,...,0,,,,False,0.847995,ovary,metastasis,False,60.0
1,AACAGCACACCGGCCCCGT,OVR3_c905R1,0.267092,CRISPR_C6596666.sample,chrX_156009834_-,IL9R,sanger,True,X,156009834,...,0,,,,False,0.700605,ovary,metastasis,False,60.0
2,AACCTCCGGACTCCTCAGC,OVR3_c905R1,0.550477,CRISPR_C6596666.sample,chr7_39609658_-,YAE1,sanger,True,7,39609658,...,0,,,,False,0.934918,ovary,metastasis,False,60.0
3,AACTCAAACTGACGCCGAA,OVR3_c905R1,-0.391922,CRISPR_C6596666.sample,chr1_117623388_-,TENT5C,sanger,True,1,117623388,...,0,,,,False,1.352975,ovary,metastasis,False,60.0
4,AACTGACCTTGAAACGCTG,OVR3_c905R1,-1.562577,CRISPR_C6596666.sample,chr16_66933623_+,CIAO2B,sanger,True,16,66933623,...,0,,,,False,1.259171,ovary,metastasis,False,60.0


In [6]:
# On a Dask DataFrame the row count is lazy (displayed as a `Delayed` object
# until it is computed); only the number of columns is concrete here.
depmap_modeling_df.shape

(Delayed('int-adc0c546-7c6b-491a-97f1-4936de647e72'), 22)

In [7]:
# List every column name in the modeling data.
depmap_modeling_df.columns

Index(['sgrna', 'replicate_id', 'lfc', 'p_dna_batch', 'genome_alignment',
       'hugo_symbol', 'screen', 'multiple_hits_on_gene', 'sgrna_target_chr',
       'sgrna_target_pos', 'depmap_id', 'rna_expr', 'num_mutations',
       'any_deleterious', 'any_tcga_hotspot', 'any_cosmic_hotspot',
       'is_mutated', 'copy_number', 'lineage', 'primary_or_metastasis',
       'is_male', 'age'],
      dtype='object')

## Basic checks

In [8]:
# Running count of failed checks; the check cells below add to it and the
# final check cell raises if it is non-zero.
FAILED_CHECKS = 0

Check that specific columns have no missing (`NA`) values.

In [9]:
# Columns that must not contain any missing values.
cols_without_na = [
    "depmap_id",
    "sgrna",
    "hugo_symbol",
    "lfc",
    "screen",
    "num_mutations",
    "is_mutated",
    "lineage",
]

# Subset to the required columns *before* building the NA mask so that only
# these columns are scanned. The result is one boolean per column, where True
# means the column contains at least one missing value.
na_checks = depmap_modeling_df[cols_without_na].isna().any().compute()

# Cast to a plain `int` so that `FAILED_CHECKS` does not become a NumPy scalar.
num_missed_checks = int(na_checks.sum())

if num_missed_checks > 0:
    # Each column with missing values counts as one failed check.
    FAILED_CHECKS += num_missed_checks
    print(na_checks[na_checks])

In [10]:
# Per-column result of the NA check: True means the column has missing values.
na_checks

depmap_id        False
sgrna            False
hugo_symbol      False
lfc              False
screen           False
num_mutations    False
is_mutated       False
lineage          False
dtype: bool

Check that each combination of cell line, sgRNA, and experimental replicate appears only once.

In [11]:
grp_cols = ["depmap_id", "sgrna", "replicate_id"]

# Count the rows for each (cell line, sgRNA, replicate) combination and keep
# only the combinations that occur more than once. Only the grouping columns
# are carried into the groupby; the constant column `n` exists to be counted.
# NOTE(review): groupby drops rows with a missing key by default, and
# `replicate_id` is not among the columns checked for missing values --
# confirm that it can never be missing.
ct_df = (
    depmap_modeling_df[grp_cols]
    .assign(n=1)
    .groupby(grp_cols)
    .count()
    .query("n > 1")
    .compute()
)

if len(ct_df) > 0:
    print("Some cell line, sgRNA, and replicate combinations appear more than once.")
    print(ct_df.head(20))
    FAILED_CHECKS += 1

In [12]:
# Stop the notebook with an error if any of the checks above failed.
if FAILED_CHECKS > 0:
    failure_message = f"There were {FAILED_CHECKS} failed checks."
    raise Exception(failure_message)

---

In [13]:
# Record the environment used for this run: last-updated date (-d -u),
# Python/IPython versions (-v), versions of imported packages (-iv),
# git branch (-b), hostname (-h), and machine details (-m).
%load_ext watermark
%watermark -d -u -v -iv -b -h -m

Last updated: 2021-06-04

Python implementation: CPython
Python version       : 3.9.2
IPython version      : 7.24.1

Compiler    : GCC 9.3.0
OS          : Linux
Release     : 3.10.0-1062.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 28
Architecture: 64bit

Hostname: compute-e-16-189.o2.rc.hms.harvard.edu

Git branch: update-data

numpy     : 1.20.1
seaborn   : 0.11.1
dask      : 2021.5.1
plotnine  : 0.8.0
pandas    : 1.2.3
matplotlib: 3.3.4

