# Check of modeling data

In [1]:
from pathlib import Path
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotnine as gg
import seaborn as sns

In [2]:
import dask.dataframe as dd
from dask.distributed import Client, progress

# Start a local Dask cluster for the out-of-core checks below; ending the cell
# with `client` displays the cluster summary.
client = Client(
    n_workers=4,
    threads_per_worker=2,
    memory_limit="16GB",
)
client

0,1
Client  Scheduler: tcp://127.0.0.1:40063  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 59.60 GiB


In [3]:
# Path to the modeling dataframe; stop right away if the file is not there
# so that later cells do not fail with a less obvious error.
depmap_modeling_df_path = Path("../modeling_data/depmap_modeling_dataframe.csv")
csv_is_missing = not depmap_modeling_df_path.exists()
if csv_is_missing:
    raise FileNotFoundError(f"Could not find '{depmap_modeling_df_path.as_posix()}'")

In [4]:
# Preview the first 200 rows with pandas before loading the full file with Dask.
pd.read_csv(depmap_modeling_df_path, low_memory=False, nrows=200)

Unnamed: 0,sgrna,replicate_id,lfc,p_dna_batch,genome_alignment,hugo_symbol,screen,multiple_hits_on_gene,sgrna_target_chr,sgrna_target_pos,...,num_mutations,any_deleterious,any_tcga_hotspot,any_cosmic_hotspot,is_mutated,copy_number,lineage,primary_or_metastasis,is_male,age
0,AAAGCCCAGGAGTATGGGAG,HEL-311Cas9_RepA_p4_batch3,0.858827,3,chr2_130522105_-,CFC1B,broad,True,2,130522105,...,0,,,,False,1.109223,blood,,True,30
1,AAATCAGAGAAACCTGAACG,HEL-311Cas9_RepA_p4_batch3,-0.397664,3,chr11_89916950_-,TRIM49D1,broad,True,11,89916950,...,0,,,,False,1.155134,blood,,True,30
2,AACGTCTTTGAAGAAAGCTG,HEL-311Cas9_RepA_p4_batch3,0.102909,3,chr5_71055421_-,GTF2H2,broad,True,5,71055421,...,0,,,,False,0.757424,blood,,True,30
3,AACGTCTTTGAAGGAAGCTG,HEL-311Cas9_RepA_p4_batch3,-0.434218,3,chr5_69572480_+,GTF2H2C,broad,True,5,69572480,...,0,,,,False,0.757424,blood,,True,30
4,AAGAGGTTCCAGACTACTTA,HEL-311Cas9_RepA_p4_batch3,0.590026,3,chrX_155898173_+,VAMP7,broad,True,X,155898173,...,0,,,,False,0.345761,blood,,True,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,TGCTGGTGTGAATAAACAGT,HEL-311Cas9_RepA_p4_batch3,0.294152,3,chr5_79619732_-,TENT2,broad,True,5,79619732,...,0,,,,False,0.757424,blood,,True,30
196,TGGCCTTAGGAAGCAGTGCG,HEL-311Cas9_RepA_p4_batch3,0.110017,3,chr18_62187703_+,RELCH,broad,True,18,62187703,...,0,,,,False,1.064841,blood,,True,30
197,TGGCGAAGATGTAGACGGCG,HEL-311Cas9_RepA_p4_batch3,-0.013850,3,chr22_23961084_+,GSTT2B,broad,True,22,23961084,...,0,,,,False,0.794994,blood,,True,30
198,TGGCTGGTGTTCAGGATCCA,HEL-311Cas9_RepA_p4_batch3,-0.038889,3,chr3_50297333_+,NAA80,broad,True,3,50297333,...,0,,,,False,1.154458,blood,,True,30


In [5]:
# Explicit dtypes for a handful of columns.
# NOTE(review): presumably these are the columns whose dtype cannot be inferred
# reliably from a sample of the file (missing or mixed values) - confirm.
csv_dtype_overrides = {
    "age": "float64",
    "p_dna_batch": "object",
    "primary_or_metastasis": "object",
    "counts_final": "float64",
}

# Lazily load the full modeling dataframe with Dask.
depmap_modeling_df = dd.read_csv(
    depmap_modeling_df_path, dtype=csv_dtype_overrides, low_memory=False
)

In [6]:
# Preview the first rows of the Dask dataframe.
depmap_modeling_df.head()

Unnamed: 0,sgrna,replicate_id,lfc,p_dna_batch,genome_alignment,hugo_symbol,screen,multiple_hits_on_gene,sgrna_target_chr,sgrna_target_pos,...,num_mutations,any_deleterious,any_tcga_hotspot,any_cosmic_hotspot,is_mutated,copy_number,lineage,primary_or_metastasis,is_male,age
0,AAAGCCCAGGAGTATGGGAG,HEL-311Cas9_RepA_p4_batch3,0.858827,3,chr2_130522105_-,CFC1B,broad,True,2,130522105,...,0,,,,False,1.109223,blood,,True,30.0
1,AAATCAGAGAAACCTGAACG,HEL-311Cas9_RepA_p4_batch3,-0.397664,3,chr11_89916950_-,TRIM49D1,broad,True,11,89916950,...,0,,,,False,1.155134,blood,,True,30.0
2,AACGTCTTTGAAGAAAGCTG,HEL-311Cas9_RepA_p4_batch3,0.102909,3,chr5_71055421_-,GTF2H2,broad,True,5,71055421,...,0,,,,False,0.757424,blood,,True,30.0
3,AACGTCTTTGAAGGAAGCTG,HEL-311Cas9_RepA_p4_batch3,-0.434218,3,chr5_69572480_+,GTF2H2C,broad,True,5,69572480,...,0,,,,False,0.757424,blood,,True,30.0
4,AAGAGGTTCCAGACTACTTA,HEL-311Cas9_RepA_p4_batch3,0.590026,3,chrX_155898173_+,VAMP7,broad,True,X,155898173,...,0,,,,False,0.345761,blood,,True,30.0


In [7]:
# List the column names available in the Dask dataframe.
depmap_modeling_df.columns

Index(['sgrna', 'replicate_id', 'lfc', 'p_dna_batch', 'genome_alignment',
       'hugo_symbol', 'screen', 'multiple_hits_on_gene', 'sgrna_target_chr',
       'sgrna_target_pos', 'depmap_id', 'counts_final', 'counts_initial',
       'rna_expr', 'num_mutations', 'any_deleterious', 'any_tcga_hotspot',
       'any_cosmic_hotspot', 'is_mutated', 'copy_number', 'lineage',
       'primary_or_metastasis', 'is_male', 'age'],
      dtype='object')

## Basic checks

In [8]:
# Running tally of failed checks. Each check below increments it instead of
# raising immediately, and the final check cell raises if it is greater than 0.
FAILED_CHECKS = 0

Check that specific columns exist (prevents some really bonehead discoveries later on...).

In [9]:
# Columns that the rest of this notebook (and downstream modeling) relies on.
cols_that_should_exist = [
    "depmap_id",
    "sgrna",
    "hugo_symbol",
    "lfc",
    "screen",
    "num_mutations",
    "is_mutated",
    "lineage",
    "counts_final",
    "p_dna_batch",
    "primary_or_metastasis",
    # Needed by the uniqueness check on (depmap_id, sgrna, replicate_id) below;
    # listing it here reports a missing column as a failed check instead of
    # letting that later cell crash with a KeyError.
    "replicate_id",
]

# Build the lookup once; the list comprehension keeps the declared order so the
# printed report is stable.
available_cols = set(depmap_modeling_df.columns)
missing_cols = [col for col in cols_that_should_exist if col not in available_cols]

if missing_cols:
    print(f"Some columns ({len(missing_cols)}) that should be present are not 😦")
    print("  missing columns: " + ", ".join(missing_cols))
    FAILED_CHECKS += 1

Check that specific columns have no missing (`NA`) values.

In [10]:
# Columns that must not contain any missing values.
cols_without_na = [
    "depmap_id",
    "sgrna",
    "hugo_symbol",
    "lfc",
    "screen",
    "num_mutations",
    "is_mutated",
    "lineage",
]

# Select the columns first so the NA mask is only built for the columns being
# checked rather than for the whole frame. The result is a boolean Series
# indexed by column name (True = the column has at least one NA).
na_checks = depmap_modeling_df[cols_without_na].isna().any().compute()
# Cast to a built-in int so that FAILED_CHECKS does not become a numpy integer.
num_missed_checks = int(na_checks.sum())

if num_missed_checks > 0:
    # Each offending column counts as one failed check.
    FAILED_CHECKS += num_missed_checks
    print(f"Some columns ({num_missed_checks}) contain missing values but should not.")
    print(na_checks[na_checks])

In [11]:
# Show the per-column result of the NA check (True means the column has missing values).
na_checks

depmap_id        False
sgrna            False
hugo_symbol      False
lfc              False
screen           False
num_mutations    False
is_mutated       False
lineage          False
dtype: bool

Check that each combination of cell line, sgRNA, and experimental replicate appears only once.

In [12]:
# Every (cell line, sgRNA, replicate) combination should identify exactly one row.
grp_cols = ["depmap_id", "sgrna", "replicate_id"]

# Count the rows per combination and keep only the combinations seen more than
# once; only the grouping columns are carried into the groupby.
ct_df = (
    depmap_modeling_df[grp_cols]
    .assign(n=1)
    .groupby(grp_cols)
    .count()
    .query("n > 1")
    .compute()
)

num_duplicated = len(ct_df)
if num_duplicated > 0:
    # The message describes the check actually performed here (duplicated
    # rows), not sgRNAs with multiple targets.
    print(
        f"Some combinations ({num_duplicated}) of cell line, sgRNA, and replicate "
        "appear more than once."
    )
    print(ct_df.head(20))
    FAILED_CHECKS += 1



In [13]:
# Fail the notebook if any of the checks above incremented the tally.
if FAILED_CHECKS > 0:
    failure_msg = f"There were {FAILED_CHECKS} failed checks."
    raise Exception(failure_msg)

---

In [14]:
# Record the run environment for reproducibility: last-updated date,
# Python/IPython versions, machine and host information, git branch, and the
# versions of the imported packages.
%load_ext watermark
%watermark -d -u -v -iv -b -h -m

Last updated: 2021-09-28

Python implementation: CPython
Python version       : 3.9.2
IPython version      : 7.27.0

Compiler    : GCC 9.3.0
OS          : Linux
Release     : 3.10.0-1062.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 28
Architecture: 64bit

Hostname: compute-e-16-190.o2.rc.hms.harvard.edu

Git branch: update-data

seaborn   : 0.11.2
plotnine  : 0.8.0
pandas    : 1.2.3
numpy     : 1.20.1
dask      : 2021.5.1
matplotlib: 3.3.4

