In [1]:
from pathlib import Path
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotnine as gg
import seaborn as sns

In [2]:
# Location of the DepMap modeling table, relative to this notebook's directory.
depmap_modeling_df_path = Path("../modeling_data/depmap_modeling_dataframe.csv")
# Fail fast with a clear message before attempting to load the data.
# `is_file()` (rather than `exists()`) also rejects a directory at this path,
# which could not be read as a CSV anyway.
if not depmap_modeling_df_path.is_file():
    raise FileNotFoundError(f"Could not find '{depmap_modeling_df_path.as_posix()}'")

In [3]:
# Read only the first 1,000,000 rows of the modeling table, so every check in
# this notebook covers that sample rather than the full file.
# `nrows` is documented as an int, so an integer literal is passed instead of
# the float `1e6`. `low_memory=False` disables chunked parsing, which avoids
# mixed-type inference within a column.
depmap_modeling_df = pd.read_csv(
    depmap_modeling_df_path, nrows=1_000_000, low_memory=False
)

In [4]:
# Preview the first five rows of the loaded table.
depmap_modeling_df.head(n=5)

Unnamed: 0,sgrna,replicate_id,lfc,p_dna_batch,genome_alignment,hugo_symbol,screen,multiple_hits_on_gene,sgrna_target_chr,sgrna_target_pos,...,num_mutations,any_deleterious,any_tcga_hotspot,any_cosmic_hotspot,is_mutated,copy_number,lineage,primary_or_metastasis,is_male,age
0,AAACCTGCGGCGGTCGCCA,OVR3_c905R1,-0.299958,CRISPR_C6596666.sample,chr8_66505451_-,VXN,sanger,True,8,66505451,...,0,,,,False,0.847995,ovary,metastasis,False,60
1,AACAGCACACCGGCCCCGT,OVR3_c905R1,0.267092,CRISPR_C6596666.sample,chrX_156009834_-,IL9R,sanger,True,X,156009834,...,0,,,,False,0.700605,ovary,metastasis,False,60
2,AACCTCCGGACTCCTCAGC,OVR3_c905R1,0.550477,CRISPR_C6596666.sample,chr7_39609658_-,YAE1,sanger,True,7,39609658,...,0,,,,False,0.934918,ovary,metastasis,False,60
3,AACTCAAACTGACGCCGAA,OVR3_c905R1,-0.391922,CRISPR_C6596666.sample,chr1_117623388_-,TENT5C,sanger,True,1,117623388,...,0,,,,False,1.352975,ovary,metastasis,False,60
4,AACTGACCTTGAAACGCTG,OVR3_c905R1,-1.562577,CRISPR_C6596666.sample,chr16_66933623_+,CIAO2B,sanger,True,16,66933623,...,0,,,,False,1.259171,ovary,metastasis,False,60


In [5]:
# Dimensions of the loaded table as (rows, columns).
depmap_modeling_df.shape

(1000000, 22)

In [6]:
# List every column name (the `head()` preview elides the middle columns).
depmap_modeling_df.columns

Index(['sgrna', 'replicate_id', 'lfc', 'p_dna_batch', 'genome_alignment',
       'hugo_symbol', 'screen', 'multiple_hits_on_gene', 'sgrna_target_chr',
       'sgrna_target_pos', 'depmap_id', 'rna_expr', 'num_mutations',
       'any_deleterious', 'any_tcga_hotspot', 'any_cosmic_hotspot',
       'is_mutated', 'copy_number', 'lineage', 'primary_or_metastasis',
       'is_male', 'age'],
      dtype='object')

## Basic checks

In [7]:
# Running tally of failed data checks. Each check cell below increments it on
# failure, and the final cell of this section raises if it is non-zero.
# Re-run this cell before re-running the checks, otherwise failures are
# counted more than once.
FAILED_CHECKS: int = 0

Check that specific columns have no missing (`NA`) values.

In [8]:
# Columns that must not contain any missing values.
for c in [
    "depmap_id",
    "sgrna",
    "hugo_symbol",
    "lfc",
    "screen",
    "num_mutations",
    "is_mutated",
    "lineage",
    "is_male",
    "age",
]:
    # `Series.isna().any()` is a single vectorized reduction. The previous
    # `any(df[[c]].isna().values)` walked a 2-D array row by row in Python.
    if depmap_modeling_df[c].isna().any():
        print(f"Column '{c}' has missing values but should not.")
        FAILED_CHECKS += 1

Check that all combinations of cell line, sgRNA, and experimental replicate only appear once.

In [9]:
# Count how many rows share each (cell line, sgRNA, replicate) combination.
# NOTE: `groupby` drops rows with NaN in any key column by default, and
# `replicate_id` is not among the columns checked for missing values, so rows
# lacking a replicate ID would not be counted here.
grp_cols = ["depmap_id", "sgrna", "replicate_id"]
ct_df = (
    depmap_modeling_df[grp_cols]
    .assign(n=1)
    .groupby(grp_cols)
    .count()
    .reset_index(drop=False)
)

# Every combination must occur exactly once. The comparison is vectorized on
# the `n` column instead of iterating over a 2-D array in Python, and the
# message now describes the check that is actually performed.
if not (ct_df["n"] == 1).all():
    print("Some cell line, sgRNA, and replicate combinations appear more than once.")
    FAILED_CHECKS += 1

In [10]:
# Spot-check: show the rows for one specific sgRNA within one `depmap_id`
# (presumably one cell line) to inspect them by eye.
depmap_modeling_df.query("sgrna == 'AACTGCGCACAGAAGGAGA' and depmap_id == 'ACH-000001'")

Unnamed: 0,sgrna,replicate_id,lfc,p_dna_batch,genome_alignment,hugo_symbol,screen,multiple_hits_on_gene,sgrna_target_chr,sgrna_target_pos,...,num_mutations,any_deleterious,any_tcga_hotspot,any_cosmic_hotspot,is_mutated,copy_number,lineage,primary_or_metastasis,is_male,age
2559,AACTGCGCACAGAAGGAGA,OVR3_c905R1,0.039557,CRISPR_C6596666.sample,chr16_72058351_+,HP,sanger,True,16,72058351,...,0,,,,False,0.71174,ovary,metastasis,False,60


In [11]:
# Show any (depmap_id, sgrna, replicate_id) groups counted more than once;
# an empty frame means no duplicates were found.
ct_df[ct_df["n"].gt(1)]

Unnamed: 0,depmap_id,sgrna,replicate_id,n


In [12]:
# Stop the notebook here if any check above recorded a failure.
if FAILED_CHECKS >= 1:
    raise Exception(f"There were {FAILED_CHECKS} failed checks.")

---

In [13]:
# Record the execution environment for reproducibility: run date, Python and
# IPython versions, machine details, hostname, git branch, and the versions of
# the imported packages.
%load_ext watermark
%watermark -d -u -v -iv -b -h -m

Last updated: 2021-06-03

Python implementation: CPython
Python version       : 3.9.2
IPython version      : 7.24.1

Compiler    : GCC 9.3.0
OS          : Linux
Release     : 3.10.0-1062.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 32
Architecture: 64bit

Hostname: compute-a-16-62.o2.rc.hms.harvard.edu

Git branch: update-data

seaborn   : 0.11.1
pandas    : 1.2.3
numpy     : 1.20.1
matplotlib: 3.3.4
plotnine  : 0.8.0

