# Make a smaller data set for testing models

In [1]:
library(nakedpipe)
library(tidyverse)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 1.0.1
[32m✔[39m [34mtidyr  [39m 1.1.1     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



In [20]:
data_dir <- file.path("..", "modeling_data")
modeling_data_path <- file.path(data_dir, "depmap_modeling_dataframe.csv")
out_path <- file.path(data_dir, "depmap_modeling_dataframe_subsample.csv")

In [4]:
modeling_data <- read_csv(modeling_data_path, guess_max = 1e5)
head(modeling_data)

Parsed with column specification:
cols(
  .default = col_character(),
  lfc = [32mcol_double()[39m,
  pdna_batch = [32mcol_double()[39m,
  passes_qc = [33mcol_logical()[39m,
  n_alignments = [32mcol_double()[39m,
  chrom_pos = [32mcol_double()[39m,
  segment_mean = [32mcol_double()[39m,
  copy_number = [32mcol_double()[39m,
  n_muts = [32mcol_double()[39m,
  any_deleterious = [33mcol_logical()[39m,
  mutated_at_guide_location = [33mcol_logical()[39m,
  rna_expr = [32mcol_double()[39m
)

See spec(...) for full column specifications.



sgrna,replicate_id,lfc,pdna_batch,passes_qc,depmap_id,primary_or_metastasis,lineage,lineage_subtype,kras_mutation,⋯,segment_mean,copy_number,n_muts,any_deleterious,variant_classification,is_deleterious,is_tcga_hotspot,is_cosmic_hotspot,mutated_at_guide_location,rna_expr
<chr>,<chr>,<dbl>,<dbl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<dbl>
AAAAAAATCCAGCAATGCAG,143b-311cas9_repa_p6_batch3,0.2896938,3,True,ACH-001001,Primary,bone,osteosarcoma,G12S,⋯,1.143428,2.209053,0,False,,,,,False,4.1009776
AAAAAACCCGTAGATAGCCT,143b-311cas9_repa_p6_batch3,0.1701723,3,True,ACH-001001,Primary,bone,osteosarcoma,G12S,⋯,0.8850587,1.84684,0,False,,,,,False,7.4709435
AAAAAAGAAGAAAAAACCAG,143b-311cas9_repa_p6_batch3,-0.6959467,3,True,ACH-001001,Primary,bone,osteosarcoma,G12S,⋯,0.8946238,1.859125,0,False,,,,,False,4.6270231
AAAAAAGCTCAAGAAGGAGG,143b-311cas9_repa_p6_batch3,-0.3249354,3,True,ACH-001001,Primary,bone,osteosarcoma,G12S,⋯,1.046162,2.065029,0,False,,,,,False,4.7750505
AAAAAAGGCTGTAAAAGCGT,143b-311cas9_repa_p6_batch3,0.1428739,3,True,ACH-001001,Primary,bone,osteosarcoma,G12S,⋯,0.8744568,1.833318,0,False,,,,,False,0.2265085
AAAAAAGGGCTCCAAAAAGG,143b-311cas9_repa_p6_batch3,-0.2998787,3,True,ACH-001001,Primary,bone,osteosarcoma,G12S,⋯,1.085596,2.122252,0,False,,,,,False,0.3785116


In [5]:
glimpse(modeling_data)

Rows: 120,235,860
Columns: 25
$ sgrna                     [3m[90m<chr>[39m[23m "AAAAAAATCCAGCAATGCAG", "AAAAAACCCGTAGATAGC…
$ replicate_id              [3m[90m<chr>[39m[23m "143b-311cas9_repa_p6_batch3", "143b-311cas…
$ lfc                       [3m[90m<dbl>[39m[23m 0.28969381, 0.17017231, -0.69594665, -0.324…
$ pdna_batch                [3m[90m<dbl>[39m[23m 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3…
$ passes_qc                 [3m[90m<lgl>[39m[23m TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T…
$ depmap_id                 [3m[90m<chr>[39m[23m "ACH-001001", "ACH-001001", "ACH-001001", "…
$ primary_or_metastasis     [3m[90m<chr>[39m[23m "Primary", "Primary", "Primary", "Primary",…
$ lineage                   [3m[90m<chr>[39m[23m "bone", "bone", "bone", "bone", "bone", "bo…
$ lineage_subtype           [3m[90m<chr>[39m[23m "osteosarcoma", "osteosarcoma", "osteosarco…
$ kras_mutation             [3m[90m<chr>[39m[23m "G12S", "G12S", "G12S", "G12S", "G1

In [6]:
# Number of unique guides.
n_distinct(modeling_data$sgrna)

In [7]:
# Number of unique cell lines.
n_distinct(modeling_data$depmap_id)

In [8]:
# Number of genes.
n_distinct(modeling_data$hugo_symbol)

In [12]:
# Some specific genes to subset.
specific_genes <- c("KRAS", "BRAF", "PIK3CA", "PTK2", "MDM2", "TP53")

set.seed(0)
num_random_genes <- 20
subsample_genes <- c(
    sample(unique(modeling_data$hugo_symbol), num_random_genes),
    specific_genes
)

# Check that all genes are actually in the modeling data.
all(subsample_genes %in% modeling_data$hugo_symbol)

In [13]:
subsample_genes

In [18]:
subsample_modeling_data <- modeling_data %>% filter(hugo_symbol %in% subsample_genes)
pryr::object_size(subsample_modeling_data)

31.5 MB

In [21]:
nrow(subsample_modeling_data)

In [22]:
write_csv(subsample_modeling_data, out_path)