# Prepare DepMap data

In [4]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import janitor
from pathlib import Path
import re

In [12]:
# Directory the raw CSV files are read from, and directory the cleaned
# tables produced by the cells below are written to.
data_dir = Path('../data')
save_dir = Path('../modeling_data')

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output directory exists before any later cell tries to write into it.
save_dir.mkdir(parents=True, exist_ok=True)

## 'sample_info.csv'

In [9]:
# Columns kept from the sample annotation table; the names are the ones
# produced by `clean_names()`.
sample_info_columns = [
    'depmap_id',
    'stripped_cell_line_name',
    'ccle_name',
    'sex',
    'cas9_activity',
    'primary_or_metastasis',
    'primary_disease',
    'subtype',
    'lineage',
    'lineage_subtype',
]

# Read the raw table, normalize the column names, and keep only the columns
# listed above (a missing column raises a KeyError).
sample_info = (
    pd.read_csv(data_dir / 'sample_info.csv')
    .clean_names()
    .loc[:, sample_info_columns]
)
sample_info.head()

Unnamed: 0,depmap_id,stripped_cell_line_name,ccle_name,sex,cas9_activity,primary_or_metastasis,primary_disease,subtype,lineage,lineage_subtype
0,ACH-000001,NIHOVCAR3,NIHOVCAR3_OVARY,Female,,Metastasis,Ovarian Cancer,"Adenocarcinoma, high grade serous",ovary,ovary_adenocarcinoma
1,ACH-000002,HL60,HL60_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Female,,Primary,Leukemia,"Acute Myelogenous Leukemia (AML), M3 (Promyelo...",blood,AML
2,ACH-000003,CACO2,CACO2_LARGE_INTESTINE,Male,,,Colon/Colorectal Cancer,Adenocarcinoma,colorectal,colorectal_adenocarcinoma
3,ACH-000004,HEL,HEL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Male,47.6,,Leukemia,"Acute Myelogenous Leukemia (AML), M6 (Erythrol...",blood,AML
4,ACH-000005,HEL9217,HEL9217_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Male,13.4,,Leukemia,"Acute Myelogenous Leukemia (AML), M6 (Erythrol...",blood,AML


In [13]:
# Persist the trimmed sample table without the row index.
sample_info.to_csv(save_dir.joinpath('sample_info.csv'), index=False)

## 'Achilles_guide_map.csv'

In [20]:
# Load the sgRNA-to-gene map and derive `hugo_symbol` as the text before the
# first space of the `gene` column; the original `gene` column is dropped.
# The vectorized `.str` accessor is used instead of a Python-level loop so
# that a missing `gene` value propagates as NaN rather than raising.
achilles_guide_map = (
    pd.read_csv(data_dir / 'Achilles_guide_map.csv')
    .clean_names()
    .assign(hugo_symbol=lambda df: df['gene'].str.split(' ').str[0])
    .drop(columns=['gene'])
)

achilles_guide_map.head()

Unnamed: 0,sgrna,genome_alignment,n_alignments,hugo_symbol
0,AAAAAAATCCAGCAATGCAG,chr10_110964620_+,1,SHOC2
1,AAAAAACCCGTAGATAGCCT,chr12_95003615_+,1,NDUFA12
2,AAAAAAGAAGAAAAAACCAG,chr4_75970356_-,1,SDAD1
3,AAAAAAGCTCAAGAAGGAGG,chr2_33588446_-,1,FAM98A
4,AAAAAAGGCTGTAAAAGCGT,chr19_19891600_+,1,ZNF253


## 'Achilles_dropped_guides.csv'

In [23]:
# Load the list of dropped guides. After `clean_names()` the unnamed first
# column is called `unnamed_0`; it is renamed to `sgrna` to match the guide map.
achilles_dropped_guides = (
    pd.read_csv(data_dir / 'Achilles_dropped_guides.csv')
    .clean_names()
    .rename(columns={'unnamed_0': 'sgrna'})
)

achilles_dropped_guides.head()

Unnamed: 0,sgrna,genomic_coordinates,gene,n_alignments,fail_reason
0,AAAAAGCTTCCGCCTGATGG,,,0.0,not_aligned
1,AAAAATCCTAAAATAAAATA,chrX_145827835.0_-,,1.0,in_dropped_guides
2,AAAACAGAATATAGTCAGTG,chrX_145827787.0_-,,1.0,guide_dropped_by_ceres
3,AAAACAGGACGATGTGCGGC,,,0.0,not_aligned
4,AAAACATCGACCGAAAGCGT,,,0.0,not_aligned


In [24]:
# Number of distinct sgRNA sequences in the dropped-guides table.
np.unique(achilles_dropped_guides['sgrna']).size

2554

In [26]:
# Remove every guide that appears in the dropped-guides table, then renumber
# the rows from zero.
achilles_guide_map = (
    achilles_guide_map
    .loc[lambda guides: ~guides['sgrna'].isin(achilles_dropped_guides['sgrna'])]
    .reset_index(drop=True)
)

In [27]:
# Persist the filtered guide map without the row index.
achilles_guide_map.to_csv(save_dir.joinpath('achilles_guide_map.csv'), index=False)

## 'Achilles_replicate_map.csv'

In [37]:
# Load the replicate-to-cell-line map and lower-case the replicate IDs; the
# log-fold-change cell lower-cases its IDs the same way before merging.
achilles_replicate_map = (
    pd.read_csv(data_dir / 'Achilles_replicate_map.csv')
    .clean_names()
    .assign(replicate_id=lambda reps: reps['replicate_id'].str.lower())
)
achilles_replicate_map.head()

Unnamed: 0,replicate_id,depmap_id,pdna_batch,passes_qc
0,pacadd188-311cas9_repb_p6_batch3,ACH-001382,3,True
1,kmrc20-311cas9_repa_p6_batch3,ACH-000250,3,True
2,253j-311cas9_repa_p5_batch3,ACH-000011,3,True
3,ocug1-311cas9-repb-p6_batch3,ACH-001619,3,True
4,raji-311cas9_repb_p6_batch3,ACH-000654,3,True


In [38]:
# Largest pDNA batch number present in the replicate map.
achilles_replicate_map['pdna_batch'].max()

4

In [39]:
# Number of distinct replicate IDs in the replicate map.
np.unique(achilles_replicate_map['replicate_id']).size

1638

In [40]:
# Fraction of replicates flagged as passing QC, rounded to three decimals.
achilles_replicate_map['passes_qc'].mean().round(3)

0.996

In [41]:
# Persist the cleaned replicate map without the row index.
achilles_replicate_map.to_csv(save_dir.joinpath('achilles_replicate_map.csv'), index=False)

## 'Achilles_logfold_change.csv'

In [82]:
# Reshape the wide log-fold-change matrix (one row per sgRNA, one column per
# replicate) into long format and annotate each row with its replicate's
# cell line, pDNA batch and QC flag.
achilles_logfold_change = (
    pd.read_csv(data_dir / 'Achilles_logfold_change.csv')
    .rename(columns={'Construct Barcode': 'sgrna'})
    .set_index('sgrna')
    .melt(var_name='replicate_id', value_name='lfc', ignore_index=False)
    .reset_index()
    # Replicate IDs are lower-cased to match `achilles_replicate_map`.
    .assign(replicate_id=lambda lfc: lfc['replicate_id'].str.lower())
    .merge(achilles_replicate_map, on='replicate_id', how='left')
    # Drop only rows whose replicate explicitly failed QC. A replicate with no
    # entry in the map gets NaN in `passes_qc` after the left merge; masking
    # directly with that column would raise "Cannot mask with non-boolean
    # array containing NA / NaN values". Such rows are kept here so that the
    # explicit missing-`depmap_id` check that follows can report them.
    .loc[lambda lfc: lfc['passes_qc'].ne(False)]
)

achilles_logfold_change.head()

Unnamed: 0,sgrna,replicate_id,lfc,depmap_id,pdna_batch,passes_qc
0,AAAAAAATCCAGCAATGCAG,143b-311cas9_repa_p6_batch3,0.289694,ACH-001001,3,True
1,AAAAAACCCGTAGATAGCCT,143b-311cas9_repa_p6_batch3,0.170172,ACH-001001,3,True
2,AAAAAAGAAGAAAAAACCAG,143b-311cas9_repa_p6_batch3,-0.695947,ACH-001001,3,True
3,AAAAAAGCTCAAGAAGGAGG,143b-311cas9_repa_p6_batch3,-0.324935,ACH-001001,3,True
4,AAAAAAGGCTGTAAAAGCGT,143b-311cas9_repa_p6_batch3,0.142874,ACH-001001,3,True


In [83]:
# Every data point must have been matched to a cell line by the merge.
if achilles_logfold_change['depmap_id'].isna().any():
    raise Exception('Some data points are missing cell line assignments.')

In [84]:
# Sanity check: only QC-passing rows should remain after the filter above.
if not achilles_logfold_change['passes_qc'].all():
    raise Exception('Some data does not pass QC.')

In [85]:
# (rows, columns) of the merged, QC-filtered long-format table.
achilles_logfold_change.shape

(121067627, 6)

In [86]:
# Persist the long-format log-fold-change table without the row index.
achilles_logfold_change.to_csv(
    save_dir.joinpath('achilles_logfold_change.csv'),
    index=False,
)

## 'CCLE_mutations.csv'

In [98]:
# Columns kept from the mutation table; the names are the ones produced by
# `clean_names()`.
ccle_mutations_columns = [
    'depmap_id',
    'hugo_symbol',
    'chromosome',
    'start_position',
    'end_position',
    'variant_classification',
    'variant_type',
    'reference_allele',
    'tumor_seq_allele1',
    'cdna_change',
    'codon_change',
    'protein_change',
    'isdeleterious',
    'istcgahotspot',
    'iscosmichotspot',
]

# The file is read as tab-separated despite its `.csv` extension.
# `low_memory=False` makes pandas parse the file in one pass.
ccle_mutations = (
    pd.read_csv(data_dir / 'CCLE_mutations.csv', sep='\t', low_memory=False)
    .clean_names()
    .loc[:, ccle_mutations_columns]
)

ccle_mutations.head()

Unnamed: 0,depmap_id,hugo_symbol,chromosome,start_position,end_position,variant_classification,variant_type,reference_allele,tumor_seq_allele1,cdna_change,codon_change,protein_change,isdeleterious,istcgahotspot,iscosmichotspot
0,ACH-000986,A1BG,19,58858743,58858743,Missense_Mutation,SNP,C,T,c.1456G>A,c.(1456-1458)Gac>Aac,p.D486N,False,False,False
1,ACH-000988,A1BG,19,58858810,58858810,Silent,SNP,C,T,c.1389G>A,c.(1387-1389)caG>caA,p.Q463Q,False,False,False
2,ACH-002182,A1BG,19,58858867,58858867,Missense_Mutation,SNP,C,G,c.1332G>C,c.(1330-1332)aaG>aaC,p.K444N,False,False,False
3,ACH-000985,A1BG,19,58858872,58858872,Missense_Mutation,SNP,C,T,c.1327G>A,c.(1327-1329)Gtg>Atg,p.V443M,False,False,False
4,ACH-001793,A1BG,19,58858914,58858914,Missense_Mutation,SNP,C,T,c.1285G>A,c.(1285-1287)Gac>Aac,p.D429N,False,False,False


In [99]:
# Persist the trimmed mutation table without the row index.
ccle_mutations.to_csv(save_dir.joinpath('ccle_mutations.csv'), index=False)

### *KRAS* mutations

In [136]:
# Columns of `ccle_mutations` carried into the KRAS-specific table.
kras_mutations_columns = [
    'depmap_id', 'start_position', 'end_position',
    'variant_classification', 'variant_type', 'protein_change',
    'isdeleterious', 'istcgahotspot', 'iscosmichotspot'
]

# Codons treated as KRAS hotspots. They are strings because `codon` below is
# built by stripping non-digits from the `protein_change` text.
kras_hotspot_codons = ['12', '13', '61', '146']

# Build the KRAS mutation table:
#   1. keep KRAS rows and the columns listed above;
#   2. lower-case the variant annotations and derive `codon` /
#      `is_kras_hotspot` from `protein_change`;
#   3. drop silent mutations and keep rows that are a KRAS hotspot, a
#      COSMIC/TCGA hotspot, or deleterious;
#   4. label hotspot rows with their protein change (without the leading
#      "p.") and every other row as "other".
# Raw-string patterns and the `.str` accessor replace the original
# `re.sub('\D', ...)` / `re.sub('p.', ...)` calls: '\D' in a non-raw string is
# an invalid escape sequence, 'p.' matched a "p" followed by ANY character
# anywhere in the string, and `re.sub` raised TypeError on a missing
# `protein_change`.
kras_mutations = (
    ccle_mutations
    .loc[ccle_mutations.hugo_symbol == 'KRAS', kras_mutations_columns]
    .assign(
        variant_classification=lambda x: x.variant_classification.str.lower(),
        variant_type=lambda x: x.variant_type.str.lower(),
        # NOTE(review): all digits are concatenated, so a change spanning two
        # positions (e.g. "G12_G13...") yields "1213" and is not treated as a
        # hotspot - confirm this is intended.
        codon=lambda x: x.protein_change.str.replace(r'\D', '', regex=True),
        is_kras_hotspot=lambda x: x.codon.isin(kras_hotspot_codons),
    )
    .loc[lambda x: x.variant_classification != 'silent']
    .loc[lambda x: x.is_kras_hotspot | x.iscosmichotspot | x.istcgahotspot | x.isdeleterious]
    .assign(
        kras_allele=lambda x: np.where(
            x.is_kras_hotspot,
            # Only strip a literal "p." prefix at the start of the string.
            x.protein_change.str.replace(r'^p\.', '', regex=True),
            'other',
        )
    )
    .drop_duplicates()
    .reset_index(drop=True)
)


kras_mutations.head()

Unnamed: 0,depmap_id,start_position,end_position,variant_classification,variant_type,protein_change,isdeleterious,istcgahotspot,iscosmichotspot,codon,is_kras_hotspot,kras_allele
0,ACH-000981,25368390,25368390,frame_shift_del,del,p.K185fs,True,False,False,185,False,other
1,ACH-001650,25368390,25368390,frame_shift_del,del,p.K185fs,True,False,False,185,False,other
2,ACH-002238,25368390,25368390,frame_shift_del,del,p.K185fs,True,False,False,185,False,other
3,ACH-000996,25368455,25368455,nonsense_mutation,snp,p.R164*,True,False,False,164,False,other
4,ACH-000218,25378561,25378561,missense_mutation,snp,p.A146V,False,True,True,146,True,A146V


In [137]:
# Number of rows per KRAS allele, most frequent first.
(
    kras_mutations
    .groupby('kras_allele')[['depmap_id']]
    .count()
    .sort_values(by='depmap_id', ascending=False)
)

Unnamed: 0_level_0,depmap_id
kras_allele,Unnamed: 1_level_1
G12D,68
G12V,47
G12C,27
other,18
G12A,16
G13D,14
Q61H,9
G12S,8
G12R,8
A146T,7


In [134]:
# Persist the KRAS mutation table without the row index.
# NOTE(review): this cell's execution count (134) is lower than that of the
# cell defining `kras_mutations` (136), so the file on disk may predate the
# current table - re-run this cell after regenerating `kras_mutations`.
kras_mutations.to_csv(save_dir.joinpath('kras_mutations.csv'), index=False)

In [143]:
# Cell lines with more than one row in `kras_mutations`, with their row
# counts (stored in the `kras_allele` column), largest first.
mult_kras_mutations = (
    kras_mutations
    .groupby('depmap_id')[['kras_allele']]
    .count()
    .loc[lambda counts: counts['kras_allele'] > 1]
    .sort_values(by='kras_allele', ascending=False)
    .reset_index()
)

mult_kras_mutations

Unnamed: 0,depmap_id,kras_allele
0,ACH-000718,3
1,ACH-000249,2
2,ACH-000264,2
3,ACH-000314,2
4,ACH-000344,2
5,ACH-001001,2
6,ACH-001094,2
7,ACH-001378,2
8,ACH-001650,2
9,ACH-001857,2


In [149]:
# Hotspot rows of the cell lines that have multiple KRAS rows. The index is
# reset *before* the hotspot filter, so the displayed index is the row
# position within the multi-mutation subset.
kras_mult_mutations_fix = (
    kras_mutations
    .loc[lambda muts: muts['depmap_id'].isin(mult_kras_mutations['depmap_id'])]
    .reset_index(drop=True)
    .loc[lambda muts: muts['is_kras_hotspot']]
    .sort_values(by='depmap_id')
)

kras_mult_mutations_fix

Unnamed: 0,depmap_id,start_position,end_position,variant_classification,variant_type,protein_change,isdeleterious,istcgahotspot,iscosmichotspot,codon,is_kras_hotspot,kras_allele
1,ACH-000249,25380275,25380275,missense_mutation,snp,p.Q61H,False,True,True,61,True,Q61H
3,ACH-000264,25380277,25380277,missense_mutation,snp,p.Q61K,False,True,True,61,True,Q61K
6,ACH-000264,25380277,25380278,missense_mutation,dnp,p.Q61K,False,True,True,61,True,Q61K
2,ACH-000314,25380275,25380275,missense_mutation,snp,p.Q61H,False,True,True,61,True,Q61H
10,ACH-000314,25398282,25398282,missense_mutation,snp,p.G13C,False,True,True,13,True,G13C
4,ACH-000344,25380277,25380277,missense_mutation,snp,p.Q61K,False,True,True,61,True,Q61K
5,ACH-000344,25380277,25380278,missense_mutation,dnp,p.Q61K,False,True,True,61,True,Q61K
11,ACH-000718,25398284,25398284,missense_mutation,snp,p.G12V,False,True,True,12,True,G12V
16,ACH-000718,25398285,25398285,missense_mutation,snp,p.G12C,False,True,True,12,True,G12C
15,ACH-000718,25398284,25398285,missense_mutation,dnp,p.G12F,False,True,True,12,True,G12F


In [153]:
# Cell lines excluded from the multi-mutation summary below.
kras_mut_blacklist = ['ACH-000718']

# Hotspot-row count per remaining cell line (stored in the `kras_allele`
# column), largest first.
(
    kras_mult_mutations_fix
    .loc[
        lambda muts: ~muts['depmap_id'].isin(kras_mut_blacklist),
        ['depmap_id', 'kras_allele'],
    ]
    .groupby('depmap_id')
    .count()
    .sort_values(by='kras_allele', ascending=False)
    .reset_index()
)

Unnamed: 0,depmap_id,kras_allele
0,ACH-000264,2
1,ACH-000314,2
2,ACH-000344,2
3,ACH-001378,2
4,ACH-001857,2
5,ACH-000249,1
6,ACH-001001,1
7,ACH-001094,1
8,ACH-001650,1
