# Sanger DepMap Cancer Cell Line Proteomics

The [Sanger Cancer Dependency Map](https://depmap.sanger.ac.uk/) is a project aimed at assigning dependencies to every cancer cell which could be leveraged in patient treatment. The resource contains data for gene expression, gene mutation, CRISPR knockout, proteomics, and many more analysis methods in order to exhaustively explore cancer dependencies.

The Cancer Cell Line Proteomics dataset measures protein intensity values acquired using data-independent acquisition mass spectrometry (DIA-MS). The dataset includes associations between 949 cancerous human cell lines and 8320 genes.

Gonçalves, E et al. (2022). "Pan-cancer proteomic map of 949 human cell lines." Cancer Cell 40(8): 835-849.e8.

In [None]:
import pandas as pd
import datetime
import numpy as np
import scipy.spatial.distance as dist
import seaborn
import sys
import json
import scanpy as sc

from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from bokeh.io import output_notebook, export_svg, output_file, save
from bokeh.io.export import get_screenshot_as_png
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
sys.setrecursionlimit(100000)

## Load Data
CSV files are exported from the corresponding SQL tables of the database and placed in a folder named "tables". Mapping files are also placed in this folder.

The Sanger DepMap Cancer Cell Line Proteomics data used below can be found [here](https://cellmodelpassports.sanger.ac.uk/downloads).

In [None]:
def threshold(zscore):
    """Collapse a z-score to its sign: 1 when positive, -1 when negative.

    A value that is neither above nor below zero (0, NaN) is handed back
    unchanged.
    """
    if zscore < 0:
        return -1
    return 1 if zscore > 0 else zscore

In [None]:
# Read the long-format proteomics table and tag every measurement with the
# sign of its z-score.
proteomics = pd.read_csv('newdata/SangerProteomics/proteomics_all_20220713.csv')
# The index round-trip moves model_name and symbol to the front of the frame.
proteomics = proteomics.set_index(['model_name', 'symbol'])
proteomics['threshold'] = proteomics['zscore'].map(threshold)
proteomics = proteomics.reset_index()
proteomics

In [None]:
# Report how many distinct cell lines and gene symbols the table holds.
print(len(proteomics['model_name'].unique()), 'cell lines,',
      len(proteomics['symbol'].unique()), 'genes')

## Pre-process Data

### Filter Genes

In [None]:
# Gene table read from production/gene_updated.csv; symbols are upper-cased
# so that later symbol comparisons use a single casing.
prodgenes = pd.read_csv('production/gene_updated.csv')
prodgenes['symbol'] = prodgenes['symbol'].map(str.upper)
prodgenes

In [None]:
# Gene info table restricted to tax id 9606 and protein-coding entries,
# keeping only the id, symbol and description columns.
geneinfo = (
    pd.read_csv('tables/gene_info', sep='\t')
    .loc[lambda df: (df['#tax_id'] == 9606) & (df['type_of_gene'] == 'protein-coding')]
    .get(['GeneID', 'Symbol', 'description'])
)
geneinfo

In [None]:
# Symbols found in neither the production gene table nor the gene info table
# are collected for removal. The lookup set is built once so each membership
# test is O(1) instead of rebuilding and scanning two lists per symbol.
known_symbols = set(prodgenes['symbol']) | set(geneinfo['Symbol'])
dropgenes = [gene for gene in proteomics['symbol'].unique()
             if gene not in known_symbols]

In [None]:
# Remove every row whose symbol was collected in dropgenes.
proteomics = (
    proteomics
    .set_index('symbol')
    .drop(index=dropgenes)
    .reset_index()
)
proteomics

### Map Gene Symbols to Entrez NCBI ID

In [None]:
# symbol -> Entrez gene id; entries from geneinfo override those from
# prodgenes when both define the same symbol.
genedict = {
    **prodgenes.set_index('symbol')['ncbi_entrez_gene_id'].to_dict(),
    **geneinfo.set_index('Symbol')['GeneID'].to_dict(),
}
# A symbol missing from genedict raises KeyError here.
proteomics['gene_id'] = [genedict[symbol] for symbol in proteomics['symbol']]
proteomics = proteomics.get(
    ['symbol', 'gene_id', 'model_name', 'model_id',
     'protein_intensity', 'zscore', 'threshold'])
proteomics

## Process Data

In [None]:
# Keep, for every cell line, the TOP_N rows with the highest z-score followed
# by the TOP_N rows with the lowest z-score.
# groupby().head() replaces growing a frame with pd.concat inside a loop: it
# is linear, keeps the column dtypes, and also works when a cell line has a
# single row.
# NOTE: a cell line with fewer than 2 * TOP_N rows contributes the same row
# to both halves.
TOP_N = 100
sort_keys = ['model_name', 'zscore']
# model_name leads the column order of both resulting frames.
column_order = ['model_name'] + [c for c in proteomics.columns if c != 'model_name']

by_zscore_desc = proteomics.sort_values(sort_keys, ascending=[True, False])
top_hits = by_zscore_desc.groupby('model_name', sort=False).head(TOP_N)

by_zscore_asc = by_zscore_desc.sort_values(sort_keys)
bottom_hits = by_zscore_asc.groupby('model_name', sort=False).head(TOP_N)

# proteomics ends up sorted by cell line and ascending z-score.
proteomics = by_zscore_asc[column_order].reset_index(drop=True)
# All top rows come first (cell line order), then all bottom rows.
edgelist = pd.concat([top_hits, bottom_hits], ignore_index=True)[column_order]
edgelist

In [None]:
# Report the distinct cell lines and gene symbols left after filtering.
print(len(proteomics['model_name'].unique()), 'cell lines,',
      len(proteomics['symbol'].unique()), 'genes')

## Harmonizome Additions

### Resource

In [None]:
# Resource record for the Sanger Cancer Dependency Map; evaluating the cell
# only displays the tuple.
# NOTE(review): the field order presumably follows the Harmonizome resource
# table -- confirm against the schema before inserting.
('Sanger Cancer Dependency Map',
    'DepMap',
    'This project aims to assign a dependency to every cancer cell in a patient which could be exploited to develop new therapies. This knowledge is foundational for precision cancer medicine',
    'Identifying all dependencies in every cancer cell',
    'https://depmap.sanger.ac.uk/',
    949,
    1,
    'depmap_logo.png')

### Dataset

In [None]:
# Dataset record (id 130) for the Cancer Cell Line Proteomics dataset; the
# date field is filled with today's date in YYYY-MM-DD form when the cell runs.
# NOTE(review): 'gene_set_description' and the two '..._description from the
# ...' strings look like unfilled template text -- confirm before inserting.
# NOTE(review): the meaning of the bare integers is not shown in this
# notebook -- check against the dataset table schema.
(130, 
    'Sanger Dependency Map Cancer Cell Line Proteomics', 
    'Cancer Cell Line Proteomics', 
    'Protein intensity values acquired using data-independent acquisition mass spectrometry (DIA-MS).', 
    'association', 
    'gene_set_description', 
    'gene_sets_description from the Sanger Dependency Map Cancer Cell Line Proteomics dataset', 
    'attribute_set_description from the Sanger Dependency Map Cancer Cell Line Proteomics dataset', 
    '', 
    '', 
    0, 
    0, 
    datetime.datetime.today().strftime('%Y-%m-%d'), 
    'sangerdepmap', 
    0, 
    76, 
    25, 
    5, 
    1, 
    1, 
    'protein expression by mass spectrometry', 
    'curated experimental data', 
    'high throughput, data driven', 
    'cell lines')

### Publication

In [None]:
# Publication record for Gonçalves et al. (2022); every field is a string.
# NOTE(review): the last field gives the pages as '835-49' while the long
# citation says '835-849.e8' -- confirm which form the table expects.
('130',
'Gonçalves, E et al. (2022) DepMap: Pan-cancer proteomic map of 949 human cell lines. Cancer Cell 40, 835-849.e8.',
'Gonçalves, Cancer Cell, 2022',
'https://doi.org/10.1016/j.ccell.2022.06.010',
'35839778',
'http://www.ncbi.nlm.nih.gov/pubmed/35839778',
'Gonçalves',
'E',
'Cancer Cell',
'2022',
'Pan-cancer proteomic map of 949 human cell lines',
'40',
'835-49')

### Naming Authority

In [None]:
# Naming authority record (id 98) for Cell Model Passports.
# NOTE(review): the trailing 130 matches the dataset id used elsewhere in
# this notebook; presumably a foreign key -- confirm against the schema.
(98,
    'Cell Model Passports',
    'CMP',
    'A Hub for Preclinical Cancer Models - Annotation, Genomics & Functional Datasets',
    'cellmodelpassports.sanger.ac.uk',
    130)

### Attributes To Add

In [None]:
# model_id -> "<cancer_type> <tissue_status>" description.
cancermodels = pd.read_csv('tables/model_list_20221014.csv').set_index('model_id')
cancermodels = (cancermodels['cancer_type'] + ' ' + cancermodels['tissue_status']).to_dict()

# One row per distinct (model_name, model_id) pair found in the edge list.
attributes = (
    edgelist.get(['model_name', 'model_id'])
    .drop_duplicates()
    .reset_index(drop=True)
)
# A model_id missing from cancermodels raises KeyError here.
attributes['model_desc'] = [cancermodels[model_id] for model_id in attributes['model_id']]

# Both id columns are the row position shifted by a fixed offset.
attributes.insert(0, 'gene_set', attributes.index.to_numpy() + 130000000)
attributes.insert(0, 'attribute', attributes.index.to_numpy() + 295497)

genesetfk = dict(zip(attributes['model_name'], attributes['gene_set']))
attributes

In [None]:
# id, name_from_naming_authority, id_from_naming_authority, description_from_naming_authority, url, naming_authority_fk
# The print call below sits inside a string literal, so this loop currently
# emits nothing; it only walks the rows and binds cellline / id.
for _, cellline in attributes.iterrows():
    id = cellline['model_id']
    '''print((cellline['attribute'],
        cellline['model_name'], 
        id, 
        'cell line derived from ' + cellline['model_desc'],
        'cellmodelpassports.sanger.ac.uk/passports/'+id,
        98), end=',\n')'''

### Gene Sets to Add

In [None]:
# id, name_from_dataset, id_from_dataset, description_from_dataset, url_from_dataset, dataset_fk, attribute_type_fk, attribute_fk
# The print call below sits inside a string literal, so this loop currently
# emits nothing; it only walks the rows and binds cellline / id.
for _, cellline in attributes.iterrows():
    id = cellline['model_id']
    '''print((cellline['gene_set'],
        cellline['model_name'],
        id,
        'cell line derived from ' + cellline['model_desc'],
        'https://cellmodelpassports.sanger.ac.uk/passports/'+id,
        130,
        1,
        cellline['attribute']), end=',\n')'''

### Genes To Add

In [None]:
# id, symbol, entrez_gene_id, name, ncbi_entrez_gene_url
# Prints one row per edge-list gene that is absent from prodgenes but present
# in geneinfo, and records the foreign key each gene will get in genefk.
genefk = prodgenes.set_index('symbol')['id'].to_dict()
geneinfo['Symbol'] = geneinfo['Symbol'].apply(str.upper)
geneinfo = geneinfo.set_index('Symbol')
i = 56721  # first id handed to a newly added gene
for gene in edgelist.symbol.unique():
    # prodgenes, geneinfo and the later genefk lookups all use upper-cased
    # symbols, so compare in that casing.
    symbol = gene.upper()
    # genefk starts out keyed by the prodgenes symbols, so this one dict test
    # covers both "already in production" and "already added in this loop".
    if symbol in genefk or symbol not in geneinfo.index:
        continue
    # .loc[[symbol]] always yields a frame, even if the upper-cased symbol
    # is not unique; the first match is used.
    record = geneinfo.loc[[symbol]].iloc[0]
    entrez_id = int(record['GeneID'])
    print((i,
        gene,
        entrez_id,
        record['description'],
        'http://www.ncbi.nlm.nih.gov/gene/'+str(entrez_id)),end=',\n')
    # The foreign key is the id of the row printed above, not the Entrez id.
    genefk[symbol] = i
    i += 1
geneinfo = geneinfo.reset_index()

In [None]:
# Distinct (symbol, gene_id) pairs from the edge list, each with its gene
# foreign key looked up by upper-cased symbol (KeyError if absent).
genes = (
    edgelist.get(['symbol', 'gene_id'])
    .drop_duplicates()
    .reset_index(drop=True)
)
genes['gene_fk'] = [genefk[symbol.upper()] for symbol in genes['symbol']]
genes

### Associations to add

In [None]:
# id, gene_fk, gene_set_fk, cleaned_value, standardized_value, threshold_value

# One association row per edge; ids are the edge-list index shifted by 10000000.
associations = edgelist.copy()
associations['gene_fk'] = [genefk[symbol.upper()] for symbol in associations['symbol']]
associations['gene_set_fk'] = [genesetfk[model_name] for model_name in associations['model_name']]
associations = associations.get(
    ['gene_fk', 'gene_set_fk', 'protein_intensity', 'zscore', 'threshold'])
associations.insert(0, 'id', associations.index + 10000000)
associations = associations.reset_index(drop=True)
associations.to_csv('harmonizome-update/sanger.csv')
associations

## Downloads

In [None]:
# Directory prefix (ending in '/') that the cells below prepend to every
# download file name.
output_path = 'newdata/SangerProteomics/Downloads/'

### Gene List

In [None]:
# Gene list download: symbol and Entrez id only, gzip-compressed TSV.
genes = genes.get(['symbol', 'gene_id'])
genes.to_csv(
    f'{output_path}gene_list_terms.txt.gz',
    sep='\t',
    compression='gzip',
)
genes

### Attribute List

In [None]:
# Attribute list download: cell line name and model id, gzip-compressed TSV.
attributes = attributes.get(['model_name', 'model_id'])
attributes.to_csv(
    f'{output_path}attribute_list_entries.txt.gz',
    sep='\t',
    compression='gzip',
)
attributes

### Gene-Attribute Edge List

In [None]:
# Edge list download: columns reordered attribute-first, gzip-compressed TSV.
edgelist = edgelist.get(
    ['model_name', 'model_id', 'symbol', 'gene_id',
     'protein_intensity', 'zscore', 'threshold'])
edgelist.to_csv(
    f'{output_path}gene_attribute_edges.txt.gz',
    sep='\t',
    compression='gzip',
)
edgelist

### Gene-Attribute Ternary Matrix

In [None]:
# Gene x cell-line matrix of threshold values; the maximum is taken when a
# pair occurs more than once, and pairs without an edge become 0.
ternary_matrix = (
    pd.crosstab(
        index=edgelist['symbol'],
        columns=edgelist['model_name'],
        values=edgelist['threshold'].values,
        aggfunc=np.max,
    )
    .fillna(0)
    .astype(int)
    .rename_axis(index='Gene Symbol', columns='Cell Line')
)

# Transposed copy (cell lines as rows) used by the attribute-side outputs.
ternary_matrix_T = ternary_matrix.T

ternary_matrix.to_csv(f'{output_path}gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
ternary_matrix

### Gene-Gene Similarity Matrix

In [None]:
# Cosine similarity between gene rows: 1 minus the square-form pairwise
# cosine distance.
similarity_matrix = 1 - dist.squareform(
    dist.pdist(ternary_matrix.to_numpy(dtype=np.int_), 'cosine'))

gene_similarity_matrix = pd.DataFrame(
    similarity_matrix,
    index=ternary_matrix.index,
    columns=ternary_matrix.index,
)
# NOTE(review): the axis object may be shared with ternary_matrix, in which
# case clearing the name here clears it there too -- confirm if that matters.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None

gene_similarity_matrix.to_csv(
    f'{output_path}gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute-Attribute Similarity Matrix

In [None]:
# Cosine similarity between cell-line rows: 1 minus the square-form pairwise
# cosine distance.
similarity_matrix = 1 - dist.squareform(
    dist.pdist(ternary_matrix_T.to_numpy(dtype=np.int_), 'cosine'))

attribute_similarity_matrix = pd.DataFrame(
    similarity_matrix,
    index=ternary_matrix_T.index,
    columns=ternary_matrix_T.index,
)
# NOTE(review): the axis object may be shared with ternary_matrix_T, in which
# case clearing the name here clears it there too -- confirm if that matters.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None

attribute_similarity_matrix.to_csv(
    f'{output_path}attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Up Gene Set Library

In [None]:
# One tab-separated line per cell line: its name followed by every gene
# whose ternary value is 1.
arr = ternary_matrix.to_numpy(dtype=np.int_)

with open(output_path + 'gene_set_library_up_crisp.gmt', 'w') as f:
    for set_name, column in zip(ternary_matrix.columns, arr.T):
        members = ternary_matrix.index[column == 1]
        print(set_name, *members, sep='\t', file=f)

### Down Gene Set Library

In [None]:
# One tab-separated line per cell line: its name followed by every gene
# whose ternary value is -1.
arr = ternary_matrix.to_numpy(dtype=np.int_)

with open(output_path + 'gene_set_library_dn_crisp.gmt', 'w') as f:
    for set_name, column in zip(ternary_matrix.columns, arr.T):
        members = ternary_matrix.index[column == -1]
        print(set_name, *members, sep='\t', file=f)

### Up Attribute Set Library

In [None]:
# One tab-separated line per gene: its symbol followed by every cell line
# whose ternary value is 1.
arr = ternary_matrix_T.to_numpy(dtype=np.int_)

with open(output_path + 'attribute_set_library_up_crisp.gmt', 'w') as f:
    for set_name, column in zip(ternary_matrix_T.columns, arr.T):
        members = ternary_matrix_T.index[column == 1]
        print(set_name, *members, sep='\t', file=f)

### Down Attribute Set Library

In [None]:
# One tab-separated line per gene: its symbol followed by every cell line
# whose ternary value is -1.
arr = ternary_matrix_T.to_numpy(dtype=np.int_)

with open(output_path + 'attribute_set_library_dn_crisp.gmt', 'w') as f:
    for set_name, column in zip(ternary_matrix_T.columns, arr.T):
        members = ternary_matrix_T.index[column == -1]
        print(set_name, *members, sep='\t', file=f)

### Gene-Attribute Cleaned Matrix

In [None]:
# Gene x cell-line matrix of protein intensities; the maximum is taken when a
# pair occurs more than once, and pairs without an edge become 0.
cleaned_matrix = pd.crosstab(index=edgelist['symbol'],
                columns=edgelist['model_name'],
                values=edgelist['protein_intensity'].values,
                aggfunc=np.max).fillna(0)

# Columns are cell lines (model_name), so label the axis like the ternary
# matrix does rather than 'Transcription Factor'.
cleaned_matrix = cleaned_matrix.rename_axis('Gene Symbol', axis='index').rename_axis('Cell Line',  axis='columns')

cleaned_matrix.to_csv(output_path+'gene_attribute_matrix_cleaned.txt.gz', sep='\t', compression='gzip')
cleaned_matrix

### Gene-Attribute Standardized Matrix

In [None]:
# Gene x cell-line matrix of z-scores; the maximum is taken when a pair
# occurs more than once, and pairs without an edge become 0.
standardized_matrix = pd.crosstab(index=edgelist['symbol'],
                columns=edgelist['model_name'],
                values=edgelist['zscore'].values,
                aggfunc=np.max).fillna(0)

# Columns are cell lines (model_name), so label the axis like the ternary
# matrix does rather than 'Transcription Factor'.
standardized_matrix = standardized_matrix.rename_axis('Gene Symbol', axis='index').rename_axis('Cell Line',  axis='columns')

standardized_matrix.to_csv(output_path+'gene_attribute_matrix_standardized.txt.gz', sep='\t', compression='gzip')
standardized_matrix

### Knowledge Graph Serializations

In [None]:
# Build the node and edge records shared by the RDF, JSON and TSV
# serializations below. Columns are iterated directly instead of calling
# .loc once per row, which avoids a label lookup for every edge, yields
# plain Python scalars for numeric columns, and no longer shadows the
# builtin id().
nodes = {}
edges = []

# Gene nodes, keyed by the stringified gene id.
for symbol, gene_id in zip(genes['symbol'], genes['gene_id']):
    node_id = str(gene_id)
    nodes[node_id] = {
        "type": "gene",
        "properties": {
            "id": node_id,
            "label": symbol
        }
    }

# Cell line nodes, keyed by model id.
for model_name, model_id in zip(attributes['model_name'], attributes['model_id']):
    nodes[model_id] = {
        "type": "cell line",
        "properties": {
            "id": model_id,
            "label": model_name
        }
    }

# One directed edge per edge-list row, from cell line to gene.
edge_columns = ['model_id', 'model_name', 'gene_id', 'symbol',
                'protein_intensity', 'zscore', 'threshold']
for source, source_label, gene_id, target_label, intensity, zscore, sign in zip(
        *(edgelist[column] for column in edge_columns)):
    target = str(gene_id)
    edges.append({
        "source": source,
        "relation": "directly regulates activity of",
        "target": target,
        "properties": {
            "id": source+":"+target,
            "source_label": source_label,
            "target_label": target_label,
            "directed": True,
            "protein_intensity": intensity,
            "zscore": zscore,
            "threshold": sign
        }
    })

RDF

In [None]:
# Write one triple per edge, preceded by the prefix declarations.
with open(output_path+'serializations/sangerproteomics.rdf', 'w') as f:
    print('@prefix sanger: <https://cellmodelpassports.sanger.ac.uk/passports?q=> .', file=f)
    print('@prefix regulates: <http://purl.obolibrary.org/obo/RO_0002448> .', file=f)
    print('@prefix gene: <https://ncbi.nlm.nih.gov/gene/> .', file=f)
    print('', file=f)
    for edge in edges:
        # The predicate must be written as a prefixed name ('regulates:') so
        # it resolves through the prefix declared above; a bare word is not a
        # valid term.
        print('sanger:'+edge['source'], 'regulates:', 'gene:'+str(edge['target']), end=' .\n', file=f)

JSON

In [None]:
# Dump the node and edge records as one indented JSON document.
# json.dump writes to the file and returns None, so its result is not bound.
with open(output_path+'serializations/sangerproteomics.json', 'w') as f:
    json.dump(
        {
            "Version":"1", 
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

TSV

In [None]:
# Flatten the node records into a type / id / label table and write it as TSV.
nodeframe = pd.DataFrame(nodes).T
nodeframe['id'] = [properties['id'] for properties in nodeframe['properties']]
nodeframe['label'] = [properties['label'] for properties in nodeframe['properties']]
nodeframe = nodeframe.drop(columns=['properties']).reset_index(drop=True)
nodeframe.to_csv(f'{output_path}serializations/tsv/nodes.tsv', sep='\t')

In [None]:
# Flatten the edge records: every key of the first edge's properties dict
# becomes a column, then the output columns are selected and written as TSV.
edgeframe = pd.DataFrame(edges)

for key in edgeframe.loc[0, 'properties']:
    edgeframe[key] = [properties[key] for properties in edgeframe['properties']]

edgeframe = edgeframe.get(
    ['source', 'source_label', 'relation', 'target', 'target_label',
     'protein_intensity', 'zscore', 'threshold'])
edgeframe.to_csv(f'{output_path}serializations/tsv/edges.tsv', sep='\t')

## Visualizations

In [None]:
# Clustered heatmap of the gene x cell-line ternary matrix, with the
# 'seismic' colormap centered on 0.
seaborn.clustermap(ternary_matrix, cmap='seismic', center=0)

In [None]:
# Clustered heatmap of the gene-gene cosine similarity matrix, with the
# 'seismic' colormap centered on 0.
seaborn.clustermap(gene_similarity_matrix,cmap='seismic',center=0)

In [None]:
# Clustered heatmap of the cell line-cell line cosine similarity matrix,
# with the 'seismic' colormap centered on 0.
seaborn.clustermap(attribute_similarity_matrix, cmap='seismic', center=0)

### UMAP

In [None]:
# Configure Bokeh so that show() renders plots inline in the notebook.
output_notebook()

In [None]:
# Build one space-joined gene string per cell line and direction, keyed
# '<model_name>_up' / '<model_name>_dn'. Up sets list genes by descending
# z-score, down sets by ascending z-score.
# Grouping on the index yields a Series for every set, so a set holding a
# single gene works too (a plain .loc lookup would return a bare string).
genesets = {}

genesetsup = (
    edgelist[edgelist['threshold'] == 1]
    .sort_values(['model_name', 'zscore'], ascending=[True, False])
    .set_index('model_name')['symbol']
)
genesetsup.index += '_up'
for geneset, symbols in genesetsup.groupby(level=0, sort=False):
    genesets[geneset] = ' '.join(symbols)

genesetsdn = (
    edgelist[edgelist['threshold'] == -1]
    .sort_values(['model_name', 'zscore'])
    .set_index('model_name')['symbol']
)
genesetsdn.index += '_dn'
for geneset, symbols in genesetsdn.groupby(level=0, sort=False):
    genesets[geneset] = ' '.join(symbols)

In [None]:
# Embed the gene sets with TF-IDF + UMAP, cluster them with Leiden, and draw
# an interactive Bokeh scatter plot coloured by cluster.
vec = TfidfVectorizer(max_df=0.5, min_df=10)
X = vec.fit_transform(genesets.values())
adata = anndata.AnnData(X, dtype='float32')
adata.obs.index = genesets.keys()

sc.pp.neighbors(adata, n_neighbors=25, use_rep='X')
sc.tl.leiden(adata)
sc.tl.umap(adata, min_dist=0.001, spread=10)

# Order observations by cluster label and prefix the labels for display.
new_order = adata.obs.sort_values(by='leiden').index.tolist()
adata = adata[new_order,:]
adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

mapped_df = pd.DataFrame(adata.obsm['X_umap'])
mapped_df.columns = ['x', 'y']

mapped_df['cluster'] = adata.obs['leiden'].values
mapped_df['term'] = adata.obs.index

# Interleave the Category20 palette (even entries, then odd entries); colours
# repeat after 20 clusters.
clusters = pd.unique(mapped_df['cluster']).tolist()
colors = list(Category20[20])[::2] + list(Category20[20])[1::2]
color_mapper = {clusters[i]:colors[i%20] for i in range(len(clusters))}

mapped_df['color'] = mapped_df['cluster'].apply(lambda x: color_mapper[x])

xlabel = 'UMAP 1'
ylabel = 'UMAP 2'

source2 = ColumnDataSource(
        data=dict(
            x = mapped_df.x,
            y = mapped_df.y,
            alpha = [0.7] * mapped_df.shape[0],
            colors = mapped_df['color'], 
            size = [6] * mapped_df.shape[0],
            gene_set = mapped_df['term'],
            cluster = mapped_df['cluster']
        )
    )

# Every <div> opened in the tooltip template is closed again.
hover_emb = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
            <span style="font-size: 12px">@gene_set</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
            <span style="font-size: 12px">(@x,@y)</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
            <span style="font-size: 12px">@cluster</span>
        </div>
    </div>
    """)

tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']
# The title names the dataset this notebook actually plots.
title_emb = 'Gene Sets in Sanger DepMap Cancer Cell Line Proteomics Library'
plot_emb = figure(plot_width=1000, plot_height=700, tools=tools_emb, title=title_emb, x_axis_label=xlabel, y_axis_label=ylabel)
plot_emb.circle( 'x', 'y', source = source2, size='size',
                alpha='alpha', line_alpha=0, line_width=0.01, name="df", 
                fill_color = 'colors', 
                line_color="black", hover_fill_color="firebrick")
plot_emb.xaxis.axis_label_text_font_style = 'normal'
plot_emb.xaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_style = 'normal'
plot_emb.title.align = 'center'
plot_emb.title.text_font_size = '18px'

show(plot_emb)