# CCLE Proteomics
The Cancer Cell Line Encyclopedia (CCLE) is a resource cataloging roughly 1000 human cancer cell lines. The CCLE Proteomics dataset quantifies protein expression across 375 cell lines using protein intensity values measured by mass spectrometry. These expression values were normalized (quantile normalization followed by gene-wise z-scoring), and protein-cell line associations with a z-score of at least 2 were kept. This yielded 122408 associations between 375 cell lines and 8959 genes.

In [None]:
import pandas as pd
import datetime
import numpy as np
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm
from collections import OrderedDict

from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

## Load and Pre-Process Data
The raw CCLE Proteomics data was downloaded from the Gygi Lab's [website](https://gygi.hms.harvard.edu/publications/ccle.html).

In [None]:
# Read the CCLE proteomics table from disk.
ccleprot = pd.read_csv('newdata/CCLEProteomics/summed_sn_non_normalized_v1.1.csv')

# Remove every column whose name contains one of these substrings.
unwanted_tags = ('Description', 'Group', 'Protein.', 'bridge', 'Peptides')
unwanted_columns = [
    name for name in ccleprot.columns
    if any(tag in name for tag in unwanted_tags)
]
ccleprot = ccleprot.drop(columns=unwanted_columns)
ccleprot

In [None]:
# Lookup built from the header-less mapping file: column 1 -> column 2.
mapping_table = pd.read_csv('mapping/mappingFile_2023.tsv', sep='\t', header=None)
symbolmap = mapping_table.set_index(1)[2].to_dict()

# Lookup from approved gene symbol to Entrez Gene ID.
approved_symbols = pd.read_csv('mapping/GeneSymbolsAndIDS_2023.tsv', sep='\t')
approved_symbols = approved_symbols.set_index('Human, Mouse, and Rat Approved Symbol')
geneids = approved_symbols['Entrez Gene ID(supplied by NCBI)'].to_dict()

In [None]:
# Keep only rows whose symbol has an entry in symbolmap.  The explicit copy
# makes the filtered frame independent, so the column assignment below does
# not write into a slice of the previous frame (SettingWithCopyWarning).
has_mapping = [symbol in symbolmap for symbol in ccleprot['Gene.Symbol']]
ccleprot = ccleprot.loc[has_mapping].copy()
ccleprot['Gene.Symbol'] = [symbolmap[symbol] for symbol in ccleprot['Gene.Symbol']]

# Index by mapped symbol and drop rows with fewer than 179 non-missing values.
ccleprot = ccleprot.set_index('Gene.Symbol').dropna(thresh=179)

# Cut each column name at '_Ten', then average columns that now share a name.
# Grouping the transposed frame replaces groupby(axis=1), which is deprecated
# in recent pandas releases; the result is transposed back afterwards.
ccleprot = ccleprot.rename(lambda x: x.split('_Ten')[0], axis=1)
ccleprot = ccleprot.T.groupby(level=0).mean().T
ccleprot

### Impute Missing Values

In [None]:
# Replace each missing value with the mean of its own row.
# DataFrame.fillna with a Series matches the Series labels against COLUMN
# names, so passing the row means (labelled by row) straight to fillna leaves
# the gaps unfilled.  The row means are therefore broadcast across the columns
# positionally with numpy instead.
values = ccleprot.to_numpy(dtype=float)
row_means = ccleprot.mean(axis=1).to_numpy().reshape(-1, 1)
ccleprot = pd.DataFrame(
    np.where(np.isnan(values), row_means, values),
    index=ccleprot.index,
    columns=ccleprot.columns,
)
ccleprot

### Quantile Normalization

In [None]:
df = ccleprot.copy()

# Temporarily replace the column labels with positions 0..n-1; the original
# labels are restored at the end.
attributes = df.columns.values.tolist()
df.columns = np.arange(0, len(attributes))

# Reference distribution: sort every column independently, then average the
# sorted columns position by position.
sorted_df = pd.DataFrame({col: sorted(df[col]) for col in tqdm(df.columns)})
rank = sorted_df.mean(axis = 1).tolist()

# Replace each value by the reference value at its position in the sorted
# column (searchsorted's default left side is used, so tied values share the
# reference value of the first tied position).
for col in tqdm(df.columns):
    positions = np.searchsorted(np.sort(df[col]), df[col])
    df[col] = [rank[pos] for pos in positions]

df.columns = attributes
ccleprot = df
ccleprot

### Gene-Wise Z-Score Normalization

In [None]:
def zscore(gene):
    """Standardize a Series to zero mean and unit standard deviation.

    Parameters
    ----------
    gene : pandas.Series
        Numeric values to standardize (one row when used with
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    pandas.Series
        ``(gene - gene.mean()) / gene.std()`` with the same index as ``gene``.
        ``Series.std()`` is called with its defaults.  When the standard
        deviation is zero the division is by zero and the result is not
        finite.
    """
    # Vectorized arithmetic instead of a Python-level lambda per element.
    return (gene - gene.mean()) / gene.std()

In [None]:
# Standardize every row with zscore, then name the row axis 'Gene' and the
# column axis 'Cell Line'.
standardized = ccleprot.apply(zscore, axis=1)
ccleprot = standardized.rename_axis(index='Gene', columns='Cell Line')

# NOTE(review): to_csv is called with its default separator here, while the
# other downloads in this notebook pass sep='\t' — confirm this is intended.
ccleprot.to_csv('newdata/CCLEProteomics/downloads/gene_attribute_matrix_standardized.txt.gz')
ccleprot

In [None]:
# Reshape the matrix into one row per (Gene, Cell Line) pair, keep pairs with
# a z-score of at least 2, and order them from highest to lowest z-score.
ccleprot = (
    ccleprot.stack()
    .to_frame('Z-score')
    .loc[lambda frame: frame['Z-score'] >= 2]
    .sort_values('Z-score', ascending=False)
    .reset_index()
)
ccleprot

## Process Data for SQL Ingestion

### Dataset

In [None]:
# Row for the dataset table; the comment below lists the fields in order.
# The gene_set_description text now names the 'CCLE' dataset (it previously
# read 'CLE'), matching the other description fields in this row.
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, is_signed, is_continuous_valued, last_updated, directory, num_page views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural)
(141, 'Cancer Cell Line Encyclopedia Cell Line Proteomics', 'Cell Line Proteomics',  'Protein intensity values acquired using mass spectrometry across human cancerous cell lines', 'gene-cell line associations by differential expression of genes across cell lines', 'proteins associated with cell line {0} from the CCLE Cell Line Proteomics dataset.', 'sets of proteins associated with cell lines from the CCLE Cell Line Proteomics dataset.', 'Cell lines associated with {0} protein from the CCLE Cell Line Proteomics dataset.', 0, 1, '2023-06-26', 'ccleproteomics', 0, 7, 25, 5, 1, 1,  'protein expression by mass spectrometry','curated experimental data','high throughput, data driven','cell lines')

### Publication

In [None]:
# Row for the publication table; the comment below lists the fields in order.
# The long citation spelling ('Quantitative', 'Encyclopedia') now matches the
# title field of the same row.
#(id, long_citation, short_citation, url, pmid, pubmed_url, first_author_last_name, first_author_initials, journal_abbreviateion, year, title, volume, pages)
(143, 'Nusinow, PD et al. (2020) Quantitative Proteomics of the Cancer Cell Line Encyclopedia. Cell. 180:387-402', 'Nusinow, Cell, 2020', 'dx.doi.org/10.1016/j.cell.2019.12.023', 31978347, 'https://pubmed.ncbi.nlm.nih.gov/31978347/', 'Nusinow', 'PD', 'Cell', 2020, 'Quantitative Proteomics of the Cancer Cell Line Encyclopedia', 180, '387-402')

# NOTE(review): presumably a link row (id, dataset id 141, publication id 143) — confirm.
(223, 141, 143)

### Gene

In [None]:
# Genes already present in production and their Entrez IDs.
genes = pd.read_csv('production/gene.csv')
genelist = genes['ncbi_entrez_gene_id'].to_list()

# Approved symbol -> Entrez Gene ID, keeping the first row per symbol.
symbol_col = 'Human, Mouse, and Rat Approved Symbol'
entrez_col = 'Entrez Gene ID(supplied by NCBI)'
symbol_table = pd.read_csv('tables/GeneSymbolsAndIDS_2023.tsv', sep='\t')
symbol_table = symbol_table.drop_duplicates(symbol_col).set_index(symbol_col)
geneids = symbol_table[entrez_col].to_dict()

# Genes from the separate new-genes table; their symbols extend/override geneids.
newgenes = pd.read_csv('tables/newgenes.csv', index_col=0)
newgenelist = newgenes['ncbi_entrez_gene_id'].to_list()
geneids.update(newgenes.set_index('symbol')['ncbi_entrez_gene_id'].to_dict())

# Entrez Gene ID -> description text.
gene_info = pd.read_csv('tables/gene_info', sep='\t').get(['GeneID', 'description'])
genedescs = gene_info.set_index('GeneID')['description'].to_dict()

# Entrez Gene ID -> 'id' column value, from both gene tables.
genefks = genes.set_index('ncbi_entrez_gene_id')['id'].to_dict()
genefks.update(newgenes.reset_index().set_index('ncbi_entrez_gene_id')['id'].to_dict())

In [None]:
# Print one row tuple for every gene in the dataset whose Entrez ID is in
# neither gene table, and record the key assigned to it in genefks.
index = 57238
geneurl = 'https://ncbi.nlm.nih.gov/gene/'

# A set gives constant-time membership tests instead of scanning both lists
# for every gene.
known_gene_ids = set(genelist) | set(newgenelist)

for gene in ccleprot['Gene'].unique():
    gene_id = geneids[gene]  # named gene_id so the builtin id() is not shadowed
    if gene_id not in known_gene_ids:
        print((index, gene, gene_id, genedescs[gene_id], geneurl+str(gene_id)), end=',\n')
        genefks[gene_id] = index
        index += 1

### Attribute

In [None]:
# Display the contents of Model.csv for inspection (the result is not stored).
pd.read_csv('newdata/CCLEProteomics/Model.csv')

In [None]:
# CCLEName -> OncotreeSubtype, using only rows where both values are present.
celllines = (
    pd.read_csv('newdata/CCLEProteomics/Model.csv')
    .get(['CCLEName', 'OncotreeSubtype'])
    .dropna()
    .set_index('CCLEName')['OncotreeSubtype']
    .to_dict()
)

# Print one row tuple per cell line and remember the key assigned to it.
attributefks = {}
index = 360000
for cellline in ccleprot['Cell Line'].unique():
    row = (index, cellline, celllines[cellline]+' cell line', 33)
    print(row, end=',\n')
    attributefks[cellline] = index
    index += 1

### Gene Set

In [None]:
# Print one row tuple per cell line gene set and remember the key assigned to it.
url = 'https://depmap.org/portal/cell_line/'
genesetfks = {}
index = 134200000

for cellline in ccleprot['Cell Line'].unique():
    row = (index, cellline, celllines[cellline]+' cell line', url+cellline, attributefks[cellline])
    print(row, end=',\n')
    genesetfks[cellline] = index
    index += 1

### Association

In [None]:
# Build the association table: resolve each gene and cell line to the keys
# collected above, carry the z-score over, and offset the row index.
associations = ccleprot.copy()
associations['gene_fk'] = [genefks[geneids[symbol]] for symbol in associations['Gene']]
associations['gene_set_fk'] = [genesetfks[name] for name in associations['Cell Line']]
associations['standardized_value'] = associations['Z-score']
associations['threshold_value'] = 1

kept_columns = ['gene_fk', 'gene_set_fk', 'standardized_value', 'threshold_value']
associations = associations.get(kept_columns)
associations.index = associations.index + 21000000
associations.to_csv('harmonizome-update/ccleproteomics.csv')
associations

## Create Downloads

In [None]:
# Directory prefix for every download file written below.
output_path = 'newdata/CCLEProteomics/downloads/'

# Attach the Entrez Gene ID of each gene symbol.
ccleprot['Gene ID'] = [geneids[symbol] for symbol in ccleprot['Gene']]

### Gene-Attribute Binary Matrix

In [None]:
# Count (Gene, Cell Line) pairs into a gene-by-cell-line table and keep its
# transpose for the attribute-centric outputs.
# NOTE(review): crosstab counts occurrences, so a pair that appears more than
# once would yield a value above 1 — confirm pairs are unique.
binarymatrix = pd.crosstab(index=ccleprot['Gene'], columns=ccleprot['Cell Line'])
binarymatrixT = binarymatrix.transpose()
binarymatrix.to_csv(f'{output_path}gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
binarymatrix

### Gene-Attribute Edge List

In [None]:
# Edge list: one row per kept (Gene, Cell Line) pair with its z-score.
# The explicit copy makes edgelist independent of ccleprot, so adding a column
# does not trigger pandas' SettingWithCopyWarning.
edgelist = ccleprot.get(['Gene', 'Gene ID', 'Cell Line', 'Z-score']).copy()

# The cell line name doubles as its ID.
edgelist['Cell Line ID'] = edgelist['Cell Line']
edgelist = edgelist.get(['Gene', 'Gene ID', 'Cell Line', 'Cell Line ID', 'Z-score'])
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Unique (Gene, Gene ID) pairs, renumbered from zero.
gene_columns = ['Gene', 'Gene ID']
geneslist = ccleprot.get(gene_columns).drop_duplicates()
geneslist = geneslist.reset_index(drop=True)
geneslist.to_csv(f'{output_path}gene_list_terms.txt.gz', sep='\t', compression='gzip')
geneslist

### Attribute List

In [None]:
# Unique cell lines, renumbered from zero; the name is reused as the ID.
attributeslist = ccleprot.get(['Cell Line']).drop_duplicates()
attributeslist = attributeslist.reset_index(drop=True)
attributeslist['Cell Line ID'] = attributeslist['Cell Line']
attributeslist.to_csv(f'{output_path}attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributeslist

### Gene Set Library

In [None]:
# Write one tab-separated line per cell line: the cell line name followed by
# the genes whose matrix entry equals 1.  Sets with fewer than 5 genes are
# skipped.
with open(output_path+'gene_set_library_crisp.gmt', 'w') as f:
    arr = binarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)
    attributes = binarymatrix.columns

    n_genes, n_sets = arr.shape
    for i in tqdm(range(n_sets)):
        # Select the members once and reuse them for the size check and the output.
        members = binarymatrix.index[arr[:, i] == 1]
        if len(members) >= 5:
            print(attributes[i], *members, sep='\t', end='\n', file=f)

### Attribute Set Library

In [None]:
# Write one tab-separated line per gene: the gene symbol followed by the cell
# lines whose matrix entry equals 1.  Sets with fewer than 5 cell lines are
# skipped.
with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    arr = binarymatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)
    genes = binarymatrixT.columns

    n_attributes, n_sets = arr.shape
    for i in tqdm(range(n_sets)):
        # Select the members once and reuse them for the size check and the output.
        members = binarymatrixT.index[arr[:, i] == 1]
        if len(members) >= 5:
            print(genes[i], *members, sep='\t', end='\n', file=f)

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between gene rows: pdist returns condensed cosine
# distances, squareform expands them to a square matrix, and 1 - distance
# converts them to similarities.
gene_vectors = binarymatrix.to_numpy(dtype=np.int_)
gene_distances = dist.squareform(dist.pdist(gene_vectors, 'cosine'))
gene_similarity_matrix = 1 - gene_distances

gene_labels = binarymatrix.index
gene_similarity_matrix = pd.DataFrame(data=gene_similarity_matrix, index=gene_labels, columns=gene_labels)
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(f'{output_path}gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between cell line rows of the transposed matrix:
# condensed cosine distances -> square matrix -> 1 - distance.
attribute_vectors = binarymatrixT.to_numpy(dtype=np.int_)
attribute_distances = dist.squareform(dist.pdist(attribute_vectors, 'cosine'))
attribute_similarity_matrix = 1 - attribute_distances

attribute_labels = binarymatrixT.index
attribute_similarity_matrix = pd.DataFrame(data=attribute_similarity_matrix, index=attribute_labels, columns=attribute_labels)
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(f'{output_path}attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Knowledge Graph Serialization

In [None]:
# Build the knowledge-graph structures shared by the serializations below.
# Columns are iterated in lockstep with zip instead of fetching every row with
# .loc, which avoids a label lookup per row and the rebinding of the loop
# variable to a row object.
nodes = {}
edges = []

# One node per gene, keyed by its integer Entrez Gene ID.
for gene_symbol, gene_id in zip(geneslist['Gene'], geneslist['Gene ID']):
    nodes[int(gene_id)] = {
        "type":"gene",
        "properties": {
            "id":int(gene_id),
            "label":gene_symbol
        }}

# One node per cell line, keyed by the cell line name.
for cellline_name, cellline_id in zip(attributeslist['Cell Line'], attributeslist['Cell Line ID']):
    nodes[cellline_name] = {
        "type":"cell line",
        "properties": {
            "label":cellline_name,
            "id":cellline_id
        }}

# One directed edge per (gene, cell line) pair in the edge list.
edge_columns = zip(
    edgelist['Gene'],
    edgelist['Gene ID'],
    edgelist['Cell Line'],
    edgelist['Cell Line ID'],
    edgelist['Z-score'],
)
for gene_symbol, gene_id, cellline_name, cellline_id, z_value in edge_columns:
    edges.append({
        "source": int(gene_id),
        "relation": "over-expressed in",
        "target": cellline_name,
        "properties":{
            "id":str(gene_id)+":"+cellline_id,
            "source_id":int(gene_id),
            "source_label":gene_symbol,
            "target_id":cellline_id,
            "target_label":cellline_name,
            "directed":True,
            "z-score":z_value,
            "threshold":1
        }})

#### RDF

In [None]:
# Write three prefix lines, a blank line, and then one
# "gene:<id> RO:0002245 CCLE:<cell line> ." statement per edge.
# NOTE(review): the prefix lines are written without angle-bracketed IRIs or a
# terminating '.', unlike Turtle's @prefix syntax — confirm the intended
# consumer accepts this form.
with open(output_path+'kg_serializations/ccleproteomics.rdf', 'w') as f:
    header_lines = (
        '@prefix gene: ncbi.nlm.nih.gov/gene/',
        '@prefix RO: purl.obolibrary.org/RO_',
        '@prefix CCLE: https://depmap.org/portal/cell_line/',
        '',
    )
    for header_line in header_lines:
        print(header_line, file=f)
    for edge in edges:
        props = edge['properties']
        subject = 'gene:'+str(props['source_id'])
        target = 'CCLE:'+props['target_id']
        f.write(subject + ' RO:0002245 ' + target + ' .\n')

#### JSON

In [None]:
# Serialize the nodes and edges to an indented JSON file.
with open(output_path+'kg_serializations/ccleproteomics.json', 'w') as f:
    # json.dump writes straight to the file and returns None, so its result
    # is not bound to a name.
    json.dump(
        {
            "Version":"1", 
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

#### TSV

In [None]:
# Flatten the node dictionary into a (namespace, id, label) table.
nodeframe = pd.DataFrame(nodes).T
nodeframe['id'] = [props['id'] for props in nodeframe['properties']]
nodeframe['label'] = [props['label'] for props in nodeframe['properties']]

namespace_by_type = {'gene':'NCBI Entrez', 'cell line':'CCLE'}
nodeframe['namespace'] = [namespace_by_type[kind] for kind in nodeframe['type']]

nodeframe = nodeframe.get(['namespace', 'id', 'label'])
nodeframe = nodeframe.reset_index(drop=True)
nodeframe.to_csv(f'{output_path}kg_serializations/ccleproteomics_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Flatten the edge list into a (source, relation, target, z-score, threshold) table.
edgeframe = pd.DataFrame(edges)
edgeframe['z-score'] = [props['z-score'] for props in edgeframe['properties']]
edgeframe['threshold'] = [props['threshold'] for props in edgeframe['properties']]

edge_columns = ['source', 'relation', 'target', 'z-score', 'threshold']
edgeframe = edgeframe.get(edge_columns)
edgeframe.to_csv(f'{output_path}kg_serializations/ccleproteomics_tsv/edges.tsv', sep='\t')
edgeframe

## Create Visualizations

### Gene-Attribute Clustered Heatmap

In [None]:
# Clustered heatmap of the gene-by-cell-line matrix, colour scale centred at 0.
sns.clustermap(binarymatrix, cmap='seismic', center=0)

### Gene-Gene Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the gene-gene cosine similarity matrix, colour scale centred at 0.
sns.clustermap(gene_similarity_matrix, cmap='seismic', center=0)

### Attribute-Attribute Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the cell line-cell line cosine similarity matrix, colour scale centred at 0.
sns.clustermap(attribute_similarity_matrix, cmap='seismic', center=0)

### UMAP

In [None]:
def load_gmt(file):
    """Parse tab-separated gene set lines into an ordered term -> genes mapping.

    Each non-blank line is split on tabs: the first field is the term and the
    remaining fields are its genes.

    Parameters
    ----------
    file : iterable of str
        Lines to parse, e.g. an open text file.

    Returns
    -------
    collections.OrderedDict
        Maps each term to a single space-separated string of its unique genes,
        in order of first appearance.  Terms keep the order in which they were
        read; a repeated term keeps its last occurrence.
    """
    gmt = OrderedDict()
    for line in file:
        term, *geneset = line.strip().split('\t')
        if not term:
            # Blank line: nothing to record.
            continue
        # dict.fromkeys removes duplicates while keeping a deterministic order
        # (a set would order the genes differently from run to run); empty
        # fields are dropped.
        unique_genes = dict.fromkeys(gene for gene in geneset if gene)
        gmt[term] = ' '.join(unique_genes)
    return gmt

In [None]:
# Read the gene set library written earlier; the with-block closes the file
# handle once parsing is done.
with open(output_path+'gene_set_library_crisp.gmt', 'r') as gmt_file:
    libdict = load_gmt(gmt_file)

# Directory prefix for the saved plot.
scatterdir = 'newdata/CCLEProteomics/images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed and cluster gene sets, returning one row per gene set.

    The gene strings in ``libdict`` are TF-IDF vectorized, a neighbor graph is
    built on the vectors, Leiden clusters are assigned and a UMAP layout is
    computed.

    Parameters
    ----------
    libdict : mapping
        Term -> space-separated gene string.
    nneighbors : int
        Passed to ``sc.pp.neighbors`` as ``n_neighbors``.
    mindist, spread : float
        Passed to ``sc.tl.umap`` as ``min_dist`` and ``spread``.
    maxdf, mindf
        Passed to ``TfidfVectorizer`` as ``max_df`` and ``min_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``x``, ``y`` (UMAP coordinates), ``cluster`` ('Cluster <n>'),
        ``term`` and ``genes``, with rows ordered by Leiden cluster.
    """
    print("\tTF-IDF vectorizing gene set data...")
    vectorizer = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    tfidf = vectorizer.fit_transform(libdict.values())
    print(tfidf.shape)

    adata = anndata.AnnData(tfidf)
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    # n_neighbors and min_dist are exposed as function parameters for tuning.
    sc.pp.neighbors(adata, n_neighbors=nneighbors, use_rep='X')
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder the observations by cluster before extracting coordinates.
    ordered_terms = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[ordered_terms, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    embedding = pd.DataFrame(adata.obsm['X_umap'])
    embedding.columns = ['x', 'y']
    embedding['cluster'] = adata.obs['leiden'].values
    embedding['term'] = adata.obs.index
    embedding['genes'] = [libdict[term] for term in embedding['term']]

    return embedding

In [None]:
def get_scatter_colors(df):
    """Map each distinct value of ``df['cluster']`` to a Category20 colour.

    The 20 palette entries are reordered so the even-position entries come
    first, followed by the odd-position entries; clusters beyond the 20th
    wrap around and reuse colours.

    Returns
    -------
    dict
        Cluster label -> colour, with clusters in order of first appearance.
    """
    palette = list(Category20[20])
    colors = palette[::2] + palette[1::2]
    clusters = pd.unique(df['cluster']).tolist()
    return {cluster: colors[position % 20] for position, cluster in enumerate(clusters)}

def get_scatterplot(scatterdf):
    """Build a Bokeh scatter plot of the gene set embedding.

    Parameters
    ----------
    scatterdf : pandas.DataFrame
        Must provide the columns ``x``, ``y``, ``term`` and ``cluster``.
        The frame is copied and not modified.

    Returns
    -------
    bokeh figure
        Scatter plot with one point per row, coloured by cluster, with a hover
        tooltip showing the gene set name, coordinates and cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Every tooltip row is closed with its own </div> and the outer margin
    # carries a unit, so the markup is well-formed HTML/CSS.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10px">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    # Column names here are the ones referenced by the tooltip ('@gene_set',
    # '@x', '@y', '@cluster') and by the scatter call ('x', 'y', 'colors').
    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide axis ticks and tick labels
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = 'Gene Sets in CCLE Cell Line Proteomics Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    # The returned renderer is not needed, so it is not bound to a name.
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
# Settings used for this library
# (function defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1).
umap_settings = dict(
    nneighbors=30,
    mindist=0.025,
    spread=0.45,
    maxdf=0.75,
    mindf=37,
)
scatter_df = process_scatterplot(libdict, **umap_settings)

# Build the plot and render it inline in the notebook.
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# Optionally write the plot to an HTML file inside scatterdir.
html_path = scatterdir + "umap.html"
output_file(filename=html_path, title='Gene Sets in CCLE Cell Line Proteomics Library')
save(plot)