# Kinase Library Tyrosine Kinome Atlas

The Kinase Library is a resource which collects information about kinase phosphorylation sites. The Tyrosine Kinome Atlas dataset uses synthetic peptide libraries to profile substrate sequence specificity.
In this study, 7,315 phosphosites were computationally ranked against each kinase motif. This generated percentile scores and ranks of kinase-substrate specificity for proteins, based on their phosphorylation by 78 canonical human tyrosine kinases.
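The kinase-substrate percentile scores and rankings were downloaded from the supplementary information of Yaron-Barir et al. (2024), "The intrinsic substrate specificity of the human tyrosine kinome", Nature 629:1174-81.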

In [None]:
import pandas as pd
import datetime
import numpy as np
import os
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm

# UMAP
from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

## Load Data

In [None]:
# Load the supplementary annotation table (the "with non-canonical" sheet, exported as CSV).
tyrosineKinases = pd.read_csv('TyrKinome/41586_2024_7407_MOESM5_ESM/Annotation - with non-canonical-Table 1.csv')
tyrosineKinases

## Process Data

In [None]:
# Identifier lookup tables; each one is a plain dict built from a TSV file.
# 'From' -> 'To' column of the UniProt-to-Entrez table (not referenced again in the visible cells).
uniprotToEntrez = pd.read_csv('../../tables/uniprot_to_entrez.tsv', sep='\t').set_index('From')['To'].to_dict()
# GeneID -> Symbol
geneids = pd.read_csv('../../mapping/mappingFiles/GeneSymbolsAndIDs_2024.tsv', sep='\t', index_col='GeneID')['Symbol'].to_dict()
# UniProt Accession -> NCBI Entrez Gene ID (cast to int)
uniprot = pd.read_csv('../../mapping/mappingFiles/humanUniprotMapping.tsv', sep='\t', index_col='UniProt Accession')['NCBI Entrez Gene ID'].astype(int).to_dict()
# Synonyms -> Symbol, restricted to rows whose '#tax_id' is 9606
genemapping = pd.read_csv('../../mapping/mappingFiles/mappingFile_2024.tsv', sep='\t', index_col='Synonyms')
genemapping = genemapping[genemapping['#tax_id']==9606]['Symbol'].to_dict()

In [None]:
# Build the gene x kinase score matrix from the annotation table.
# Rows are keyed by the 'Uniprot' column; every column from position 15 onward is kept.
matrix = tyrosineKinases.set_index('Uniprot').get(tyrosineKinases.columns[15:])
# Translate the row labels through three lookups in sequence:
# UniProt accession -> Entrez gene ID -> symbol -> mapped symbol.
# A label missing from any lookup becomes NaN.
matrix.index = matrix.index.map(uniprot).map(geneids).map(genemapping)
# Drop every row that contains a NaN (including rows whose label failed to map),
# then name the row axis 'Gene' and the column axis 'Kinase'.
matrix = matrix.rename_axis('Gene').reset_index().dropna().set_index('Gene').rename_axis('Kinase', axis=1)
# Keep every other column among the first 185 and trim each name to the text before the first '_'.
# NOTE(review): presumably the columns alternate percentile/rank per kinase, so this keeps
# the percentile columns — confirm against the CSV header.
matrix = matrix[matrix.columns[:185:2]].rename(lambda x: x.split(sep='_')[0], axis=1)
matrix

In [None]:
# Collapse rows that share the same gene label into one row holding the
# per-kinase median of their values.
# The `axis` argument of `groupby` is deprecated since pandas 2.1; grouping
# rows by index level is the default, so it is simply omitted.
matrix = matrix.sort_index().groupby(level=0).median()
matrix

In [None]:
# Reshape the gene x kinase matrix into long form: one row per (gene, kinase) pair.
edgelist = matrix.stack().reset_index()
# Columns are assigned by position: row label, column label, stacked value.
edgelist.columns = ['Gene', 'Kinase', 'Percentile']
# Order by kinase, then by descending percentile, so the highest-scoring genes
# of each kinase come first.
edgelist = edgelist.sort_values(['Kinase', 'Percentile'], ascending=[True, False]).reset_index(drop=True)
#edgelist = edgelist.drop_duplicates(['Gene','Kinase'])
# Report how many distinct genes and kinases the edge list contains.
print(len(edgelist['Gene'].unique()), 'genes,', len(edgelist['Kinase'].unique()), 'kinases')
edgelist

In [None]:
# Keep at most the first 100 rows of every kinase. The edge list is already
# ordered by kinase and descending percentile, so these are the 100
# highest-percentile genes per kinase.
# A grouped head() replaces the previous loop that concatenated onto an empty
# frame one kinase at a time: that was quadratic in the number of kinases,
# depended on deprecated empty-frame dtype handling in concat, and sliced a
# Series instead of rows when a kinase had a single entry.
# The column order Kinase, Gene, Percentile is deliberate: the Associations
# cell renames the columns by position.
edgelist = (
    edgelist.groupby('Kinase', sort=False)
    .head(100)[['Kinase', 'Gene', 'Percentile']]
    .sort_values(['Kinase', 'Percentile', 'Gene'], ascending=[True, False, True])
    .reset_index(drop=True)
)
print(len(edgelist['Gene'].unique()), 'genes,', len(edgelist['Kinase'].unique()), 'kinases')
edgelist

## Harmonizome Additions

### Dataset

In [None]:
# Dataset record; the values follow the field order listed on the next line.
# The association text previously misspelled "throughput" twice.
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, is_signed, is_continuous_valued, last_updated, directory, num_page_views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural)
(156,
'Kinase Library Tyrosine Kinome Atlas',
'Tyrosine Kinome Atlas',
'A phosphoproteomics atlas detailing phosphorylation of protein substrates by 93 canonical and non-canonical tyrosine kinases in the human kinome.',
'substrate-kinase associations from high-throughput and low-throughput phosphoproteomics data',
'substrates of the kinase {0} from the Kinase Library Tyrosine Kinome Atlas dataset.',
'sets of substrates of kinases from the Kinase Library Tyrosine Kinome Atlas dataset.',
'kinases that phosphorylate {0} protein from the Kinase Library Tyrosine Kinome Atlas dataset.',
0,
1,
'2024-09-24',
'tyrkinaselib',
0,
102,
25,
5,
30,
5,
'protein phosphorylation by PSPA',
'primary experimental data',
'mixed',
'kinases')

### Publication

In [None]:
# Publication record; the values follow the field order listed on the next line.
#(id, long_citation, short_citation, url, pmid, pubmed_url, first_author_last_name, first_author_initials, journal_abbreviation, year, title, volume, pages)
(155,
'Yaron-Barir, TM et al. (2024) The intrinsic substrate specificity of the human tyrosine kinome. Nature. 629:1174-81',
'Yaron-Barir, Nature, 2024',
'dx.doi.org/10.1038/s41586-024-07407-y',
38720073,
'https://www.ncbi.nlm.nih.gov/pubmed/38720073',
'Yaron-Barir',
'TM',
'Nature',
2024,
'The intrinsic substrate specificity of the human tyrosine kinome',
629,
'1174-81')

#datasets_to_publications
# The last two values match the dataset id (156) and the publication id (155) used in this notebook;
# the first value is presumably the join row's own id — confirm against the table schema.
(238, 156, 155)

### Genes

In [None]:
# Load the database gene table and derive symbol-keyed lookups from it.
dbgenes = pd.read_csv('../../tables/gene.csv')
dbgeneids = dbgenes['ncbi_entrez_gene_id']
dbgenesymbols = dbgenes['symbol'].tolist()
# symbol -> database row id
genefks = dbgenes.set_index('symbol')['id'].to_dict()
# symbol -> NCBI Entrez gene ID; this rebinds `geneids`, which earlier held GeneID -> Symbol.
geneids = dbgenes.set_index('symbol')['ncbi_entrez_gene_id'].to_dict()
dbgenes

### Naming Authority

### Attributes

In [None]:
# Load the existing attribute table and index it by lower-cased authority name
# so kinase names can be matched case-insensitively.
dbattributes = pd.read_csv('../../tables/attribute.csv')
dbattributes['name_from_naming_authority'] = dbattributes['name_from_naming_authority'].astype(str).map(str.lower)
dbattributes = dbattributes.set_index('name_from_naming_authority')
# lower-cased name -> attribute row id
attributefks = dbattributes['id'].to_dict()

# Print one row per kinase that is not yet in the attribute table and record
# the id assigned to it; rows follow the field order below.
#(id, name_from_naming_authority,  naming_authority_fk)
index = 423202
for kinase in edgelist['Kinase'].unique():
    if kinase.lower() not in dbattributes.index:
        print((index, kinase, 85), end=',\n')
        attributefks[kinase.lower()] = index
    # NOTE(review): the counter advances for every kinase, including ones that
    # already exist in the table, so the printed ids can have gaps — confirm this is intended.
    index += 1

### Gene Sets

In [None]:
# kinase name -> id assigned to its gene set below
genesetfks = {}

# Print one gene set row per kinase, numbering them consecutively from the start id.
# NOTE(review): the field list below names six fields but the printed tuple has five
# values (nothing is emitted for description_from_dataset) — confirm against the table schema.
#id, name_from_dataset, description_from_dataset, dataset_fk, attribute_type_fk, attribute_fk
index = 135700000
for kinase in edgelist['Kinase'].unique():
    print((index, kinase, 156, 30, attributefks[kinase.lower()]), end=',\n')
    genesetfks[kinase] = index
    index += 1

### Associations

In [None]:
# Translate the edge list into association rows keyed by database ids.
associations = edgelist.copy()

# Gene symbol -> gene row id (symbols are upper-cased for the lookup; an unknown symbol raises KeyError).
associations['Gene'] = associations['Gene'].apply(lambda x: genefks[x.upper()])
# Kinase name -> gene set id assigned in the Gene Sets cell.
associations['Kinase'] = associations['Kinase'].apply(lambda x: genesetfks[x])
associations['threshold'] = 1
# Shift the row index so it starts at 43000000; it becomes the 'id' column below.
associations.index += 43000000
# Positional rename: relies on the column order Kinase, Gene, Percentile, threshold.
associations.columns = ['gene_set_fk','gene_fk','standardized_value','threshold_value']
associations = associations.rename_axis('id', axis=0)[['gene_fk', 'gene_set_fk', 'standardized_value', 'threshold_value']]
associations.to_csv('../../harmonizome-update/tyrkinaselib.csv')
associations

In [None]:
# Exploratory view: log10 of the percentiles. The result is only displayed, not stored.
edgelist['Percentile'].apply(np.log10)

## Downloads

In [None]:
# Directory prefix for every download file written below (string-concatenated, so the trailing slash matters).
output_path = 'TyrKinome/downloads/'

### Gene-Attribute Matrix

In [None]:
# Binary gene x kinase membership matrix: 1 where the (gene, kinase) pair is in
# the edge list, 0 everywhere else.
# The aggregation is named by string: pandas 2.1+ emits a FutureWarning when the
# builtin `max` is passed, because builtin callables will stop being mapped to
# the equivalent pandas reduction.
binarymatrix = pd.crosstab(index=edgelist['Gene'], columns=edgelist['Kinase'], values=1, aggfunc='max').replace(np.nan, 0).astype(int)
# Transposed copy (kinase x gene) used by the attribute-centred outputs.
binarymatrixT = binarymatrix.T
binarymatrix.to_csv(output_path+'gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
binarymatrix

### Gene Attribute Edge List

In [None]:
# Attach the NCBI Entrez gene ID to every edge (symbols are upper-cased for the lookup)
# and mark every edge with a threshold of 1.
edgelist['Gene ID'] = edgelist['Gene'].apply(lambda x: geneids[x.upper()])
edgelist['Threshold'] = 1

# Fix the column order, then rename the score columns to the names used in the download files.
edgelist = edgelist[['Gene', 'Gene ID', 'Kinase', 'Percentile', 'Threshold']]
edgelist.columns = ['Gene', 'Gene ID', 'Kinase', 'standardized_value', 'threshold_value']
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Distinct (gene symbol, gene ID) pairs appearing in the edge list.
genes = edgelist[['Gene', 'Gene ID']].drop_duplicates().reset_index(drop=True)
genes.to_csv(output_path+'gene_list_terms.txt.gz', sep='\t', compression='gzip')
genes

### Attribute List

In [None]:
# Distinct kinase names appearing in the edge list.
attributes = edgelist[['Kinase']].drop_duplicates().reset_index(drop=True)
attributes.to_csv(output_path+'attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributes

### Gene Set Library

In [None]:
# Write the gene set library in GMT form, one line per kinase:
# <kinase> TAB <empty description> TAB <gene> TAB <gene> ...
arr = binarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)

with open(output_path+'gene_set_library_crisp.gmt', 'w') as f:
    for i, term in enumerate(binarymatrix.columns):
        # Row labels whose entry in column i equals 1 are the members of this set.
        members = binarymatrix.index[arr[:, i] == 1]
        f.write('\t'.join(map(str, [term, '', *members])) + '\n')

### Attribute Set Library

In [None]:
# Write the attribute set library in GMT form, one line per gene:
# <gene> TAB <empty description> TAB <kinase> TAB <kinase> ...
arr = binarymatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)

with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    for i, term in enumerate(binarymatrixT.columns):
        # Row labels whose entry in column i equals 1 are the members of this set.
        members = binarymatrixT.index[arr[:, i] == 1]
        f.write('\t'.join(map(str, [term, '', *members])) + '\n')

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between genes (rows of the binary matrix):
# pdist returns condensed cosine distances, squareform expands them to a square
# matrix, and subtracting from 1 turns distance into similarity.
similarity_matrix = 1 - dist.squareform(
    dist.pdist(binarymatrix.to_numpy(dtype=np.int_), 'cosine')
)

# Label both axes with the gene names and clear the axis names before export.
gene_similarity_matrix = pd.DataFrame(
    similarity_matrix,
    index=binarymatrix.index,
    columns=binarymatrix.index,
)
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None

gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt', sep='\t')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between kinases (rows of the transposed binary matrix):
# pdist returns condensed cosine distances, squareform expands them to a square
# matrix, and subtracting from 1 turns distance into similarity.
similarity_matrix = 1 - dist.squareform(
    dist.pdist(binarymatrixT.to_numpy(dtype=np.int_), 'cosine')
)

# Label both axes with the kinase names and clear the axis names before export.
attribute_similarity_matrix = pd.DataFrame(
    similarity_matrix,
    index=binarymatrixT.index,
    columns=binarymatrixT.index,
)
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None

attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt', sep='\t')
attribute_similarity_matrix

### Standardized Gene-Attribute Matrix

In [None]:
# Gene x kinase matrix holding the standardized value of each edge; pairs that
# are not in the edge list are filled with 0.
# The aggregation is named by string: pandas 2.1+ emits a FutureWarning when the
# builtin `max` is passed, because builtin callables will stop being mapped to
# the equivalent pandas reduction.
standardizedmatrix = pd.crosstab(index=edgelist['Gene'], columns=edgelist['Kinase'], values=edgelist['standardized_value'], aggfunc='max').replace(np.nan, 0)
standardizedmatrix.to_csv(output_path+'gene_attribute_matrix_standardized.txt.gz', sep='\t', compression='gzip')
standardizedmatrix

### Knowledge Graph Serializations

In [None]:
# Build the node and edge records shared by the RDF, JSON and TSV serializations.
# Columns are iterated directly instead of looking each row up with .loc, and the
# loop variables no longer rebind the builtin `id` at notebook scope.
nodes = {}
edges = []

# Gene nodes, keyed by the integer gene ID.
#Gene	GeneID	GeneFK
for gene_symbol, gene_id_value in zip(genes['Gene'], genes['Gene ID']):
    gene_id = int(gene_id_value)
    nodes[gene_id] = {
        "type": "gene",
        "properties": {
            "id": gene_id,
            "label": gene_symbol
        }
    }

# Kinase nodes; the kinase name serves as both id and label.
#Kinase	ID	Description	AttributeFK	GenesetFK
for kinase_name in attributes['Kinase']:
    nodes[kinase_name] = {
        "type": "kinase",
        "properties": {
            "id": kinase_name,
            "label": kinase_name
        }
    }

# One directed "phosphorylates" edge per edge list row, kinase -> gene.
#Kinase	Kinase ID	Gene	Gene ID	Percentile	Threshold
for kinase_name, gene_id_value, gene_symbol, score in zip(
    edgelist['Kinase'],
    edgelist['Gene ID'],
    edgelist['Gene'],
    edgelist['standardized_value'],
):
    target_id = int(gene_id_value)
    edges.append({
        "source": kinase_name,
        "relation": "phosphorylates",
        "target": target_id,
        "properties": {
            "id": str(kinase_name)+":"+str(target_id),
            "source_label": kinase_name,
            "target_label": gene_symbol,
            "directed": True,
            # Cast to plain float so the value is JSON-serializable.
            "standardized_value": float(score),
            "threshold_value": 1
        }
    })

#### RDF

In [None]:
# Write the edges as triples: two prefix lines, a blank line, then one
# "<subject> RO:0002447 <object> ." line per edge.
# NOTE(review): Turtle prefix declarations are normally written `@prefix p: <IRI> .`;
# the lines below omit the angle brackets and the trailing dot — confirm the consumer accepts this.
# NOTE(review): the subject is the kinase name but is written under `kinase:`, which is
# declared as the NCBI gene URL, while the object is the gene ID written under
# `KinaseLibrary:`, a prefix that is never declared. The namespaces look swapped or
# missing — confirm the intended prefixes.
with open(output_path+'kg_serializations/tyrkinaselib.rdf', 'w') as f:
    print('@prefix kinase: https://ncbi.nlm.nih.gov/gene/', file=f)
    print('@prefix RO: http://purl.obolibrary.org/obo/RO_', file=f)

    print(file=f)
    for edge in edges:
        print('kinase:'+str(edge['source']), 'RO:0002447', 'KinaseLibrary:'+str(edge['target']), end=' .\n', file=f)

#### JSON

In [None]:
# Serialize the node and edge records to an indented JSON file.
# Integer node keys are written as strings, since JSON object keys must be strings.
with open(output_path+'kg_serializations/tyrkinaselib.json', 'w') as f:
    # json.dump writes to the file and returns None, so `serial` is always None.
    serial = json.dump(
        {
            "Version":"1", 
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

#### TSV

In [None]:
# Node table: one row per node, with the node type exported under the column name 'namespace'.
nodeframe = pd.DataFrame(nodes).T
nodeframe['label'] = nodeframe['properties'].apply(lambda x: x['label'])
# The dict keys (node ids) become the 'id' column.
nodeframe = nodeframe.rename_axis('id', axis=0).reset_index().get(['type', 'label', 'id'])
nodeframe.columns = ['namespace', 'label', 'id']
nodeframe.to_csv(output_path+'kg_serializations/tyrkinaselib_tsv/nodes.tsv', sep='\t')
display(nodeframe)

# Edge table: flatten the nested 'properties' dict of each edge into plain columns.
edgeframe = pd.DataFrame(edges)
edgeframe['id'] = edgeframe['properties'].apply(lambda x: x['id'])
edgeframe['source_label'] = edgeframe['properties'].apply(lambda x: x['source_label'])
edgeframe['target_label'] = edgeframe['properties'].apply(lambda x: x['target_label'])
edgeframe['directed'] = True
edgeframe['standardized_value'] = edgeframe['properties'].apply(lambda x: x['standardized_value'])
edgeframe['threshold_value'] = edgeframe['properties'].apply(lambda x: x['threshold_value'])
# Select the export columns in their final order (this drops 'properties').
edgeframe = edgeframe[['source', 'source_label', 'relation', 'target', 'target_label', 'id', 'directed', 'standardized_value', 'threshold_value']]
edgeframe.to_csv(output_path+'kg_serializations/tyrkinaselib_tsv/edges.tsv', sep='\t')
display(edgeframe)

## Visualizations

### Gene-Attribute Clustered Heatmap

In [None]:
# Clustered heatmap of the binary gene x kinase matrix, with the colormap centred on 0.
sns.clustermap(binarymatrix, cmap='seismic', center=0)

### Gene Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the gene-gene cosine similarity matrix, with the colormap centred on 0.
sns.clustermap(gene_similarity_matrix, cmap='seismic', center=0)

### Attribute Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the kinase-kinase cosine similarity matrix, with the colormap centred on 0.
sns.clustermap(attribute_similarity_matrix, cmap='seismic', center=0)

### UMAP

In [None]:
def load_gmt(file):
    """Parse GMT-formatted lines into an ordered term -> genes mapping.

    Each line is expected to look like ``term<TAB>description<TAB>gene...``.
    The description field is ignored and duplicate genes on a line are
    collapsed.

    Blank lines are skipped, and a term with no description or no genes
    yields an empty string instead of raising an unpacking error.

    Args:
        file: Iterable of text lines, e.g. an open file object.

    Returns:
        OrderedDict mapping each term to its distinct genes joined by single
        spaces. The order of genes within a value is not defined, because
        they pass through a set.
    """
    gmt = OrderedDict()
    for line in file:
        fields = line.strip().split('\t')
        term = fields[0]
        if not term:
            # Blank or whitespace-only line: nothing to record.
            continue
        # Field 0 is the term and field 1 the description; the rest are genes.
        # Slicing tolerates lines whose trailing tabs were removed by strip().
        gmt[term] = ' '.join(set(fields[2:]))
    return gmt
# Read back the crisp gene set library written in the Downloads section.
# The context manager closes the file handle, which was previously left open.
with open(output_path+'gene_set_library_crisp.gmt', 'r') as gmt_file:
    libdict = load_gmt(gmt_file)

# Directory that receives the saved UMAP HTML page.
scatterdir = 'TyrKinome/images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed gene sets in 2D with UMAP and label them with Leiden clusters.

    Args:
        libdict: Mapping of term -> space-separated gene string.
        nneighbors: Passed to the scanpy neighbour graph as ``n_neighbors``.
        mindist: Passed to the scanpy UMAP as ``min_dist``.
        spread: Passed to the scanpy UMAP as ``spread``.
        maxdf: Passed to TfidfVectorizer as ``max_df``.
        mindf: Passed to TfidfVectorizer as ``min_df``.

    Returns:
        DataFrame with columns 'x', 'y', 'cluster', 'term' and 'genes',
        one row per term, ordered by Leiden cluster label.
    """
    print("\tTF-IDF vectorizing gene set data...")
    vectorizer = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    tfidf = vectorizer.fit_transform(libdict.values())
    print(tfidf.shape)
    adata = anndata.AnnData(tfidf)
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    # Neighbour graph first; the clustering and the UMAP are both computed after it.
    sc.pp.neighbors(adata, n_neighbors=nneighbors)
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder the observations by cluster label, then prefix the labels.
    ordered_terms = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[ordered_terms, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    embedding = pd.DataFrame(adata.obsm['X_umap'])
    embedding.columns = ['x', 'y']

    embedding['cluster'] = adata.obs['leiden'].values
    embedding['term'] = adata.obs.index
    embedding['genes'] = [libdict[term] for term in embedding['term']]

    return embedding

In [None]:
def get_scatter_colors(df):
    """Assign a colour to every distinct value of ``df['cluster']``.

    Colours come from the 20-colour Category20 palette, taking the
    even-indexed entries first and the odd-indexed entries after them; the
    sequence repeats once more than 20 clusters are present.

    Returns:
        dict mapping cluster label -> colour string, in order of first appearance.
    """
    palette = list(Category20[20])
    colors = palette[::2] + palette[1::2]
    color_mapper = {}
    for position, cluster in enumerate(pd.unique(df['cluster']).tolist()):
        color_mapper[cluster] = colors[position % 20]
    return color_mapper

def get_scatterplot(scatterdf):
    """Build the interactive Bokeh scatter plot for a 2D gene set embedding.

    Args:
        scatterdf: DataFrame providing the columns 'x', 'y', 'term' and
            'cluster'. It is not modified; a copy receives the colour column.

    Returns:
        The configured Bokeh figure, with one point per row coloured by cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Each tooltip row is its own <div>; the first two rows were previously
    # left unclosed, which produced malformed HTML.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    # Column names here are the ones referenced by the tooltip (@gene_set,
    # @x, @y, @cluster) and by the scatter glyph ('x', 'y', 'colors').
    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide axis tick marks and tick labels
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = 'Gene Sets in the Kinase Library Tyrosine Kinome Atlas Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    # The returned glyph renderer is not needed, so it is not bound to a name.
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
# Compute the embedding with a small neighbourhood (4 instead of the default 30);
# the commented-out keyword lines are alternative settings left for experimentation.
## defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
scatter_df = process_scatterplot(libdict, 
     nneighbors=4,
     #mindist=0.01
     spread=1.0,
     #,maxdf=0.5
     #,mindf=2
)

# Display Scatter Plot
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# save HTML if desired
# Note: `scatterdir` already ends with '/', so the resulting path contains a double slash.
output_file(filename=f"{scatterdir}/umap.html", title = 'Gene Sets in Kinase Library Tyrosine Kinome Atlas Library')
save(plot)