# Kinase Library Serine Threonine Kinome Atlas

The Kinase Library is a resource which collects information about kinase phosphorylation sites. The Serine-Threonine Kinome Atlas dataset uses synthetic peptide libraries to profile substrate sequence specificity.
In this study, 89,752 phosphosites were computationally ranked against each kinase motif. This generated percentiles and ranks of kinase-substrate specificity for 10,269 proteins based on their phosphorylation by 303 human serine/threonine kinases.

In [None]:
import pandas as pd
import datetime
import numpy as np
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc

from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from bokeh.io import output_notebook, export_svg, output_file, save
from bokeh.io.export import get_screenshot_as_png
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
# Raise the interpreter's recursion limit far above the default.
# NOTE(review): presumably required by the recursive dendrogram code behind the
# clustered heatmaps drawn later in this notebook -- confirm.
sys.setrecursionlimit(100000)

## Load Data

In [None]:
# Load Supplementary Table 3, keeping every column as raw text (dtype=object).
supp_table_path = 'newdata/KinaseLibrary/41586_2022_5575_MOESM5_ESM/Supplementary Table 3-Table 1.csv'
kinaselib = pd.read_csv(supp_table_path, dtype=object)
display(kinaselib)
# Report how many distinct UniProt accessions the raw table contains.
print(len(kinaselib['Database Uniprot Accession'].unique()))

## Process Data

In [None]:
# Gene metadata table; keep only human (tax id 9606) protein-coding genes.
geneinfo = pd.read_csv('tables/gene_info', sep='\t')
# Combine both conditions into one mask. Chaining df[mask1][mask2] applies a mask
# built on the unfiltered frame to the filtered one, which makes pandas emit a
# "Boolean Series key will be reindexed" warning.
is_human = geneinfo['#tax_id'] == 9606
is_protein_coding = geneinfo['type_of_gene'] == 'protein-coding'
geneinfo = geneinfo[is_human & is_protein_coding]
# Lookups in both directions: GeneID -> Symbol and Symbol -> GeneID.
genedict = geneinfo.set_index('GeneID')['Symbol'].to_dict()
geneids = geneinfo.set_index('Symbol')['GeneID'].to_dict()
# Mapping from the 'From' column to the 'To' column of the UniProt-to-Entrez table.
entrez = pd.read_csv('tables/uniprot_to_entrez.tsv', sep='\t').set_index('From')['To'].to_dict()

In [None]:
# Keep every second column from position 13 up to (not including) 619 and index the
# table by UniProt accession.
# NOTE(review): these are presumably the per-kinase percentile columns -- confirm
# against the supplementary table layout.
percentile_columns = kinaselib.columns[13:619:2]
kinaselib = kinaselib.set_index('Database Uniprot Accession').get(percentile_columns).astype(float)
# Column headers are reduced to the text before their first underscore.
kinaselib = kinaselib.rename(columns=lambda header: header.split(sep='_')[0])

# Accessions that cannot be mapped to an Entrez ID, or whose Entrez ID is not in
# the filtered gene table, are dropped.
droplist = [
    accession
    for accession in kinaselib.index
    if accession not in entrez or entrez[accession] not in genedict
]

kinaselib = kinaselib.drop(droplist)
# Replace each remaining accession with its gene symbol and label both axes.
symbols = [genedict[entrez[accession]] for accession in kinaselib.index]
kinaselib.index = pd.Index(symbols, name='Gene')
kinaselib = kinaselib.rename_axis('Kinase', axis=1)

display(kinaselib)
print(len(kinaselib.index.unique()), 'proteins,', len(kinaselib.columns.unique()), 'kinases')

In [None]:
# Melt the Gene x Kinase matrix into one (Gene, Kinase, Percentile) row per cell.
edgelist = kinaselib.stack().rename('Percentile').reset_index()
# Order by kinase, highest percentile first, so that dropping duplicates keeps the
# best-scoring row for each gene/kinase pair.
edgelist = (
    edgelist
    .sort_values(by=['Kinase', 'Percentile'], ascending=[True, False])
    .reset_index(drop=True)
)
edgelist = edgelist.drop_duplicates(subset=['Gene', 'Kinase'])
print(len(edgelist['Gene'].unique()), 'genes,', len(edgelist['Kinase'].unique()), 'kinases')
edgelist

In [None]:
# Keep only the first 100 rows of each kinase. The edge list is already ordered by
# kinase and descending percentile, so these are each kinase's top 100 substrates.
# groupby().head() replaces a per-kinase pd.concat loop, which was quadratic,
# started from an empty object-dtype frame, and mis-sliced kinases with one row
# (where .loc returns a Series instead of a DataFrame).
edgelist_filtered = edgelist.groupby('Kinase', sort=False).head(100)
# Column order (Kinase, Gene, Percentile) is relied on further down the notebook.
edgelist = edgelist_filtered[['Kinase', 'Gene', 'Percentile']]
edgelist = edgelist.sort_values(['Kinase','Percentile','Gene'],ascending=[True,False,True]).reset_index(drop=True)
print(len(edgelist['Gene'].unique()), 'genes,', len(edgelist['Kinase'].unique()), 'kinases')
edgelist

## Harmonizome Additions

### Resource

In [None]:
# Literal record for the "Resource" section of the Harmonizome additions.
# NOTE(review): the column names for this record are not listed in this notebook;
# 102 is the value reused as resource_fk in the dataset record -- confirm the
# remaining positions against the target table.
(102,
'Kinase Library',
None,
'A phosphoproteomics atlas detailing phosphorylation of protein substrates by 303 serine/threonine kinases in the human kinome.',
'An atlas of human serine/threonine kinome activity.',
'https://kinase-library.phosphosite.org/site',
303,
1,
None)

### Dataset

In [None]:
# Literal record for the "Dataset" section; values follow the column order below.
# Fixed: 'high-thorughput' typo in the association text, and the dataset name in the
# attribute-set description, which was missing 'Kinome'.
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, is_signed, is_continuous_valued, last_updated, directory, num_page_views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural)
(132,
'Kinase Library Serine Threonine Kinome Atlas',
'Serine Threonine Kinome Atlas',
'A phosphoproteomics atlas detailing phosphorylation of protein substrates by 303 serine/threonine kinases in the human kinome.',
'substrate-kinase associations from high-throughput phosphoproteomics data',
'substrates of the kinase {0} from the Kinase Library Serine Threonine Kinome Atlas dataset.',
'sets of substrates of kinases from the Kinase Library Serine Threonine Kinome Atlas dataset.',
'kinases that phosphorylate {0} protein from the Kinase Library Serine Threonine Kinome Atlas dataset.',
0,
1,
'2023-02-22',
'kinaselib',
0,
102,
25,
5,
30,
5,
'protein phosphorylation by PSPA',
'primary experimental data',
'high-throughput',
'kinases')

### Publication

In [None]:
# Literal record for the "Publication" section; values follow the column order below.
# Fixed: the long citation gave the first author's initials as 'EL', contradicting
# the first_author_initials field ('JL') in the same record.
#(id, long_citation, short_citation, url, pmid, pubmed_url, first_author_last_name, first_author_initials, journal_abbreviation, year, title, volume, pages)
(138,
'Johnson, JL et al. (2023) An atlas of substrate specificities for the human serine/threonine kinome. Nature. 613:759-66',
'Johnson, Nature, 2023',
'dx.doi.org/10.1038/s41586-022-05575-3',
36631611,
'https://www.ncbi.nlm.nih.gov/pubmed/36631611',
'Johnson',
'JL',
'Nature',
2023,
'An atlas of substrate specificities for the human serine/threonine kinome',
613,
'759-766')

### Genes

In [None]:
# Build the gene table for this dataset and print a record for every gene that is
# not yet present in the production gene table.
#(id,symbol,ncbi_entrez_gene_id,name,ncbi_entrez_gene_url)
genes = edgelist['Gene'].drop_duplicates().to_frame()
genes['GeneID'] = genes['Gene'].apply(lambda x: geneids[x])
genes = genes.sort_values('GeneID').reset_index(drop=True)

prodgenes = pd.read_csv('production/gene.csv').get(['id','symbol','ncbi_entrez_gene_id']).set_index('ncbi_entrez_gene_id')
# Entrez gene id -> production primary key; extended below for newly added genes.
genefks = prodgenes['id'].to_dict()
# Index the descriptions once instead of re-indexing the whole gene table for
# every new gene inside the loop.
descriptions = geneinfo.set_index('GeneID')['description']
genes = genes.set_index('GeneID')
# First id handed out to genes that are new to production.
i = 57208
for gene in genes.index:
    if gene not in prodgenes.index:
        print((i,genes.loc[gene,'Gene'],gene, descriptions.loc[gene],'https://ncbi.nlm.nih.gov/gene/'+str(gene)))
        genefks[gene] = i
        i += 1
genes = genes.reset_index()
genes['GeneFK'] = genes['GeneID'].apply(lambda x: genefks[x])

genes

### Attributes

In [None]:
# This code was used to get a list of correctly mapped kinases -> NCBI Entrez Gene ID relations. Kinases with a Gene ID of 0 in this output need to be manually annotated.
'''kinases = geneinfo.set_index('Symbol')['GeneID']
#geneinfo['Synonyms'] = geneinfo['Synonyms'].apply(str.split, sep='|')
kinases = pd.concat([kinases, geneinfo.explode('Synonyms').set_index('Synonyms')['GeneID']]).reset_index().drop_duplicates('index').set_index('index')
for kinase in attributes['Kinase']:
    if kinase in kinases.index:
        print(kinase, kinases.loc[kinase, 'GeneID'])
    else:
        print(kinase, 0)'''

# This mapping dictionary was created using the above code, which can be run by uncommenting it.
kinasedict = pd.read_csv('newdata/KinaseLibrary/kinaseids.csv').set_index('Kinase')['Gene ID'].to_dict()
# Select the gene rows for the mapped kinase IDs by label. This replaces a double
# transpose of the whole gene table, which was costly, turned every column into
# object dtype, and reported a missing ID as an AttributeError on None; .loc raises
# a KeyError naming the missing IDs instead.
kinases = geneinfo.set_index('GeneID').loc[list(kinasedict.values()), ['Symbol', 'description']]

In [None]:
# Production attribute table. The naming-authority id column is read as text
# because every lookup against it in this notebook uses str(...) keys; leaving the
# dtype to inference could yield integer or mixed-type values that never match.
prodattributes = pd.read_csv('production/attribute.csv', dtype={'id_from_naming_authority': str})
# Keep only attributes registered under naming authority 18.
# NOTE(review): presumably NCBI Entrez Gene, given the gene URLs printed for new attributes -- confirm.
prodattributes = prodattributes[prodattributes['naming_authority_fk']==18].reset_index(drop=True)
# Naming-authority id (string) -> production attribute id.
proddict = prodattributes.set_index('id_from_naming_authority')['id'].to_dict()

# One row per kinase in the edge list, annotated with its gene ID and description.
attributes = edgelist['Kinase'].drop_duplicates().to_frame().reset_index(drop=True)
attributes['ID'] = attributes['Kinase'].apply(lambda x: kinasedict[x])
attributes['Description'] = attributes['ID'].apply(lambda x: kinases.loc[x, 'description'])
attributes

In [None]:
# Print a record for every kinase whose gene ID is not yet a production attribute
# and register the id assigned to it in proddict.
#id, name_from_naming_authority, id_from_naming_authority, description_from_naming_authority, url_from_naming_authority, naming_authority_fk
# First id handed out to attributes that are new to production.
i = 298355
# Build the membership set once; the original rebuilt a list from the column and
# scanned it linearly for every kinase.
existing_ids = set(prodattributes['id_from_naming_authority'])

for attribute in attributes.index:
    attribute = attributes.loc[attribute]
    if str(attribute['ID']) not in existing_ids:
        print((i, attribute['Kinase'], attribute['ID'], attribute['Description'], 'https://ncbi.nlm.nih.gov/gene/'+str(attribute['ID']), 18), end=',\n')
        proddict[str(attribute['ID'])] = i
        i+=1

### Gene Sets

In [None]:
# Assign foreign keys to every kinase and print one gene-set record per kinase.
#id, name_from_dataset, description_from_dataset, dataset_fk, attribute_type_fk, attribute_fk

attributes['AttributeFK'] = [proddict[str(gene_id)] for gene_id in attributes['ID']]
# Gene-set ids are the row positions offset by a fixed base.
attributes['GenesetFK'] = 133400000 + attributes.index
attributes = attributes.reset_index(drop=True)

for row_label in attributes.index:
    row = attributes.loc[row_label]
    record = (row['GenesetFK'], row['Kinase'], row['Description'], 132, 30, row['AttributeFK'])
    print(record, end=',\n')

### Associations

In [None]:
# Translate the edge list into foreign-key form and export it.
# The two lookup tables are indexed by name for the translation and restored afterwards.
genes = genes.set_index('Gene')
attributes = attributes.set_index('Kinase')

associations = edgelist.copy()
associations['Gene'] = [genes.loc[symbol, 'GeneFK'] for symbol in associations['Gene']]
associations['Kinase'] = [attributes.loc[kinase_name, 'GenesetFK'] for kinase_name in associations['Kinase']]
associations['threshold'] = 1

genes = genes.reset_index()
attributes = attributes.reset_index()

# Row ids are the row positions offset by a fixed base.
associations.index = associations.index + 14000000
# Columns are renamed positionally.
associations.columns = ['gene_set_fk','gene_fk','cleaned_value','threshold_value']
associations = associations.rename_axis('id', axis=0)
associations = associations.get(['gene_fk','gene_set_fk','cleaned_value','threshold_value'])
associations.to_csv('harmonizome-update/kinaselibrary.csv')
associations

## Downloads

In [None]:
import os

# Root directory for every download file written by the cells below.
output_path = 'newdata/KinaseLibrary/downloads/'
# Later cells write into output_path, output_path/serializations and
# output_path/serializations/kinaselibrary_tsv; neither DataFrame.to_csv nor open()
# creates missing parent directories, so create the deepest one (and its parents) here.
os.makedirs(output_path + 'serializations/kinaselibrary_tsv', exist_ok=True)

### Gene-Attribute Matrix

In [None]:
# Gene x Kinase indicator matrix: 1 where the pair is in the edge list, 0 elsewhere.
# aggfunc is given as the string 'max'; passing the builtin max is deprecated in
# recent pandas and emits a FutureWarning.
binarymatrix = pd.crosstab(index=edgelist['Gene'], columns=edgelist['Kinase'], values=1, aggfunc='max').fillna(0).astype(int)
# Kinase x Gene orientation, used for the attribute-centred outputs.
binarymatrixT = binarymatrix.T
binarymatrix

### Gene Attribute Edge List

In [None]:
# Annotate the edge list with the numeric IDs of both endpoints and export it.
# Note: genedict is rebound here to map gene symbol -> GeneID.
genedict = genes.set_index('Gene')['GeneID'].to_dict()
attributedict = attributes.set_index('Kinase')['ID'].to_dict()

edgelist['Kinase ID'] = [attributedict[kinase_name] for kinase_name in edgelist['Kinase']]
edgelist['Gene ID'] = [genedict[symbol] for symbol in edgelist['Gene']]
edgelist['Threshold'] = 1

export_columns = ['Kinase', 'Kinase ID', 'Gene', 'Gene ID', 'Percentile', 'Threshold']
edgelist = edgelist.get(export_columns)
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Export the gene list (symbol and Entrez gene ID only).
gene_list_columns = ['Gene', 'GeneID']
genes = genes.get(gene_list_columns)
genes.to_csv(output_path+'gene_list_terms.txt.gz', compression='gzip', sep='\t')
genes

### Attribute List

In [None]:
# Export the attribute list (kinase name and its gene ID only).
attribute_list_columns = ['Kinase', 'ID']
attributes = attributes.get(attribute_list_columns)
attributes.to_csv(output_path+'attribute_list_entries.txt.gz', compression='gzip', sep='\t')
attributes

### Gene Set Library

In [None]:
# Write one tab-separated line per kinase: the kinase name followed by every gene
# whose entry in that kinase's column of the binary matrix is 1.
arr = binarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)

with open(output_path+'gene_set_library_crisp.gmt', 'w') as f:
    for position, term in enumerate(binarymatrix.columns):
        members = binarymatrix.index[arr[:, position] == 1]
        f.write('\t'.join(str(field) for field in (term, *members)) + '\n')

### Attribute Set Library

In [None]:
# Write one tab-separated line per gene: the gene symbol followed by every kinase
# whose entry in that gene's column of the transposed binary matrix is 1.
arr = binarymatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)

with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    for position, term in enumerate(binarymatrixT.columns):
        members = binarymatrixT.index[arr[:, position] == 1]
        f.write('\t'.join(str(field) for field in (term, *members)) + '\n')

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between the gene rows of the binary matrix.
similarity_matrix = dist.pdist(binarymatrix.to_numpy(dtype=np.int_), 'cosine')
similarity_matrix = dist.squareform(similarity_matrix)
# pdist returns cosine distances; convert them to similarities.
similarity_matrix = 1 - similarity_matrix

# Label both axes with an unnamed copy of the gene index. The original reused
# binarymatrix.index directly and then set .index.name = None on the result; on
# pandas versions where the Index object is shared, that also cleared the axis name
# of binarymatrix itself.
gene_labels = binarymatrix.index.rename(None)
gene_similarity_matrix = pd.DataFrame(similarity_matrix, index=gene_labels, columns=gene_labels)

gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt', sep='\t')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between the kinase rows of the transposed binary matrix.
similarity_matrix = dist.pdist(binarymatrixT.to_numpy(dtype=np.int_), 'cosine')
similarity_matrix = dist.squareform(similarity_matrix)
# pdist returns cosine distances; convert them to similarities.
similarity_matrix = 1 - similarity_matrix

# Label both axes with an unnamed copy of the kinase index. The original reused
# binarymatrixT.index directly and then set .index.name = None on the result; on
# pandas versions where the Index object is shared, that also cleared the axis name
# of the source matrix.
attribute_labels = binarymatrixT.index.rename(None)
attribute_similarity_matrix = pd.DataFrame(similarity_matrix, index=attribute_labels, columns=attribute_labels)

attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt', sep='\t')
attribute_similarity_matrix

### Cleaned Gene-Attribute Matrix

In [None]:
# Gene x Kinase matrix holding the percentile of each retained pair, 0 elsewhere.
# aggfunc is given as the string 'max'; passing the builtin max is deprecated in
# recent pandas and emits a FutureWarning.
cleanedmatrix = pd.crosstab(index=edgelist['Gene'], columns=edgelist['Kinase'], values=edgelist['Percentile'], aggfunc='max').fillna(0)
cleanedmatrix.to_csv(output_path+'gene_attribute_matrix_cleaned.txt.gz', sep='\t', compression='gzip')
cleanedmatrix

### Knowledge Graph Serializations

In [None]:
# Build the in-memory knowledge graph: a dict of nodes and a list of directed
# kinase -> gene edges, later serialised as RDF, JSON and TSV.
# Columns are iterated directly instead of doing one .loc lookup per row, and the
# loop variables no longer shadow the builtin id().
nodes = {}
edges = []

# Gene nodes are keyed by Entrez gene ID and labelled with the gene symbol.
for gene_id, gene_symbol in zip(genes['GeneID'], genes['Gene']):
    gene_id = int(gene_id)
    nodes[gene_id] = {
        "type": "gene",
        "properties": {
            "id": gene_id,
            "label": gene_symbol
        }
    }

# Kinase nodes are keyed by kinase name and labelled with the kinase's gene ID.
for kinase_name, kinase_gene_id in zip(attributes['Kinase'], attributes['ID']):
    nodes[kinase_name] = {
        "type": "kinase",
        "properties": {
            "id": kinase_name,
            "label": int(kinase_gene_id)
        }
    }

# One edge per row of the edge list.
edge_columns = ['Kinase', 'Kinase ID', 'Gene ID', 'Gene', 'Percentile']
for kinase_name, kinase_gene_id, gene_id, gene_symbol, percentile in edgelist[edge_columns].itertuples(index=False, name=None):
    targetid = int(gene_id)
    edges.append({
        "source": kinase_name,
        "relation": "phosphorylates",
        "target": targetid,
        "properties": {
            "id": kinase_name+":"+str(targetid),
            "source_label": int(kinase_gene_id),
            "target_label": gene_symbol,
            "directed":True,
            "percentile":float(percentile)
        }
    })

#### RDF

In [None]:
# Write the edges as triples: kinase gene ID, relation RO:0002447, target gene ID.
# The prefix declarations now follow Turtle syntax (IRI wrapped in <...>, terminated
# by ' .'); the original lines had neither, so the file could not be parsed as Turtle.
# NOTE(review): confirm no downstream consumer depends on the old, non-standard header.
with open(output_path+'serializations/kinaselibrary.rdf', 'w') as f:
    print('@prefix kinase: <https://ncbi.nlm.nih.gov/gene/> .', file=f)
    print('@prefix RO: <http://purl.obolibrary.org/obo/RO_> .', file=f)
    print('@prefix gene: <https://ncbi.nlm.nih.gov/gene/> .', file=f)
    print(file=f)
    for edge in edges:
        print('kinase:'+str(edge['properties']['source_label']), 'RO:0002447', 'gene:'+str(edge['target']), end=' .\n', file=f)

#### JSON

In [None]:
# Dump the graph as indented JSON. json.dump writes to the file and returns None,
# so its result is no longer bound to a variable.
with open(output_path+'serializations/kinaselibrary.json', 'w') as f:
    json.dump(
        {
            "Version":"1", 
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

#### TSV

In [None]:
# Node table: one row per node with its type (exported as 'namespace'), label and id.
nodeframe = pd.DataFrame(nodes).T
nodeframe['label'] = [properties['label'] for properties in nodeframe['properties']]
nodeframe = nodeframe.rename_axis('id', axis=0).reset_index()
nodeframe = nodeframe.get(['type', 'label', 'id'])
nodeframe.columns = ['namespace', 'label', 'id']
nodeframe.to_csv(output_path+'serializations/kinaselibrary_tsv/nodes.tsv', sep='\t')
display(nodeframe)

# Edge table: flatten the selected edge properties into columns.
edgeframe = pd.DataFrame(edges)
for property_name in ('id', 'source_label', 'target_label'):
    edgeframe[property_name] = [properties[property_name] for properties in edgeframe['properties']]
edgeframe['directed'] = True
edgeframe['percentile'] = [properties['percentile'] for properties in edgeframe['properties']]
edge_export_columns = ['source', 'source_label', 'relation', 'target', 'target_label', 'id', 'directed', 'percentile']
edgeframe = edgeframe.get(edge_export_columns)
edgeframe.to_csv(output_path+'serializations/kinaselibrary_tsv/edges.tsv', sep='\t')
display(edgeframe)

## Visualizations

### Gene-Attribute Clustered Heatmap

In [None]:
# Clustered heatmap of the binary gene x kinase matrix, using the 'seismic'
# colormap centred at 0.
sns.clustermap(binarymatrix, cmap='seismic', center=0)

### Gene Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the gene-gene cosine similarity matrix, using the 'seismic'
# colormap centred at 0.
sns.clustermap(gene_similarity_matrix, cmap='seismic', center=0)

### Attribute Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the kinase-kinase cosine similarity matrix, using the
# 'seismic' colormap centred at 0.
sns.clustermap(attribute_similarity_matrix, cmap='seismic', center=0)

### UMAP

In [None]:
# Configure Bokeh to render plots inline in the notebook.
output_notebook()

In [None]:
# Build one space-separated "document" of gene symbols per kinase for the TF-IDF step.
top = edgelist.set_index('Kinase')['Gene'].to_frame()
# Group once instead of doing a .loc lookup per kinase. The lookup returned a bare
# string (no .tolist()) when a kinase had a single row. sort=False keeps the kinases
# in their order of first appearance.
genesets = {
    kinase: ' '.join(members)
    for kinase, members in top.groupby(level=0, sort=False)['Gene']
}

In [None]:
# Embed the kinase gene sets with TF-IDF, cluster them with Leiden, project them
# with UMAP and draw an interactive Bokeh scatter plot.
vec = TfidfVectorizer(max_df=0.5, min_df=10)
X = vec.fit_transform(genesets.values())
# Cast the matrix before constructing AnnData; the dtype= constructor argument is
# deprecated in recent anndata releases.
adata = anndata.AnnData(X.astype('float32'))
adata.obs.index = list(genesets.keys())

sc.pp.neighbors(adata, n_neighbors=25, use_rep='X')
sc.tl.leiden(adata)
sc.tl.umap(adata, min_dist=0.0065, spread=0.8)

# Reorder observations by cluster. Copy the slice so the column assignment below
# writes to a real AnnData object rather than implicitly materialising a view.
new_order = adata.obs.sort_values(by='leiden').index.tolist()
adata = adata[new_order,:].copy()
adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

mapped_df = pd.DataFrame(adata.obsm['X_umap'])
mapped_df.columns = ['x', 'y']

mapped_df['cluster'] = adata.obs['leiden'].values
mapped_df['term'] = adata.obs.index

# Interleave the Category20 palette (even entries first, then odd entries) and
# cycle through it if there are more clusters than colours.
clusters = pd.unique(mapped_df['cluster']).tolist()
colors = list(Category20[20])[::2] + list(Category20[20])[1::2]
color_mapper = {cluster: colors[i % len(colors)] for i, cluster in enumerate(clusters)}

mapped_df['color'] = mapped_df['cluster'].apply(lambda x: color_mapper[x])

xlabel = 'UMAP 1'
ylabel = 'UMAP 2'

source2 = ColumnDataSource(
        data=dict(
            x = mapped_df.x,
            y = mapped_df.y,
            alpha = [0.7] * mapped_df.shape[0],
            colors = mapped_df['color'], 
            size = [6] * mapped_df.shape[0],
            gene_set = mapped_df['term'],
            cluster = mapped_df['cluster']
        )
    )

# Tooltip template. Every inner <div> is now closed and the outer margin has a unit;
# the original markup left two <div> elements unclosed.
hover_emb = HoverTool(name="df", tooltips="""
    <div style="margin: 10px">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
            <span style="font-size: 12px">@gene_set</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
            <span style="font-size: 12px">(@x,@y)</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
            <span style="font-size: 12px">@cluster</span>
        </div>
    </div>
    """)

tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']
title_emb = 'Gene Sets in Kinase Library Serine Threonine Kinome Atlas Library'
plot_emb = figure(width=1000, height=700, tools=tools_emb, title=title_emb, x_axis_label=xlabel, y_axis_label=ylabel)
# scatter() draws circle markers by default; circle() with a screen-unit size is
# deprecated in recent Bokeh releases.
plot_emb.scatter( 'x', 'y', source = source2, size='size',
                alpha='alpha', line_alpha=0, line_width=0.01, name="df", 
                fill_color = 'colors', 
                line_color="black", hover_fill_color="firebrick")
plot_emb.xaxis.axis_label_text_font_style = 'normal'
plot_emb.xaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_style = 'normal'
plot_emb.title.align = 'center'
plot_emb.title.text_font_size = '18px'

show(plot_emb)