# CellMarker 2.0 Processing Notebook
This notebook contains the script used to process the CellMarker 2.0 library for Harmonizome and Enrichr. A cell type-gene edgelist from [CellMarker](http://bio-bigdata.hrbmu.edu.cn/CellMarker/CellMarker_download.html) (All cell markers) was processed to ensure all cell types were properly formatted and all genes use approved and up-to-date gene symbols.

In [None]:
import pandas as pd
import datetime
import numpy as np
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm
from collections import OrderedDict

from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

## Load and Pre-Process Data

In [None]:
# Read the CellMarker spreadsheet into a DataFrame and echo it.
source_file = 'Cell_marker_All.xlsx'
cellmarker = pd.read_excel(source_file)
cellmarker

In [None]:
# Keep only the columns used downstream and collapse exact duplicate rows.
kept_columns = ['cell_name', 'tissue_type', 'species', 'cellontology_id', 'Symbol']
cellmarker = cellmarker.loc[:, kept_columns].drop_duplicates()
cellmarker

In [None]:
# Build an upper-cased lookup from column 1 to column 2 of the header-less mapping file
# (presumably alias/previous symbol -> approved symbol; confirm against the file).
mapping_table = pd.read_csv('../../mapping/mappingFile_2023.tsv', sep='\t', header=None)
lookup_keys = mapping_table[1].map(str).map(str.upper)
lookup_values = mapping_table[2].map(str).map(str.upper)
# When a key repeats, the last row wins.
mapping = dict(zip(lookup_keys, lookup_values))
mapping

In [None]:
# Upper-case every non-null symbol and translate it through `mapping`.
# Rows with a null symbol, or a symbol absent from `mapping`, become NaN
# (index alignment on assignment fills the rows that were filtered out).
symbol_present = cellmarker['Symbol'].notna()
cellmarker['Symbol'] = cellmarker.loc[symbol_present, 'Symbol'].apply(str.upper).map(mapping)
cellmarker

In [None]:
# Compose the term label as "<cell_name>_<tissue_type>_<species>".
cell_name, tissue_type, species = (cellmarker[col] for col in ('cell_name', 'tissue_type', 'species'))
cellmarker['Cell Type'] = cell_name + '_' + tissue_type + '_' + species

# Keep the three working columns and drop one specific term.
# NOTE(review): the reason this term is excluded is not recorded here — confirm.
excluded_term = 'Natural killer T (NKT) cell_Fetal kidney_Human'
cellmarker = cellmarker.loc[cellmarker['Cell Type'] != excluded_term, ['Cell Type', 'cellontology_id', 'Symbol']]
cellmarker

In [None]:
# Hand-curated label corrections: original "<cell>_<tissue>_<species>" label -> replacement label.
manual = {
    'Plasmacytoid dendritic cell(PDC)_Bone marrow_Human': 'Plasmacytoid dendritic cell(pDC)_Bone marrow_Human',
    'Mature astrocyte_Brain_Human': 'Mature Astrocyte_Brain_Human',
    'Cancer Stem cell_Colorectum_Human': 'Cancer stem cell_Colorectum_Human',
    'Collecting Duct Principal Cell_Kidney_Mouse': 'Collecting duct principal cell_Kidney_Mouse',
    'Polymorphonuclear leukocyte_Kidney_Mouse': 'Polymorphonuclear Leukocyte_Kidney_Mouse',
    'myeloid-derived suppressor cell_Skin_Human': 'Myeloid-derived suppressor cell_Skin_Human',
    'Immune Cell_Skin_Mouse': 'Immune cell_Skin_Mouse',
    'Schwann Cell_Skin_Mouse': 'Schwann cell_Skin_Mouse',
    'Innate Lymphoid cell_Spleen_Human': 'Innate lymphoid cell_Spleen_Human',
    'NECK cell_Stomach_Human': 'Neck cell_Stomach_Human',
    'Stem leydig cell_Testis_Mouse': 'Stem Leydig cell_Testis_Mouse',
    'Dendritic Cell_Undefined_Mouse': 'Dendritic cell_Undefined_Mouse',
    'lymphatic endothelial cell_Undefined_Human': 'Lymphatic endothelial cell_Undefined_Human',
}

def manualcorrect(term):
    """Return the replacement listed for `term` in `manual`, or `term` itself when none is listed."""
    return manual.get(term, term)

# Run every term label through the manual correction table.
cellmarker['Cell Type'] = cellmarker['Cell Type'].map(manualcorrect)
cellmarker

In [None]:
# Discard rows whose symbol is null (unmapped), then de-duplicate and renumber the rows.
rows_with_symbol = cellmarker[cellmarker['Symbol'].notna()]
cellmarker = rows_with_symbol.drop_duplicates().reset_index(drop=True)
cellmarker

In [None]:
# Summary counts: distinct genes, distinct terms, and total rows.
gene_total = cellmarker['Symbol'].nunique(dropna=False)
term_total = cellmarker['Cell Type'].nunique(dropna=False)
annotation_total = len(cellmarker)
print(f'{gene_total} genes, {term_total} terms, {annotation_total} annotations')

14664 genes, 7233 terms, 70878 annotations

## Process Data for SQL

### Resource

In [None]:
#(id, name, long_description, short_description, url, num_attributes, num_datasets)
(
    '113',
    'CellMarker',
    'a database of manually curated cell markers in human/mouse and web tools based on scRNA-seq data',
    'a database of cell type markers across human and mouse tissues',
    'http://bio-bigdata.hrbmu.edu.cn/CellMarker/',
    7233,
    1,
)

### Dataset

In [None]:
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, is_signed, is_continuous_valued, last_updated, directory, num_page views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural, is_archived)
# One value per line, in the column order listed above. The only value change is
# the spelling fix "associatd" -> "associated" in gene_sets_description.
(
    155,                                                # id
    'CellMarker Gene-Cell Type Associations',           # name
    'Gene-Cell Type Associations',                      # name_without_resource
    'cell type markers ',                               # description
    'gene-cell type associations by data aggregation',  # association
    'genes associated with the {0} cell type from the CellMarker Gene-Cell Type Associations dataset.',      # gene_set_description
    'sets of genes associated with each cell type from the CellMarker Gene-Cell Type Associations dataset.',  # gene_sets_description
    'cell types associated with {0} gene from the CellMarker Gene-Cell Type Associations dataset.',          # attribute_set_description
    0,                                                  # is_signed
    0,                                                  # is_continuous_valued
    '2024-03-25',                                       # last_updated
    'cellmarker',                                       # directory
    0,                                                  # num_page views
    113,                                                # resource_fk
    3,                                                  # measurement_fk
    7,                                                  # dataset_group_fk
    2,                                                  # attribute_type_fk
    1,                                                  # attribute_group_fk
    'association by data aggregation',                  # evidence_type
    'curated experimental data',                        # evidence_group
    'mixed',                                            # measurement_bias
    'cell types',                                       # attribute_type_plural
    0,                                                  # is_archived
)

### Publication

In [None]:
#(id, long_citation, short_citation, url, pmid, pubmed_url, first_author_last_name, first_author_initials, journal_abbreviation, year, title, volume, pages)
publication_row = (
    154,
    'Hu, C et al. (2022) CellMarker 2.0: an updated database of manually curated cell markers in human/mouse and web tools based on scRNA-seq data. Nucleic Acids Res. 51:D870-6',
    'Hu, Nucleic Acids Res, 2022',
    'dx.doi.org/10.1093/nar/gkac947',
    36300619,
    'https://pubmed.ncbi.nlm.nih.gov/36300619',
    'Hu',
    'C',
    'Nucleic Acids Res',
    2022,
    'CellMarker 2.0: an updated database of manually curated cell markers in human/mouse and web tools based on scRNA-seq data',
    51,
    'D870-6',
)
# NOTE(review): presumably the dataset<->publication link row (id, dataset_fk, publication_fk);
# 155 and 154 match the dataset and publication ids used in this notebook — confirm the schema.
dataset_publication_row = (238, 155, 154)

# A cell only echoes its final expression, so two bare tuples would hide the first one.
# Display both rows explicitly.
display(publication_row, dataset_publication_row)

### Naming Authority

In [None]:
#(id, name, description, url, publication_fk)
(
    107,
    'CellMarker',
    'a database of manually curated cell markers in human/mouse and web tools based on scRNA-seq data',
    'http://bio-bigdata.hrbmu.edu.cn/CellMarker/',
    154,
)

### Genes

In [None]:
# Existing gene table: upper-cased symbol -> primary key.
genes = pd.read_csv('../../tables/gene.csv')
genes['symbol'] = genes['symbol'].apply(str.upper)
genefks = genes.set_index('symbol')['id'].to_dict()
# First id handed to a gene that is missing from gene.csv.
# NOTE(review): presumably the next free id in the gene table — confirm before inserting.
index = 58359

# Gene metadata keyed by upper-cased symbol.
geneinfo = pd.read_csv('../../tables/gene_info', sep='\t')[['GeneID', 'Symbol', 'description', 'type_of_gene']]
geneinfo['Symbol'] = geneinfo['Symbol'].apply(str.upper)
geneinfo = geneinfo.set_index('Symbol')
# Upper-casing can make symbols collide. With a duplicated label `.loc[gene, col]` returns a
# Series instead of a scalar and corrupts the printed row, so keep the first row per symbol.
# NOTE(review): "first" is an arbitrary tie-break — confirm it picks the intended record.
geneinfo = geneinfo[~geneinfo.index.duplicated(keep='first')]

# Print one SQL-style tuple (id, symbol, Entrez id, description, url) per new gene and
# register the new id in `genefks`. A gene absent from gene_info raises KeyError.
for gene in cellmarker['Symbol'].unique():
    if gene not in genefks:
        # Cast to a plain int so the printed tuple never contains a NumPy scalar repr.
        geneid = int(geneinfo.loc[gene, 'GeneID'])
        print((index, gene, geneid, geneinfo.loc[gene, 'description'], f'https://ncbi.nlm.nih.gov/gene/{geneid}'), end=',\n')
        genefks[gene] = index
        index += 1

### Attributes

In [None]:
index = 394329
attributefks = {}
# Term label -> Cell Ontology id, for terms that have one (last occurrence wins).
clid = cellmarker.set_index('Cell Type')['cellontology_id'].dropna().to_dict()

# Print one attribute tuple (id, label, description, naming authority id) per term and
# remember the id assigned to each label.
# NOTE(review): the label is split on '_' and only the first three pieces are used, so a
# cell or tissue name containing '_' would be mis-parsed — confirm none do.
for term in cellmarker['Cell Type'].unique():
    parts = term.split('_')
    description = f'{parts[0]} cell from {parts[1]} tissue from {parts[2]}'
    if term in clid:
        description = f'{description} ({clid[term]})'
    print((index, term, description, 107), end=',\n')
    attributefks[term] = index
    index += 1

### Gene Sets

In [None]:
index = 135600000
genesetfks = {}
# Print one gene-set tuple per term:
# (id, label, description, 155, 2, attribute id, 0) — 155 is the dataset id used above.
# The description mirrors the one built for the attribute rows.
for term in cellmarker['Cell Type'].unique():
    parts = term.split('_')
    description = f'{parts[0]} cell from {parts[1]} tissue from {parts[2]}'
    if term in clid:
        description = f'{description} ({clid[term]})'
    print((index, term, description, 155, 2, attributefks[term], 0), end=',\n')
    genesetfks[term] = index
    index += 1

### Associations

In [None]:
# Translate every (gene, term) row into foreign keys. A symbol or term missing from the
# lookup dictionaries raises KeyError rather than silently producing NaN.
associations = pd.DataFrame({
    'gene_fk': cellmarker['Symbol'].apply(lambda x: genefks[x]),
    'gene_set_fk': cellmarker['Cell Type'].apply(lambda x: genesetfks[x]),
    'threshold_value': 1,
})
associations = associations.drop_duplicates().reset_index(drop=True)
# Offset the row numbers so they serve as association ids.
associations.index += 42000000
# Name the index only after reset_index(): reset_index(drop=True) builds a fresh unnamed
# index, so naming it earlier left the id column of the CSV without a header.
associations = associations.rename_axis('id')
associations.to_csv('../../harmonizome-update/cellmarker.csv')
associations

In [None]:
# Preview one row per distinct (term, gene) pair; the result is displayed, not stored.
pair_columns = ['Cell Type', 'Symbol']
cellmarker.drop_duplicates(subset=pair_columns)

In [None]:
# Boundary between the SQL-export cells above and the download-file cells below.
# Intentionally contains no code: referencing an undefined name here raised NameError
# and stopped "Restart Kernel -> Run All" before the download files were written.

## Create Downloads

In [None]:
# All download files are written under this directory.
output_path = 'downloads/'
# From here on only the distinct (gene, term) pairs are needed.
gene_term_pairs = cellmarker[['Symbol', 'Cell Type']]
cellmarker = gene_term_pairs.drop_duplicates().reset_index(drop=True)
cellmarker

### Gene-Attribute Matrix

In [None]:
# Gene x term indicator matrix: 1 where the pair occurs in `cellmarker`, 0 elsewhere.
# The aggregation is named by string ('max') because passing the builtin `max` to a
# pandas aggregation is deprecated and emits a FutureWarning. The result is unchanged.
binaryMatrix = pd.crosstab(cellmarker['Symbol'], cellmarker['Cell Type'], values=1, aggfunc='max').fillna(0)
# Term x gene orientation, reused by the attribute-level outputs.
binaryMatrixT = binaryMatrix.T
binaryMatrix.to_csv(f'{output_path}gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
binaryMatrix

### Gene-Attribute Edge List

In [None]:
# Upper-cased approved symbol -> Entrez Gene ID lookup.
symbol_column = 'Human, Mouse, and Rat Approved Symbol'
entrez_column = 'Entrez Gene ID(supplied by NCBI)'
geneids = pd.read_csv('../../mapping/GeneSymbolsAndIDS_2023.tsv', sep='\t')
geneids[symbol_column] = geneids[symbol_column].apply(str.upper)
geneids = geneids.set_index(symbol_column)[entrez_column].to_dict()

In [None]:
# One row per (gene, term) pair with the gene's Entrez id and a constant threshold of 1.
# Symbols missing from `geneids` get a NaN 'Gene ID'.
edgelist = (
    cellmarker
    .assign(**{'Gene ID': cellmarker['Symbol'].map(geneids), 'Threshold': 1})
    .rename(columns={'Symbol': 'Gene'})
    [['Gene', 'Gene ID', 'Cell Type', 'Threshold']]
)
edgelist.to_csv(f'{output_path}gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Distinct genes (symbol + Entrez id) appearing in the edge list.
genelist = edgelist.loc[:, ['Gene', 'Gene ID']].drop_duplicates().reset_index(drop=True)
gene_list_file = f'{output_path}gene_list_terms.txt.gz'
genelist.to_csv(gene_list_file, sep='\t', compression='gzip')
genelist

### Attribute List

In [None]:
# Distinct term labels appearing in the edge list.
attributelist = edgelist.loc[:, ['Cell Type']].drop_duplicates().reset_index(drop=True)
attribute_list_file = f'{output_path}attribute_list_entries.txt.gz'
attributelist.to_csv(attribute_list_file, sep='\t', compression='gzip')
attributelist

### Gene Set Library

In [None]:
# Write one tab-separated line per term: the term label followed by its genes.
# Terms with fewer than 5 genes are skipped. No description column is written.
with open(f'{output_path}gene_set_library_crisp.gmt', 'w') as f:
    arr = binaryMatrix.reset_index(drop=True).to_numpy(dtype=np.int_)
    attributes = binaryMatrix.columns

    gene_count, term_count = arr.shape
    for col in tqdm(range(term_count)):
        members = binaryMatrix.index[arr[:, col] == 1]
        if len(members) < 5:
            continue
        print(attributes[col], *members, sep='\t', file=f)

### Attribute Set Library

In [None]:
# Write one tab-separated line per gene: the gene symbol followed by its terms.
# Genes with fewer than 5 terms are skipped.
with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    arr = binaryMatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)
    gene_symbols = binaryMatrixT.columns

    term_count, gene_count = arr.shape
    for col in tqdm(range(gene_count)):
        members = binaryMatrixT.index[arr[:, col] == 1]
        if len(members) < 5:
            continue
        print(gene_symbols[col], *members, sep='\t', file=f)

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between gene rows: 1 - cosine distance, expanded to a square matrix.
gene_similarity_matrix = dist.pdist(binaryMatrix.to_numpy(dtype=np.int_), 'cosine')
gene_similarity_matrix = 1 - dist.squareform(gene_similarity_matrix)

# Label both axes with an unnamed copy of the gene index. Setting `.index.name = None` on a
# frame built directly from `binaryMatrix.index` would rename that shared Index object too,
# silently stripping the 'Symbol' name from `binaryMatrix` itself.
gene_labels = binaryMatrix.index.rename(None)
gene_similarity_matrix = pd.DataFrame(data=gene_similarity_matrix, index=gene_labels, columns=gene_labels)
gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between term rows: 1 - cosine distance, expanded to a square matrix.
attribute_similarity_matrix = dist.pdist(binaryMatrixT.to_numpy(dtype=np.int_), 'cosine')
attribute_similarity_matrix = 1 - dist.squareform(attribute_similarity_matrix)

# Label both axes with an unnamed copy of the term index. Setting `.index.name = None` on a
# frame built directly from `binaryMatrixT.index` would rename that shared Index object too,
# silently stripping the axis name from the matrices it came from.
attribute_labels = binaryMatrixT.index.rename(None)
attribute_similarity_matrix = pd.DataFrame(data=attribute_similarity_matrix, index=attribute_labels, columns=attribute_labels)
attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Knowledge Graph Serialization

In [None]:
# Build the knowledge-graph structures shared by the RDF, JSON and TSV serializations:
#   nodes: node id -> {"type", "properties"}; gene nodes are keyed by integer Entrez id,
#          cell type nodes by their label.
#   edges: one "is marker for" edge per row of `edgelist`.
# Columns are iterated in parallel instead of doing a `.loc` lookup for every row.
nodes = {}
edges = []

for gene_id, symbol in zip(genelist['Gene ID'], genelist['Gene']):
    # int() raises ValueError on a missing (NaN) Entrez id, as before.
    gene_id = int(gene_id)
    nodes[gene_id] = {
        "type":"gene",
        "properties": {
            "id":gene_id,
            "label":symbol
        }}

for cell_type in attributelist['Cell Type']:
    nodes[cell_type] = {
        "type": "cell type",
        "properties": {
            "label":cell_type,
            "id":cell_type
        }}

for symbol, gene_id, cell_type, threshold in zip(
        edgelist['Gene'], edgelist['Gene ID'], edgelist['Cell Type'], edgelist['Threshold']):
    gene_id = int(gene_id)
    edges.append({
        "source": gene_id,
        "relation": 'is marker for',
        "target": cell_type,
        "properties":{
            # Use the integer id here too: str() of a float-typed 'Gene ID' gave
            # "1234.0:<term>", which disagrees with source_id.
            "id":f"{gene_id}:{cell_type}",
            "source_id":gene_id,
            "source_label":symbol,
            "target_id":cell_type,
            "target_label":cell_type,
            "directed":True,
            "threshold":int(threshold)
        }})

In [None]:
# Show only the first few nodes; echoing the whole dictionary prints one entry per
# gene and per cell type and bloats the saved notebook.
dict(list(nodes.items())[:5])

### RDF

In [None]:
# Write one "gene:<id> RO:0002607 <cell type> ." line per edge, after two prefix lines.
# NOTE(review): the prefix lines carry no <...> around the IRIs and no trailing '.', and the
# object is the raw cell type label, so this does not look like valid Turtle/RDF —
# confirm what the consumer of this file expects before changing the format.
rdf_file = f'{output_path}kg_serializations/cellmarker.rdf'
with open(rdf_file, 'w') as f:
    f.write('@prefix gene: ncbi.nlm.nih.gov/gene/\n')
    f.write('@prefix RO: purl.obolibrary.org/RO_\n')
    f.write('\n')
    for edge in edges:
        props = edge['properties']
        f.write(f"gene:{props['source_id']} RO:0002607 {props['target_id']} .\n")

### JSON

In [None]:
# Dump the graph as indented JSON. json.dump() writes to the file and returns None,
# so its result is not bound to a name.
with open(f'{output_path}kg_serializations/cellmarker.json', 'w') as f:
    json.dump(
        {
            "Version":"1",
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

### TSV

#### Nodes

In [None]:
# Flatten the node dictionary into a (namespace, id, label) table and write it as TSV.
namespace_by_type = {'gene':'NCBI Entrez', 'cell type':'CellMarker'}
node_table = pd.DataFrame(nodes).T
nodeframe = pd.DataFrame({
    'namespace': node_table['type'].apply(lambda kind: namespace_by_type[kind]),
    'id': node_table['properties'].apply(lambda props: props['id']),
    'label': node_table['properties'].apply(lambda props: props['label']),
}).reset_index(drop=True)
nodeframe.to_csv(f'{output_path}kg_serializations/cellmarker_tsv/nodes.tsv', sep='\t')
nodeframe

#### Edges

In [None]:
# Flatten the edge list into a (source, relation, target, threshold) table and write it as TSV.
edgeframe = pd.DataFrame(edges)
edgeframe['threshold'] = [props['threshold'] for props in edgeframe['properties']]
edgeframe = edgeframe[['source', 'relation', 'target', 'threshold']]
edgeframe.to_csv(f'{output_path}kg_serializations/cellmarker_tsv/edges.tsv', sep='\t')
edgeframe

## Create Visualizations

In [None]:
# Reload the three matrices from the files written above, so the plots can be drawn
# without recomputing them.
gzip_tsv = dict(sep='\t', compression='gzip')
binaryMatrix = pd.read_csv('downloads/gene_attribute_matrix.txt.gz', index_col='Symbol', **gzip_tsv)
gene_similarity_matrix = pd.read_csv('downloads/gene_similarity_matrix_cosine.txt.gz', index_col='Unnamed: 0', **gzip_tsv)
attribute_similarity_matrix = pd.read_csv('downloads/attribute_similarity_matrix_cosine.txt.gz', index_col='Unnamed: 0', **gzip_tsv)

### Gene-Attribute Clustered Heat Map

In [None]:
# Clustered heat map of the gene x term indicator matrix.
sns.clustermap(
    binaryMatrix,
    cmap='seismic',
    center=0,
    figsize=(25, 25),
)

### Gene Similarity Clustered Heat Map

In [None]:
# Clustered heat map of the gene-gene cosine similarity matrix.
sns.clustermap(
    gene_similarity_matrix,
    cmap='seismic',
    center=0,
    figsize=(25, 25),
)

### Attribute Similarity Clustered Heat Map

In [None]:
# Clustered heat map of the term-term cosine similarity matrix.
sns.clustermap(
    attribute_similarity_matrix,
    cmap='seismic',
    center=0,
    figsize=(25, 25),
)

### UMAP

In [None]:
def load_gmt(file):
    """Parse tab-separated gene-set lines into an ordered ``term -> "gene1 gene2 ..."`` mapping.

    Each non-blank line is ``term<TAB>[description<TAB>]gene<TAB>gene...``. The field after
    the term is skipped only when it is empty (a blank description column). The library
    written by this notebook has no description column, and unconditionally dropping the
    second field removed the first gene of every set. A non-empty second field is
    therefore kept as a gene.

    Parameters
    ----------
    file : iterable of str
        Open text file, or any iterable yielding lines.

    Returns
    -------
    collections.OrderedDict
        Terms in file order, each mapped to its space-joined genes. Duplicate genes are
        removed while keeping first-seen order, so the result is reproducible across runs.
    """
    gmt = OrderedDict()
    for line in file:
        fields = line.strip().split('\t')
        term, geneset = fields[0], fields[1:]
        if not term:
            # Blank line: nothing to record.
            continue
        if geneset and geneset[0] == '':
            # Empty description column.
            geneset = geneset[1:]
        gmt[term] = ' '.join(dict.fromkeys(geneset))
    return gmt

In [None]:
# Read the gene set library inside a `with` block so the file handle is closed afterwards.
with open('downloads/gene_set_library_crisp.gmt', 'r') as gmt_file:
    libdict = load_gmt(gmt_file)
# Directory that receives the saved scatter plot.
scatterdir = 'images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed gene sets in 2-D with TF-IDF + UMAP and label them with Leiden clusters.

    Parameters
    ----------
    libdict : mapping of term -> space-joined gene string (as returned by `load_gmt`).
    nneighbors : `n_neighbors` passed to `sc.pp.neighbors`.
    mindist, spread : `min_dist` and `spread` passed to `sc.tl.umap`.
    maxdf, mindf : `max_df` and `min_df` passed to `TfidfVectorizer`.

    Returns
    -------
    pandas.DataFrame with columns `x`, `y` (UMAP coordinates), `cluster`
    ("Cluster <leiden label>"), `term` and `genes`, with rows ordered by Leiden label.
    """
    print("\tTF-IDF vectorizing gene set data...")
    vectorizer = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    tfidf = vectorizer.fit_transform(libdict.values())
    print(tfidf.shape)
    adata = anndata.AnnData(tfidf)
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    # n_neighbors and min_dist are the main knobs to adjust here
    sc.pp.neighbors(adata, n_neighbors=nneighbors, use_rep='X')
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder the observations by cluster label, then prefix the labels.
    terms_by_cluster = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[terms_by_cluster, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    df = pd.DataFrame(adata.obsm['X_umap'], columns=['x', 'y'])
    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    df['genes'] = [libdict[term] for term in df['term']]

    return df

In [None]:
def get_scatter_colors(df):
    """Map each distinct value of `df['cluster']` to a Category20 colour.

    Clusters are taken in order of first appearance. The palette is reordered so its
    even-indexed colours come first, then the odd-indexed ones, and it repeats after
    20 clusters.
    """
    palette = Category20[20]
    colors = list(palette)[0::2] + list(palette)[1::2]
    clusters = pd.unique(df['cluster']).tolist()
    return {cluster: colors[position % 20] for position, cluster in enumerate(clusters)}

def get_scatterplot(scatterdf):
    """Build the Bokeh scatter plot of the UMAP embedding, coloured by cluster.

    Parameters
    ----------
    scatterdf : DataFrame with columns `x`, `y`, `term` and `cluster`
        (as returned by `process_scatterplot`). It is copied, not modified.

    Returns
    -------
    The Bokeh figure, with a hover tooltip showing gene set, coordinates and cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Each tooltip row is its own closed <div>; two of the row <div>s were previously
    # left unclosed.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide tick marks and tick labels (the axis titles set below stay visible)
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = 'Gene Sets in the CellMarker Gene-Cell Type Associations Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    # One glyph per gene set; 'colors' names the per-point colour column in `source`.
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
## defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
# Only mindf is overridden; nneighbors, mindist, spread and maxdf keep their defaults.
scatter_df = process_scatterplot(libdict, mindf=3)

# Display Scatter Plot
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# save HTML if desired
# Write the plot to a standalone HTML file in `scatterdir`.
plot_title = 'Gene Sets in the CellMarker Gene-Cell Type Associations Library'
output_file(filename=f"{scatterdir}umap.html", title = plot_title)
save(plot)