# ChEA Transcription Factor Binding Site Profiles 2022 Dataset Processing
Data processing notebook for the ChEA Transcription Factor Binding Site Profiles 2022 dataset. The gene set library was downloaded from Enrichr. Gene names were mapped to current approved gene symbols and NCBI Entrez Gene IDs.

In [None]:
import pandas as pd
import datetime
import numpy as np
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm
from collections import OrderedDict

from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

In [None]:
def load_gmt(file):
    """Parse a GMT gene set library into a dict keyed by term.

    Each line is expected to be tab-separated: term, description, then the
    member genes.

    Args:
        file: Iterable of text lines (for example an open file handle).

    Returns:
        dict mapping each term to ``{'desc': str, 'geneset': set[str]}``.
        When a term appears more than once, the last occurrence wins.

    Blank lines are skipped and empty gene tokens (e.g. from trailing tabs)
    are discarded, so a sloppy file does not raise on unpacking. A line with
    only a term yields an empty description and an empty gene set.
    """
    gmt = {}
    for line in file:
        # Strip only the line terminator so an empty description column
        # between two tabs keeps its position.
        fields = line.rstrip('\r\n').split('\t')
        term = fields[0].strip()
        if not term:
            continue
        desc = fields[1].strip() if len(fields) > 1 else ''
        geneset = {gene.strip() for gene in fields[2:]} - {''}
        gmt[term] = {'desc': desc, 'geneset': geneset}
    return gmt

In [None]:
# Parse the Enrichr ChEA 2022 library. The context manager closes the file
# handle as soon as parsing is done instead of leaking it for the session.
with open('newdata/ChEA22/ChEA_2022.txt', 'r') as gmt_file:
    chea22 = load_gmt(gmt_file)

# Reshape to one row per (term, gene) pair and discard the description column.
chea22 = pd.DataFrame(chea22).T.explode('geneset').reset_index().drop(columns=['desc'])
chea22.columns = ['Transcription Factor Binding Site', 'Gene']

# Normalise term names: drop the assay prefix, hyphenate spaces, upper-case.
chea22['Transcription Factor Binding Site'] = (
    chea22['Transcription Factor Binding Site']
    .str.replace('ChIP-ChIP ', '', regex=False)
    .str.replace('ChIP-Seq ', '', regex=False)
    .str.replace(' ', '-', regex=False)
    .str.upper()
)
chea22

## Pre-process Data

In [None]:
# Lookup built from the headerless mapping file: column 1 -> column 2
# (presumably source symbol -> approved symbol; confirm against the file).
symbol_table = pd.read_csv('mapping/mappingFile_2023.tsv', sep='\t', header=None)
symbolmap = symbol_table.set_index(1)[2].to_dict()

# Lookup from approved symbol to Entrez Gene ID.
id_table = pd.read_csv('mapping/GeneSymbolsAndIDS_2023.tsv', sep='\t')
geneids = (
    id_table
    .set_index('Human, Mouse, and Rat Approved Symbol')
    ['Entrez Gene ID(supplied by NCBI)']
    .to_dict()
)

In [None]:
# Keep only rows whose gene has an entry in the symbol map. The explicit
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the original (SettingWithCopy hazard).
has_mapping = chea22['Gene'].apply(lambda x: x in symbolmap)
chea22 = chea22[has_mapping].copy()

# Translate to the mapped symbol, then attach its Entrez Gene ID. A mapped
# symbol without an ID raises KeyError rather than silently yielding NaN.
chea22['Gene'] = chea22['Gene'].apply(lambda x: symbolmap[x])
chea22['Gene ID'] = chea22['Gene'].apply(lambda x: geneids[x])

# Different source symbols can collapse onto the same mapped symbol.
chea22 = chea22.drop_duplicates()

## Process Data for SQL

### Dataset

### Publication

### Genes

In [None]:
# Entrez Gene ID -> database primary key. Entries from newgenes.csv are
# applied second, so they override gene.csv on any shared ID.
genefks = {}
for gene_table_path in ('tables/gene.csv', 'tables/newgenes.csv'):
    gene_table = pd.read_csv(gene_table_path)
    genefks.update(gene_table.set_index('ncbi_entrez_gene_id')['id'].to_dict())

In [None]:
# Symbol/description lookup from the gene_info table, indexed by GeneID.
geneinfo = pd.read_csv('tables/gene_info', sep='\t', index_col='GeneID')
geneinfo = geneinfo.get(['Symbol', 'description'])

# Gene table indexed by Entrez Gene ID.
genelist = pd.read_csv('tables/gene.csv').set_index('ncbi_entrez_gene_id')

# Print every Entrez Gene ID in the dataset that has no row in gene.csv.
unregistered_ids = [
    gene_id
    for gene_id in chea22['Gene ID'].unique()
    if gene_id not in genelist.index
]
for gene_id in unregistered_ids:
    print(gene_id)



In [None]:
# Remove MMD2 from the dataset.
# NOTE(review): the reason is not visible here - presumably it cannot be
# matched to a gene row; confirm.
chea22 = chea22[chea22['Gene'] != 'MMD2']

# Assign consecutive primary keys, starting at 57241, to genes missing from
# the gene table, and print one SQL-style tuple per new row.
index = 57241
newgenes = {'CSNKA2IP': 111064647, 'SMIM38': 107984345, 'TMEM271': 112441426}
for gene, geneid in newgenes.items():
    genefks[geneid] = index
    gene_row = (
        index,
        gene,
        geneid,
        geneinfo.loc[geneid, 'description'],
        'https://ncbi.nlm.nih.gov/gene/' + str(geneid),
    )
    print(gene_row, end=',\n')
    index += 1

### Attributes

In [None]:
# Attribute rows indexed by name, restricted to naming authority 35.
attributes = pd.read_csv('tables/attribute.csv', index_col='name_from_naming_authority')
from_authority_35 = attributes['naming_authority_fk'] == 35
attributes = attributes.loc[from_authority_35]
attributes

In [None]:
# Attribute name -> primary key, seeded from the rows already in the table.
attributefks = attributes['id'].to_dict()

# First primary key handed to attributes that are not in the table yet.
index = 361470

attribute_description = 'transcription factor binding site profile identified as [transcription factor gene symbol]-[publication pmid]-[cell or tissue sampled/organism studied]'
for attribute in chea22['Transcription Factor Binding Site'].unique():
    if attribute in attributes.index:
        continue
    attributefks[attribute] = index
    # SQL-style tuple for the new attribute row; 35 is the naming authority key.
    print((index, attribute, attribute_description, 35), end=',\n')
    index += 1

### Gene Sets

In [None]:
# Attribute name -> gene set primary key, numbered consecutively from 134400000.
genesetfks = {}
index = 134400000
geneset_description = 'transcription factor binding site profile identified as [transcription factor gene symbol]-[publication pmid]-[cell or tissue sampled/organism studied]'
for attribute in chea22['Transcription Factor Binding Site'].unique():
    genesetfks[attribute] = index
    # SQL-style tuple for the gene set row. 143 and 41 are fixed foreign keys
    # (NOTE(review): presumably dataset and attribute type - confirm).
    geneset_row = (index, attribute, geneset_description, 143, 41, attributefks[attribute])
    print(geneset_row, end=',\n')
    index += 1

### Associations

In [None]:
# Gene / gene-set association rows with foreign keys resolved; every
# association gets threshold_value 1.
associations = chea22.assign(
    gene_set_fk=chea22['Transcription Factor Binding Site'].map(lambda tfbs: genesetfks[tfbs]),
    gene_fk=chea22['Gene ID'].map(lambda gene_id: genefks[gene_id]),
    threshold_value=1,
)
# Shift the row labels by 23000000; they are written out as the first CSV column.
associations.index = associations.index + 23000000
associations = associations[['gene_fk', 'gene_set_fk', 'threshold_value']]
associations.to_csv('harmonizome-update/chea22.csv')
associations

## Create Downloads

In [None]:
# Directory prefix (with trailing slash) prepended to every download file name below.
output_path = 'newdata/ChEA22/downloads/'

### Gene-Attribute Binary Matrix

In [None]:
# Gene x attribute count matrix; after the earlier drop_duplicates each
# (gene, attribute) pair should occur at most once, making the entries 0/1.
binarymatrix = pd.crosstab(
    index=chea22['Gene'],
    columns=chea22['Transcription Factor Binding Site'],
)
# Attribute x gene orientation, reused by later cells.
binarymatrixT = binarymatrix.transpose()
binarymatrix.to_csv(
    output_path + 'gene_attribute_matrix.txt.gz',
    sep='\t',
    compression='gzip',
)
binarymatrix

### Gene-Attribute Edge List

In [None]:
# One row per gene-attribute association, written as a gzipped TSV.
edge_columns = ['Gene', 'Gene ID', 'Transcription Factor Binding Site']
edgelist = chea22[edge_columns]
edgelist.to_csv(
    output_path + 'gene_attribute_edges.txt.gz',
    sep='\t',
    compression='gzip',
)
edgelist

### Gene List

In [None]:
# Distinct (symbol, Entrez ID) pairs, renumbered from 0.
gene_pairs = chea22[['Gene', 'Gene ID']]
geneslist = gene_pairs.drop_duplicates().reset_index(drop=True)
geneslist.to_csv(
    output_path + 'gene_list_terms.txt.gz',
    sep='\t',
    compression='gzip',
)
geneslist

### Attribute List

In [None]:
# Distinct attribute names as a single-column frame, renumbered from 0.
attribute_names = chea22[['Transcription Factor Binding Site']]
attributeslist = attribute_names.drop_duplicates().reset_index(drop=True)
attributeslist.to_csv(
    output_path + 'attribute_list_entries.txt.gz',
    sep='\t',
    compression='gzip',
)
attributeslist

### Gene Set Library

In [None]:
# Write one line per attribute (term followed by its member genes, tab
# separated, no description column), keeping only sets with >= 5 genes.
with open(output_path+'gene_set_library_crisp.gmt', 'w') as f:
    # Boolean gene x attribute membership mask, computed once.
    gene_membership = binarymatrix.to_numpy(dtype=np.int_) == 1
    gene_symbols = binarymatrix.index

    # The column labels are iterated directly rather than bound to a variable
    # named `attributes`, which would overwrite the attribute table DataFrame
    # defined earlier in the notebook and break re-running those cells.
    for column, term in enumerate(tqdm(binarymatrix.columns)):
        members = gene_symbols[gene_membership[:, column]]
        if len(members) >= 5:
            print(term, *members, sep='\t', end='\n', file=f)

### Attribute Set Library

In [None]:
# Write one line per gene (gene followed by the attributes that contain it,
# tab separated), keeping only genes found in >= 5 attributes.
with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    attribute_membership = binarymatrixT.to_numpy(dtype=np.int_) == 1
    attribute_names = binarymatrixT.index

    for column, gene_symbol in enumerate(tqdm(binarymatrixT.columns)):
        containing = attribute_names[attribute_membership[:, column]]
        if len(containing) >= 5:
            print(gene_symbol, *containing, sep='\t', end='\n', file=f)

### Gene Similarity Matrix

In [None]:
# Pairwise cosine distance between gene rows, expanded to a square matrix and
# converted to similarity (1 - distance).
gene_profiles = binarymatrix.to_numpy(dtype=np.int_)
gene_distances = dist.squareform(dist.pdist(gene_profiles, 'cosine'))

gene_similarity_matrix = pd.DataFrame(
    data=1 - gene_distances,
    index=binarymatrix.index,
    columns=binarymatrix.index,
)
# Clear the axis names so the header row of the TSV has no leading label.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(
    output_path + 'gene_similarity_matrix_cosine.txt.gz',
    sep='\t',
    compression='gzip',
)
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine distance between attribute rows, expanded to a square matrix
# and converted to similarity (1 - distance).
attribute_profiles = binarymatrixT.to_numpy(dtype=np.int_)
attribute_distances = dist.squareform(dist.pdist(attribute_profiles, 'cosine'))

attribute_similarity_matrix = pd.DataFrame(
    data=1 - attribute_distances,
    index=binarymatrixT.index,
    columns=binarymatrixT.index,
)
# Clear the axis names so the header row of the TSV has no leading label.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(
    output_path + 'attribute_similarity_matrix_cosine.txt.gz',
    sep='\t',
    compression='gzip',
)
attribute_similarity_matrix

### Knowledge Graph Serialization

In [None]:
# Knowledge graph containers: `nodes` is keyed by node id (Entrez Gene ID for
# genes, attribute name for binding site profiles); `edges` is a flat list.
nodes = {}
edges = []

# Gene nodes. Columns are iterated directly instead of doing one .loc lookup
# per row; int() turns the ID into a plain Python int for JSON serialisation.
for gene_id, gene_symbol in zip(geneslist['Gene ID'], geneslist['Gene']):
    gene_id = int(gene_id)
    nodes[gene_id] = {
        "type":"gene",
        "properties": {
            "id":gene_id,
            "label":gene_symbol
        }}

# Transcription factor binding site profile nodes; the name is both id and label.
for tfbsprofile in attributeslist['Transcription Factor Binding Site']:
    nodes[tfbsprofile] = {
        "type":"transcription factor binding site profile",
        "properties": {
            "label":tfbsprofile,
            "id":tfbsprofile
        }}

# One directed gene -> profile edge per association.
for gene_id, gene_symbol, tfbsprofile in zip(
        edgelist['Gene ID'],
        edgelist['Gene'],
        edgelist['Transcription Factor Binding Site']):
    gene_id = int(gene_id)
    edges.append({
        "source": gene_id,
        "relation": "has evidence",
        "target": tfbsprofile,
        "properties":{
            # Built from the int-cast ID so it matches "source"/"source_id"
            # even if the Gene ID column is stored as float ("123", not "123.0").
            "id":str(gene_id)+":"+tfbsprofile,
            "source_id":gene_id,
            "source_label":gene_symbol,
            "target_label":tfbsprofile,
            "directed":True,
            "threshold":1
        }})

#### RDF

In [None]:
# Write the edges as "gene:<id> RO:0002558 <target label> ." triples after two
# prefix lines and a blank line.
# NOTE(review): the prefix lines have no <...> IRI brackets or terminating
# '.', and the object is a bare label - confirm whether strict Turtle is
# expected by consumers of this file.
with open(output_path+'kg_serializations/chea22.rdf', 'w') as f:
    header_lines = (
        '@prefix gene: ncbi.nlm.nih.gov/gene/',
        '@prefix RO: purl.obolibrary.org/RO_',
        '',
    )
    for header_line in header_lines:
        print(header_line, file=f)

    for edge in edges:
        edge_properties = edge['properties']
        subject = 'gene:'+str(edge_properties['source_id'])
        print(subject, 'RO:0002558', edge_properties['target_label'], end=' .\n', file=f)

#### JSON

In [None]:
# Serialise the graph as indented JSON. json.dump writes to the file and
# returns None, so its result is deliberately not bound to a name.
with open(output_path+'kg_serializations/chea22.json', 'w') as f:
    json.dump(
        {
            "Version":"1", 
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

#### TSV

In [None]:
# Flatten the node dict into a (namespace, id, label) table for the TSV export.
node_namespaces = {
    'gene':'NCBI Entrez',
    'transcription factor binding site profile':'ChEA',
}
nodeframe = pd.DataFrame(nodes).T
nodeframe['id'] = nodeframe['properties'].apply(lambda props: props['id'])
nodeframe['label'] = nodeframe['properties'].apply(lambda props: props['label'])
nodeframe['namespace'] = nodeframe['type'].apply(lambda node_type: node_namespaces[node_type])
nodeframe = nodeframe[['namespace', 'id', 'label']].reset_index(drop=True)
nodeframe.to_csv(output_path+'kg_serializations/chea22_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Flatten the edge list into a (source, relation, target, threshold) table.
edgeframe = pd.DataFrame(edges)
edgeframe['threshold'] = edgeframe['properties'].apply(lambda props: props['threshold'])
edgeframe = edgeframe[['source', 'relation', 'target', 'threshold']]
edgeframe.to_csv(
    output_path+'kg_serializations/chea22_tsv/edges.tsv',
    sep='\t',
)
edgeframe

## Create Visualizations

### Gene-Attribute Clustered Heatmap

In [None]:
# Hierarchically clustered heatmap of the gene x attribute matrix, with the
# diverging colormap centred on 0.
sns.clustermap(
    binarymatrix,
    cmap='seismic',
    center=0,
)

### Gene-Gene Similarity Clustered Heatmap

In [None]:
# Hierarchically clustered heatmap of gene-gene cosine similarity, with the
# diverging colormap centred on 0.
sns.clustermap(
    gene_similarity_matrix,
    cmap='seismic',
    center=0,
)

### Attribute-Attribute Similarity Clustered Heatmap

In [None]:
# Hierarchically clustered heatmap of attribute-attribute cosine similarity,
# with the diverging colormap centred on 0.
sns.clustermap(
    attribute_similarity_matrix,
    cmap='seismic',
    center=0,
)

### UMAP

In [None]:
def load_gmt(file):
    """Parse a description-less GMT file into ``term -> "gene gene ..."``.

    Each line is expected to be a term followed by its member genes, all tab
    separated. Note that this redefines the earlier ``load_gmt`` in the
    notebook, which expects an extra description column.

    Args:
        file: Iterable of text lines (for example an open file handle).

    Returns:
        OrderedDict mapping each term, in file order, to its unique genes
        joined by single spaces, ready for text vectorisation.

    Blank lines are skipped so they do not create an empty-string term, empty
    gene tokens are dropped, and duplicates are removed while keeping first-seen
    order so the joined string is reproducible across runs.
    """
    gmt = OrderedDict()
    for line in file:
        term, *geneset = line.strip().split('\t')
        if not term:
            continue
        # dict.fromkeys de-duplicates like set() but preserves order.
        unique_genes = dict.fromkeys(gene for gene in geneset if gene)
        gmt[term] = ' '.join(unique_genes)
    return gmt

In [None]:
# Reload the gene set library written above; the context manager closes the
# file handle once parsing finishes.
with open('newdata/ChEA22/downloads/gene_set_library_crisp.gmt', 'r') as library_file:
    libdict = load_gmt(library_file)

# Directory that receives the rendered UMAP page.
scatterdir = 'newdata/ChEA22/images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed gene sets in 2-D with UMAP and label them with Leiden clusters.

    Args:
        libdict: Mapping of term -> space-separated gene string.
        nneighbors: ``n_neighbors`` passed to ``sc.pp.neighbors``.
        mindist: ``min_dist`` passed to ``sc.tl.umap``.
        spread: ``spread`` passed to ``sc.tl.umap``.
        maxdf: ``max_df`` passed to ``TfidfVectorizer``.
        mindf: ``min_df`` passed to ``TfidfVectorizer``.

    Returns:
        DataFrame with columns ``x``, ``y``, ``cluster``, ``term`` and
        ``genes``, one row per gene set, ordered by Leiden cluster.
    """
    print("\tTF-IDF vectorizing gene set data...")
    vectorizer = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    tfidf = vectorizer.fit_transform(libdict.values())
    print(tfidf.shape)
    adata = anndata.AnnData(tfidf)
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    # n_neighbors and min_dist are the main knobs to tune here.
    sc.pp.neighbors(adata, n_neighbors=nneighbors)
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder observations so members of the same cluster sit together.
    terms_by_cluster = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[terms_by_cluster, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    df = pd.DataFrame(adata.obsm['X_umap'])
    df.columns = ['x', 'y']

    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    df['genes'] = [libdict[term] for term in df['term']]

    return df

In [None]:
def get_scatter_colors(df):
    """Map each distinct cluster label in ``df['cluster']`` to a colour.

    Colours come from the Category20 palette, taking the even-indexed entries
    first and then the odd-indexed ones, and wrap around after 20 clusters.

    Returns:
        dict of cluster label -> colour string, in order of first appearance.
    """
    palette = list(Category20[20])
    colors = palette[::2] + palette[1::2]
    clusters = pd.unique(df['cluster']).tolist()
    return {cluster: colors[position % 20] for position, cluster in enumerate(clusters)}

def get_scatterplot(scatterdf):
    """Build the interactive Bokeh UMAP scatter plot of the gene sets.

    Args:
        scatterdf: DataFrame with at least the columns ``x``, ``y``, ``term``
            and ``cluster`` (as returned by ``process_scatterplot``). It is
            copied, not modified.

    Returns:
        The Bokeh figure: one point per gene set, coloured by cluster, with a
        hover tooltip showing the term, its coordinates and its cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Each tooltip section is closed before the next one opens; the markup
    # previously left two <div> elements unclosed.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    # Column names here are the @fields referenced by the tooltip template.
    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide tick marks and tick labels (UMAP coordinates carry no units)
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = 'Gene Sets in ChEA Transcription Factor Binding Site Profiles 2022 Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    # The returned renderer is not needed, so it is not bound to a name.
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
# process_scatterplot defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
# Settings used for this library:
umap_settings = dict(
    nneighbors=30,
    mindist=0.01,
    spread=6.5,
    maxdf=0.5,
    mindf=15,
)
scatter_df = process_scatterplot(libdict, **umap_settings)

# Build the figure and display it inline in the notebook.
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# Save the plot as a standalone HTML page. `scatterdir` already ends with a
# slash, so it is stripped before joining to avoid a doubled "//" in the path.
output_file(
    filename=f"{scatterdir.rstrip('/')}/umap.html",
    title='Gene Sets in ChEA Transcription Factor Binding Site Profiles 2022 Library',
)
save(plot)