# hTFtarget ETL Appyter
### Authors
Ido Diamant - Bioinformatics Software Engineer

Ma’ayan Lab, Mount Sinai Center for Bioinformatics, Department of Pharmacological Sciences  
Icahn School of Medicine at Mount Sinai, New York, NY 10029 USA
### hTFtarget 2022 Dataset
**Genes: 24455**  
**Terms: 1710**  
**Data Source:** http://bioinfo.life.hust.edu.cn/static/hTFtarget/file_download/

[hTFtarget](http://bioinfo.life.hust.edu.cn/hTFtarget) is a database of human transcription factors (TFs) and their target genes. It provides tools for browsing and analyzing TF-target regulation across tissues and cell lines.

This appyter takes data from the hTFtarget human transcription factor database and outputs files that are usable for Machine Learning and other applications. It processes the [TF-Target-information.txt](http://bioinfo.life.hust.edu.cn/static/hTFtarget/file_download/tf-target-infomation.txt) file downloaded on 09-22-2022.
  
The Appyter uses the NCBI database to map the gene names to a set of approved gene symbols so that synonymous genes are mapped to the same symbol.

The Appyter creates gene and attribute similarity matrices, which contain the Jaccard Index between any two genes or attributes.
    
The following output files are made available for download:  
* A binary matrix
* Gene list
* Attribute list
* A gene set library: for each attribute (TF:tissue pair), a list of genes that are associated with the attribute
* An attribute set library: for each gene, a list of attributes (TFs and tissues) that are associated with each gene
* Gene-gene similarity matrix
* Attribute-attribute similarity matrix
* Gene-attribute edge list: a list of gene-attribute pairs and the strength of each association
* Serialized data for Knowledge Graph ingestion: a list of gene and TF:tissue nodes, and gene &rarr; TF:Tissue edges  
  
A ZIP archive containing these files is provided at the bottom of the report.

In [None]:
#%%appyter init
# Appyter bootstrap: hand appyter's magic a callable that returns this
# notebook's globals() so the %%appyter cell magics used below are available.
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import os
import sys
from datetime import date
from PIL import Image

import numpy as np
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
import scanpy as sc
from IPython.display import FileLink, FileLinks

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20

In [None]:
# Load IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local harmonizome package) are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Initialization

In [None]:
%%appyter hide_code
{# Declares the 'data' section of the appyter input form; the input fields declared in the next cell attach to it via section='data'. #}
{% do SectionField(
    name= 'data',
    title= 'Upload Data',
    img='load_icon.png'
)%}

In [None]:
%%appyter code_eval
{# Help text shown in the 'data' section, pointing users at the hTFtarget download page. #}
{% do DescriptionField(
    name= 'Description',
    text= 'The example below was sourced from <a href="http://bioinfo.life.hust.edu.cn/hTFtarget#!/download" target ="_blank">bioinfo.life.hust.edu</a>. If clicking on the example does not work, it should be downloaded directly from the source.',
    section='data'
)%}

{# File upload for the TF-target table. The keyword was previously misspelled ('constrant'), so no file-name check was applied; the pattern now accepts any name ending in '.txt'. #}
{% set df_file = FileField(
    constraint= '.*[.]txt$',
    name= 'tf_target',
    label= 'Human Transcription Factors and Targets (txt)',
    default= 'TF-Target-information.txt',
    examples= {
        'TF-Target-information.txt': 'http://bioinfo.life.hust.edu.cn/static/hTFtarget/file_download/tf-target-infomation.txt'
    },
    section= 'data'
)%}

In [None]:
# Configure Bokeh so the show() calls further down render inside the notebook.
output_notebook()

## Create Output

In [None]:
# Prefix shared by every generated output file name.
output_name = 'hTFtarget'

# Directory that receives all generated files. exist_ok=True makes creation
# idempotent and avoids the check-then-create race of a separate exists() test.
path = 'output'
os.makedirs(path, exist_ok=True)

# Load Data

In [None]:
%%appyter code_exec

# Load the uploaded hTFtarget table into a DataFrame, using the first row as
# the header. The file argument is a template value filled in by appyter when
# this cell is rendered.
# NOTE(review): the separator literal is written with a doubled backslash;
# confirm it reaches pandas as a single tab character after rendering.
df = pd.read_csv(
    {{df_file}},
    sep='\\t', header=0)

In [None]:
# Preview the first rows of the raw table and report its (rows, columns) shape.
display(df.head())
df.shape

## Load and Save Mapping Dictionaries

In [None]:
# Fetch the two lookup tables used below: symbol_lookup is passed to
# uf.map_symbols, and geneid_lookup to uf.gene_list and the knowledge-graph
# serialization. Presumably synonym -> approved symbol and symbol -> NCBI gene
# ID respectively; confirm against harmonizome.lookup.
symbol_lookup, geneid_lookup = lookup.get_lookups()

## Pre-process Data

In [None]:
# Give the three columns the names the rest of the notebook relies on.
df.columns = ['TF', 'Gene Symbol', 'Tissue']

# Remember the row count so the number of dropped duplicates can be reported.
dupes = len(df)
df.drop_duplicates(inplace=True)

display(df.head())
display(df.shape)
print(f"Detected {dupes - len(df)} duplicate entries, keeping first instance of each.")

## Filter Data

## Map Gene Symbols to Up-to-Date Approved Gene Symbols

In [None]:
# uf.map_symbols is applied to a frame indexed by gene symbol (presumably it
# rewrites the index via symbol_lookup -- confirm in
# harmonizome.utility_functions), so index by symbol first.
df.set_index('Gene Symbol', inplace=True)
dupes = df.shape[0]

# Re-apply the mapping until a pass leaves the row count unchanged.
# The previous loop updated its counter at the end of every iteration, which
# made its own condition false immediately, so it could never run more than
# once regardless of whether the mapping had converged.
rows_before_pass = None
while rows_before_pass != df.shape[0]:
    rows_before_pass = df.shape[0]
    df = uf.map_symbols(df, symbol_lookup, remove_duplicates=True)

# Total number of rows removed by the mapping (name kept for compatibility).
count = dupes - df.shape[0]

df.reset_index(inplace=True)
display(df.head())
display(df.shape)

print(f"Found {count} entries with synonymous genes, keeping first instance of each.")

# Each Tissue cell is a comma-separated string; turn it into a list so it can
# be exploded into one row per tissue in the next step.
df['Tissue'] = df['Tissue'].apply(str.split, sep=',')

In [None]:
# Maps capitalized raw tissue / cell-line labels from the source file to the
# cleaned-up labels used in the output. Labels not listed here are kept as-is.
tissue_lookup = {
    'Adult hspcs':'Adult HSPCs',
    'Bcbl-1 cells':'BCBL-1 cells',
    'Cell:hl60':'HL-60',
    'Cell:human bronchial epithelial cell line':'Bronchial epithelial cell line',
    'Cell:human embryonic kidney cells':'Embryonic kidney cells',
    'Cell:luhmes':'LUHMES',  # was misspelled 'LHUMES'
    'Cell:mesenchymal stem cells':'Mesenchymal stem cells',
    'Cell:primary human memory b cells':'Memory B cells',
    'Cell:rhabdomyoblast':'Rhabdomyoblast',
    'Cells:caco2':'Caco-2',
    'Fetal proes':'Fetal proES',
    'Human aortic endothelial cell':'Aortic endothelial cell',
    'Human cortex':'Cortex',
    'Human embryonic stem cell':'Embryonic stem cells',
    'Human fetal osteoblasts':'Fetal osteoblasts',
    'Human normal breast epithelial cells':'Normal breast epithelial cells',
    'Lung/bronchus':'Bronchial',
    'Marrow':'Bone marrow',
    'Pancrea':'Pancreas',
    'Patient':'Other',
    'Patient ccrcc':'ccRCC',
    'Primary human neonatal keratinocytes':'Neonatal keratinocytes',
    'Purified cardiomyocyte g296s mutants':'Purified cardiomyocyte G296S mutants',
    'Retinal':'Retina',
    'Synovial fluid and peripheral blood':'Synovial fluid',
    'Unclear':'Other',
    'Unknown':'Other'
}

def map_tissue(tissue):
    """Return the normalized label for *tissue*, or *tissue* itself if unmapped.

    The lookup is case-sensitive: keys in ``tissue_lookup`` are written in
    ``str.capitalize`` form, so callers should capitalize the label first.
    """
    return tissue_lookup.get(tissue, tissue)

# One row per individual tissue instead of one row per list of tissues.
df = df.explode('Tissue')

# Capitalize each label (the lookup keys are in that form), then normalize it.
capitalized = df['Tissue'].apply(str.capitalize)
df['Tissue'] = capitalized.apply(map_tissue)

# Different raw labels can normalize to the same tissue; collapse those rows.
dupes = len(df)
df.drop_duplicates(inplace=True)

display(df.head())
display(df.shape)
print(f"Found {dupes - len(df)} identical tissue entries, keeping first instance of each.")

# Analyze Data

## Generate Dataset Statistics

In [None]:
# Summary table: number of genes, number of terms, and mean gene-set size.
stats = pd.DataFrame()
stats.index = ["Genes", "Terms", "Mean Genes/Term"]
genestats = df.groupby("Gene Symbol")

# Collapse TF and tissue into the single "TF:Tissue" attribute label that
# every downstream step uses as the term name.
df['TF:Tissue'] = df['TF'] + ":" + df["Tissue"]
df.drop(columns=['TF', 'Tissue'], inplace=True)
tfstats = df.groupby("TF:Tissue")

# Mean genes per term is the average number of distinct genes in each term's
# gene set. The previous value (#genes / #terms) only equals this when every
# gene belongs to exactly one term.
mean_genes_per_term = float(tfstats["Gene Symbol"].nunique().mean())
stats["Statistics"] = [len(genestats), len(tfstats), str(round(mean_genes_per_term, 3))]
stats.head()

In [None]:
# Per-gene summary statistics for the rows grouped by "Gene Symbol".
genestats.describe()

In [None]:
# Per-term summary statistics for the rows grouped by "TF:Tissue".
tfstats.describe()

## Create Binary Matrix

In [None]:
# Build the gene/attribute matrix from the gene-indexed pair table.
# Presumably rows are genes and columns are TF:Tissue terms, with nonzero
# entries marking an association -- confirm in harmonizome.utility_functions.
binary_matrix = uf.binary_matrix(df.set_index('Gene Symbol'))
display(binary_matrix.head())
binary_matrix.shape

In [None]:
# Save the matrix under the output directory, requesting uint8 values.
uf.save_data(binary_matrix, path, output_name + '_binary_matrix',
            dtype=np.uint8)

## Create Gene List

In [None]:
# Derive the gene list from the matrix and geneid_lookup. Its index is
# iterated as gene symbols when the knowledge-graph nodes are built below.
gene_list = uf.gene_list(binary_matrix, geneid_lookup)
display(gene_list.head())
gene_list.shape

In [None]:
# Save the gene list as a TSV file.
# NOTE(review): index=False is passed here although the index is later used
# as the gene symbol; confirm the symbols are still written to the file.
uf.save_data(gene_list, path, output_name + '_gene_list',
             ext= 'tsv', index=False)

## Create Attribute List

In [None]:
# Derive the attribute list from the matrix. Its index is iterated as
# "TF:Tissue" labels when the knowledge-graph nodes are built below.
attribute_list = uf.attribute_list(binary_matrix)
display(attribute_list.head())
attribute_list.shape

In [None]:
# Save the attribute list as a TSV file.
uf.save_data(attribute_list, path, output_name + '_attribute_list',
             ext= 'tsv')

## Create Gene Set Library and Attribute Set Library

In [None]:
# Write the gene set library. The Gene Set Histogram cell later reloads this
# file as a dated .gmt from the output directory.
# NOTE(review): the meaning of the 'up' argument is defined by uf.save_setlib.
uf.save_setlib(binary_matrix, 'gene', 'up', path, output_name + '_gene_set')

In [None]:
# Write the attribute set library (the per-gene counterpart of the gene set
# library), using the same helper with the 'attribute' mode.
uf.save_setlib(binary_matrix, 'attribute', 'up', path, output_name + '_attribute_set')

## Create Gene Similarity Matrix

In [None]:
# Gene-gene similarity using the 'jaccard' metric, requested in sparse form.
gene_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
gene_similarity_matrix.head()

In [None]:
# Save the gene similarity matrix as float32 with 'npz' compression.
# symmetric=True is forwarded to uf.save_data; how it is used is up to that helper.
uf.save_data(gene_similarity_matrix, path, 
             output_name + '_gene_similarity_matrix', 
            compression= 'npz', symmetric=True, dtype=np.float32)

## Create Attribute Similarity Matrix

In [None]:
# Attribute-attribute similarity: same helper and metric as for genes, applied
# to the transposed matrix so that attributes are compared instead.
attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
# Save the attribute similarity matrix as float32. Unlike the gene similarity
# matrix, no compression argument is passed here.
uf.save_data(attribute_similarity_matrix, path, 
             output_name + '_attribute_similarity_matrix', 
            symmetric=True, dtype=np.float32)

## Create Gene-Attribute Edge List

In [None]:
# Build the gene-attribute edge list from the matrix and save it as TSV.
# The knowledge-graph cell below reads each entry's index as a
# (gene, attribute) pair and its value as the association weight.
edge_list = uf.edge_list(binary_matrix)
uf.save_data(edge_list, path, output_name + '_edge_list',
             ext='tsv')

## Serialize Data for Knowledge Graph Ingestion

In [None]:
# Knowledge-graph nodes keyed by label: one per gene and one per TF:Tissue term.
nodes = {}

# Gene nodes are identified by the ID that geneid_lookup holds for the symbol.
for gene in gene_list.index:
    gene_id = geneid_lookup[gene]  # avoid shadowing the builtin id()
    nodes[gene] = {
        "type": "gene",
        "properties": {
            "id": str(gene_id),
            "label": gene,
            "URI": "ncbi.nlm.nih.gov/gene/" + str(gene_id)
        }}

# TF:Tissue nodes. Split on the first ':' only, so a tissue label that itself
# contains ':' is kept whole instead of being cut at its own colon.
for tf in attribute_list.index:
    tf_name, _, tissue = tf.partition(':')
    nodes[tf] = {
        "type": "TF:Tissue",
        "properties": {
            "id": tf_name,
            "label": tf,
            "URI": "http://bioinfo.life.hust.edu.cn/hTFtarget#!/targets/chipseq_tf?tf=" + str(tf_name),
            "tissue": tissue
        }}

In [None]:
# Knowledge-graph edges: one directed gene -> TF:Tissue edge per nonzero entry
# of the edge list.
edges = []

# Walk the (gene, attribute) index and the weights together. This replaces
# integer-key lookups (edge_list[i]), which depend on pandas treating integer
# keys as positions on a non-integer index -- behaviour pandas has deprecated.
for (source, target), weight in zip(edge_list.index, edge_list):
    if not weight:
        continue  # no association between this gene and this term
    edges.append({
        "source": source,
        "relationship": 'regulated by',
        "target": target,
        "properties": {
            "id": source + ' -> ' + target,
            "source_id": str(geneid_lookup[source]),
            # The TF name is the part of the label before the first ':'.
            "target_id": target.split(':')[0],
            "directed": True
        }})

In [None]:
# Write the nodes and edges to a single JSON document in the output directory.
# json.dump returns None, so its result is no longer bound to a name.
kg_file = os.path.join(path, output_name + "_serialization_for_kg.json")
with open(kg_file, "w") as serialize:
    json.dump(
        {
            "Version": "1",
            "nodes": nodes,
            "edges": edges
        }, serialize, indent=4)

# Visualize Data

## Gene Set Histogram

In [None]:
# Reload the gene set library written earlier. The file name embeds the
# current year and month, so it must match the stamp used when it was saved.
# With read_csv's default separator the tab-separated lines are not split, so
# each line is expected to arrive as one string in column 0.
geneSetLibrary = pd.read_csv(path+'/hTFtarget_gene_set'+date.today().strftime('_%Y_%m')+'.gmt',
                            header=None)

# First tab-separated field is the term name (used as the index); the
# remaining fields are joined with spaces. Statement order matters here: the
# index is replaced before column 0 is reassigned, so the new values line up
# with the new index.
geneSetLibrary.index = geneSetLibrary[0].apply(str.split, sep='\t').str[0]
geneSetLibrary[0] = geneSetLibrary[0].apply(str.split, sep='\t').str[1:].apply(' '.join)
# Keep only the first term for each distinct gene-set string.
geneSetLibrary.drop_duplicates(0, inplace=True)
# Whitespace split turns each string back into a list of genes and drops any
# empty fields.
geneSetLibrary[0] = geneSetLibrary[0].apply(str.split).str[:]

geneSetLibrary.columns=['Gene Set']
geneSetLibrary['Gene Set Length'] = geneSetLibrary['Gene Set'].apply(len)

# Summary of the gene sets grouped by their length.
geneSetLibrary.groupby('Gene Set Length').describe()

In [None]:
# Histogram of gene-set sizes: one bar per distinct length, with height equal
# to the number of gene sets of that length. The data source is built from the
# grouped frame; the "Gene Set_count" column used below comes from it.
group = geneSetLibrary.groupby("Gene Set Length")
source = ColumnDataSource(group)
hist = figure(plot_width = 1000, 
        plot_height=500,
        x_axis_type = "log",
        y_range=(0, max(group['Gene Set'].count())),
        title="Gene Set Length in hTFtarget " + date.today().strftime('%Y') + " Library",
        x_axis_label = "Gene Set Length",
        y_axis_label = "Gene Sets")
hist.vbar(x="Gene Set Length", top = "Gene Set_count", line_color="black", hover_fill_color="firebrick", 
        hover_line_color="black", hover_alpha=0.3, source=source)

hist.xaxis.axis_label_text_font_style = 'normal'
hist.xaxis.axis_label_text_font_size = '18px'
hist.yaxis.axis_label_text_font_size = '18px'
hist.yaxis.axis_label_text_font_style = 'normal'
hist.title.align = 'center'
hist.title.text_font_size = '18px'

# Hover tooltip. The first inner <div> was never closed, which nested the
# "Count" row inside the 150px-wide "Gene Set Length" row; it is now closed so
# the two rows are siblings.
hist.add_tools(HoverTool(tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:150px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set Length:</span>
                <span style="font-size: 12px">@{Gene Set Length}</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Count:</span>
                <span style="font-size: 12px">@{Gene Set_count}</span>
            </div>
        </div>
    """))

show(hist)

## UMAP Visualization

In [None]:
# Turn each gene list back into one space-separated string, the document form
# that the TF-IDF vectorizer in the next cell consumes.
geneSetLibrary['Gene Set'] = geneSetLibrary['Gene Set'].map(' '.join)

In [None]:
# term -> space-separated gene string
libDict = geneSetLibrary.to_dict()['Gene Set']

# Vectorize every gene set with TF-IDF and wrap the result for scanpy,
# labelling observations with the term names.
vec = TfidfVectorizer()
X = vec.fit_transform(libDict.values())
adata = anndata.AnnData(X, dtype='float32')
adata.obs.index = libDict.keys()

# Neighbour graph -> Leiden clusters -> 2-D UMAP coordinates.
sc.pp.neighbors(adata, n_neighbors=25, use_rep='X')
sc.tl.leiden(adata)
sc.tl.umap(adata, a = 12.8, b = 0.55)

# Order the observations by cluster and give the clusters readable names.
new_order = adata.obs.sort_values(by='leiden').index.tolist()
adata = adata[new_order,:]
adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

# Flat plotting table: coordinates, cluster, term and colour per gene set.
mapped_df = pd.DataFrame(adata.obsm['X_umap'])
mapped_df.columns = ['x', 'y']

mapped_df['cluster'] = adata.obs['leiden'].values
mapped_df['term'] = adata.obs.index

# Use the even-indexed palette entries first and the odd-indexed ones after,
# wrapping around when there are more than 20 clusters.
clusters = pd.unique(mapped_df['cluster']).tolist()
palette = list(Category20[20])
colors = palette[::2] + palette[1::2]
color_mapper = {
    cluster: colors[position % 20]
    for position, cluster in enumerate(clusters)
}

mapped_df['color'] = mapped_df['cluster'].apply(lambda name: color_mapper[name])

mapped_df.head()

In [None]:
# Interactive scatter plot of the UMAP embedding, one point per gene set,
# coloured by cluster.
xlabel = 'UMAP 1'
ylabel = 'UMAP 2'

# Per-point columns referenced by name in the glyph and the tooltip.
source2 = ColumnDataSource(
        data=dict(
            x = mapped_df.x,
            y = mapped_df.y,
            alpha = [0.7] * mapped_df.shape[0],
            colors = mapped_df['color'], 
            size = [6] * mapped_df.shape[0],
            gene_set = mapped_df['term'],
            cluster = mapped_df['cluster']
        )
    )

# Hover tooltip attached to the glyph named "df". The first two row <div>s
# were never closed, which nested each row inside the previous one; every row
# is now closed so the three rows are siblings.
hover_emb = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
            <span style="font-size: 12px">@gene_set</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
            <span style="font-size: 12px">(@x,@y)</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
            <span style="font-size: 12px">@cluster</span>
        </div>
    </div>
    """)

tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']
title_emb = 'Gene Sets in hTFtarget ' + date.today().strftime('%Y') + ' Library'
plot_emb = figure(plot_width=1000, plot_height=700, tools=tools_emb, title=title_emb, x_axis_label=xlabel, y_axis_label=ylabel)
plot_emb.circle( 'x', 'y', source = source2, size='size',
                alpha='alpha', line_alpha=0, line_width=0.01, name="df", 
                fill_color = 'colors', 
                line_color="black", hover_fill_color="firebrick")
plot_emb.xaxis.axis_label_text_font_style = 'normal'
plot_emb.xaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_style = 'normal'
plot_emb.title.align = 'center'
plot_emb.title.text_font_size = '18px'

show(plot_emb)

# Output Data

In [None]:
# Bundle the output directory into an archive, passing the "hTFtarget_" prefix.
# Presumably the prefix is prepended to the archive file name -- confirm in
# harmonizome.utility_functions.
uf.archive(path, output_name+"_")

In [None]:
# Show a download link for the archive, then links to the individual files.
# NOTE(review): the archive name and the 'output' directory are hard-coded
# here rather than derived from output_name / path; keep them in sync if
# either of those is changed.
display(FileLink('hTFtarget_output_archive.zip', result_html_prefix='Archive of all files: '))
display(FileLinks('output'))