# KnockTF Gene Expression Profiles with Transcription Factor Perturbations Harmonizome Processing

[KnockTF](http://www.licpathway.net/KnockTF/index.html) is a resource dedicated to exploring the effect of transcription factor knockout/knockdown on gene regulation. Gene expression was measured before and after transcription factor perturbations. The resource collected data from 308 transcription factors and 570 manually curated RNA-seq and microarray datasets from [ENCODE](https://www.encodeproject.org/) and [GEO](https://www.ncbi.nlm.nih.gov/geo/) to create a dataset of detailed gene expression data with associations between 566 transcription factor perturbations and 17,964 genes.

Feng, C., et al. (2020). "KnockTF: a comprehensive human gene expression profile database with knockdown/knockout of transcription factors." Nucleic Acids Research 48(D1): D93-D100.

In [None]:
import pandas as pd
import datetime
import numpy as np
import scipy.spatial.distance as dist
import seaborn
import sys
import json
import scanpy as sc

from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from bokeh.io import output_notebook, export_svg, output_file, save
from bokeh.io.export import get_screenshot_as_png
from bokeh.embed import json_item
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
sys.setrecursionlimit(100000)

## Mapping Setup

In [None]:
# Gene table read from production/gene_updated.csv, reduced to the three
# columns used below. Selecting with [[...]] raises a clear KeyError if a column
# is missing (DataFrame.get would silently return None), and .copy() makes the
# later column assignment act on an independent frame.
prodgenes = pd.read_csv('production/gene_updated.csv')
prodgenes = prodgenes[['id', 'symbol', 'ncbi_entrez_gene_id']].copy()
# .str.upper() leaves missing symbols as NaN instead of raising TypeError.
prodgenes['symbol'] = prodgenes['symbol'].str.upper()

# Gene annotation table (presumably NCBI gene_info -- confirm), restricted to
# rows whose type_of_gene is 'protein-coding'.
geneinfo = pd.read_csv('tables/gene_info', sep='\t')
geneinfo = geneinfo.loc[
    geneinfo['type_of_gene'] == 'protein-coding',
    ['GeneID', 'Symbol', 'description'],
].copy()
geneinfo['Symbol'] = geneinfo['Symbol'].str.upper()

## Load Data
The following data file was downloaded from knockTF's [download page](http://www.licpathway.net/KnockTF/download.php) and renamed from "differential expression of genes in all datasets.txt" to "knockTF.txt."

In [None]:
# dtype=object keeps every column as it appears in the file (no numeric
# inference); the processing cell below compares up_down against the strings
# '0' and '2' accordingly.
knocktf = pd.read_csv('newdata/knockTF/knockTF.txt', sep='\t', dtype=object)
knocktf

## Process Data

In [None]:
# The table was loaded with dtype=object, so FC and Log2FC are still text.
# Convert them to numbers before anything sorts or aggregates on them;
# otherwise the Log2FC sorts are lexicographic (e.g. '10.0' < '2.0') and np.max
# over FC/Log2FC returns the alphabetically largest string. Values that cannot
# be parsed become NaN and are removed by the dropna() below.
for column in ('FC', 'Log2FC'):
    knocktf[column] = pd.to_numeric(knocktf[column], errors='coerce')

# up_down '0' becomes NaN (so the row is dropped) and '2' becomes -1; the
# remaining values are cast to int once the incomplete rows are gone.
knocktf['threshold'] = knocktf['up_down'].replace('0',np.nan).replace('2','-1')
knocktf = knocktf.dropna().reset_index(drop=True).drop(columns=['Mean Expr. of Treat', 'Mean Expr. of Control', 'Rank', 'P_value', 'up_down'])
knocktf['threshold'] = knocktf['threshold'].astype(int)

In [None]:
# Upper-case gene symbols so they match the upper-cased mapping tables.
knocktf['Gene'] = knocktf['Gene'].map(str.upper)
knocktf = knocktf.sort_values(by=['Sample_ID', 'Log2FC'], ascending=[True, False])

# Make each TF label unique per dataset by appending the Sample_ID without its
# first '_'-separated token.
sample_suffix = knocktf['Sample_ID'].map(
    lambda sample_id: '_'.join(sample_id.split('_')[1:])
)
knocktf['TF'] = knocktf['TF'] + '_' + sample_suffix
knocktf

In [None]:
# Summary of the table before filtering; wording matches the later edge-list summary.
print(len(knocktf['TF'].unique()), 'transcription factor perturbations,', len(knocktf['Gene'].unique()), 'genes')

In [None]:
# Symbols known to either mapping table. Built once as a set so each membership
# test is O(1) instead of rebuilding and scanning two lists per gene.
known_symbols = set(prodgenes['symbol']) | set(geneinfo['Symbol'])

# Genes that cannot be mapped to either table are removed from the data.
dropgenes = [gene for gene in knocktf.Gene.unique() if gene not in known_symbols]
knocktf = knocktf.set_index('Gene').drop(dropgenes, axis=0).reset_index()

In [None]:
# Order each perturbation's genes from highest to lowest Log2FC.
knocktf = knocktf.sort_values(['TF', 'Log2FC'], ascending=[True, False])
knocktf = knocktf[['TF', 'Gene', 'FC', 'Log2FC', 'threshold']].reset_index(drop=True)

# Keep the first 100 and last 100 rows of every perturbation. Selecting by row
# label and taking the union means a perturbation with fewer than 200 rows
# contributes each row once (no duplicated edges), a perturbation with a single
# row is handled like any other, and the work is linear rather than a
# quadratic concat-in-a-loop. Index.union returns sorted labels, so the rows
# keep the order established by the sort above.
per_tf = knocktf.groupby('TF', sort=False)
selected_rows = per_tf.head(100).index.union(per_tf.tail(100).index)
edgelist = knocktf.loc[selected_rows].reset_index(drop=True)
edgelist

In [None]:
# Summary of the filtered edge list.
n_perturbations = len(edgelist['TF'].unique())
n_genes = len(edgelist['Gene'].unique())
print(n_perturbations, 'transcription factor perturbations,', n_genes, 'genes')

## Harmonizome Additions

### Resource

In [None]:
# Row of values for the resource record. np.nan replaces the np.NaN alias,
# which was removed in NumPy 2.0; the rest of this notebook already uses np.nan.
('knockTF',
np.nan,
'KnockTF collects upstream pathway information of TFs and functional annotation results of downstream target genes. It provides details about TFs binding to promoters, super-enhancers and typical enhancers of target genes. KnockTF constructs a TF-differentially expressed gene network and performs network analyses for genes of interest. KnockTF will help elucidate TF-related functions and potential biological effects.',
'a comprehensive human gene expression profile database with knockdown/knockout of transcription factors',
'http://www.licpathway.net/KnockTF/index.html',
'566',
'1',
np.nan)

### Dataset

In [None]:
# Row of values for the dataset record (id 131).
# Fixes relative to the previous version:
#   * a comma was missing after 'gene expression by microarray or RNA-seq', so
#     Python concatenated it with the next string and the row was one field short;
#   * spelling of "Perturbations"/"transcription" and the missing space in
#     "genes differentially" in the descriptive strings.
(131,
'KnockTF Gene Expression Profiles with Transcription Factor Perturbations',
'Gene Expression Profiles with Transcription Factor Perturbations',
'Gene expression profiles for cell lines or tissues following transcription factor perturbation (knockdown/knockout)',
'gene-transcription factor associations by differential expression of gene following perturbation of transcription factor',
'genes differentially expressed following the {0} transcription factor perturbation from the KnockTF Gene Expression Profiles with Transcription Factor Perturbations dataset.',
'sets of genes differentially expressed following transcription factor perturbation from the KnockTF Gene Expression Profiles with Transcription Factor Perturbations dataset.',
'transcription factor perturbations changing expression of {0} gene from the KnockTF Gene Expression Profiles with Transcription Factor Perturbations dataset.',
'increased expression',
'decreased expression',
1,
1,
datetime.datetime.today().strftime('%Y-%m-%d'),
'knocktf',
0,
44,
15,
7,
37,
5,
'gene expression by microarray or RNA-seq',
'curated experimental data',
'high throughput, data driven',
'transcription factor perturbations')

### Publication

In [None]:
# Row of values for the publication record attached to dataset 131.
# A comma was missing after the long citation, so it was silently concatenated
# with the short citation and the row was one field short; the journal
# abbreviation in the short citation was also misspelled ("Acis").
(131,
'Feng, C et al. (2019) KnockTF: a comprehensive human gene expression profile database with knockdown/knockout of transcription factors. Nucleic Acids Res. 48:D93-100.',
'Feng, Nucleic Acids Res, 2019',
'dx.doi.org/10.1093/nar/gkz881',
31598675,
'https://pubmed.ncbi.nlm.nih.gov/31598675',
'Feng',
'C',
'Nucleic Acids Res',
'2019',
'KnockTF: a comprehensive human gene expression profile database with knockdown/knockout of transcription factors',
48,
'D93-100')

### Genes To Add

In [None]:
# symbol -> Entrez gene ID; entries from the production table override the
# gene_info entries for the same symbol.
prod_by_symbol = prodgenes.set_index('symbol')
genedict = {
    **geneinfo.set_index('Symbol')['GeneID'].to_dict(),
    **prod_by_symbol['ncbi_entrez_gene_id'].to_dict(),
}
# symbol -> primary key in the production gene table.
genefk = prod_by_symbol['id'].to_dict()

In [None]:
# Symbols already present in the production gene table (set for O(1) lookups).
known_symbols = set(prodgenes['symbol'])
# gene_info indexed by symbol, built once instead of on every iteration.
# keep='last' matches the entry that wins when genedict is built with to_dict(),
# and guarantees .loc returns a single row.
geneinfo_by_symbol = geneinfo.drop_duplicates('Symbol', keep='last').set_index('Symbol')

# Print one record per gene that is in the edge list but not yet in production,
# assigning consecutive primary keys starting at 57025 and recording them in genefk.
i = 57025
for symbol in edgelist['Gene'].unique():
    if symbol in known_symbols:
        continue
    gene = geneinfo_by_symbol.loc[symbol]
    print((i,
        gene.name,
        gene['GeneID'],
        gene['description'],
        # Host corrected from 'nih.gove' to 'nih.gov'.
        'http://www.ncbi.nlm.nih.gov/gene/'+str(gene['GeneID'])), end=',\n')
    genefk[gene.name] = i
    i += 1

In [None]:
# One row per distinct gene in the edge list, with its Entrez ID and the
# primary key it has (or will have) in the gene table.
unique_symbols = edgelist['Gene'].drop_duplicates().reset_index(drop=True)
genes = unique_symbols.to_frame()
genes['Gene ID'] = genes['Gene'].map(genedict.__getitem__)
genes['Gene FK'] = genes['Gene'].map(genefk.__getitem__)
genes

### Naming Authority

In [None]:
# Row of values for the naming-authority record (id 101). The trailing 131 is
# the same id used as the first field of the dataset and publication rows above.
(101, 'KnockTF', 'knockTF', 'KnockTF: a comprehensive human gene expression profile database with knockdown/knockout of transcription factors', 'http://www.licpathway.net/KnockTF/index.html', 131)

### Attributes To Add
Metadata was sourced from the knockTF [search page](http://www.licpathway.net/KnockTF/search/search_tf_result.php?tf_name=&tf_class=All&tf_superclass=All).

In [None]:
# Per-dataset metadata, indexed by the Dataset ID with its first
# '_'-separated token removed.
meta = pd.read_csv('newdata/knockTF/knockTFmetadata.csv')
meta['Dataset ID'] = meta['Dataset ID'].map(
    lambda dataset_id: '_'.join(dataset_id.split('_')[1:])
)
meta = meta.set_index('Dataset ID')
meta

In [None]:
def len3(sampleid):
    """Return the second and third '_'-separated fields of *sampleid* joined by '_'.

    When the third field is exactly two characters long it is prefixed with a
    single '0'; any other length is passed through unchanged.
    """
    fields = sampleid.split('_')
    number = fields[2]
    padding = '0' if len(number) == 2 else ''
    return f'{fields[1]}_{padding}{number}'

In [None]:
# One row per distinct TF perturbation. Resetting the fresh 0..n-1 index twice
# yields two identical counter columns, which become the two foreign keys.
tf_perturbations = edgelist['TF'].drop_duplicates().to_frame().reset_index(drop=True)
attributes = tf_perturbations.reset_index().reset_index()
attributes.columns = ['attribute_fk', 'gene_set_fk', 'tfpert']
attributes['sample'] = attributes['tfpert'].map(len3)


def _describe(sample):
    """Build the human-readable description for one metadata row."""
    tf_name = meta.loc[sample, 'TF']
    method = meta.loc[sample, 'Knock-Method']
    tissue = meta.loc[sample, 'Tissue Type'].lower()
    biosample = meta.loc[sample, 'Biosample Type'].lower()
    return tf_name + ' ' + method + ' knock from ' + tissue + ' ' + biosample


attributes['desc'] = attributes['sample'].map(_describe)
# Shift the counters to the key ranges used for the new records.
attributes['attribute_fk'] = attributes['attribute_fk'] + 296446
attributes['gene_set_fk'] = attributes['gene_set_fk'] + 133100000

attributes

In [None]:
# Walks every attribute row and rebuilds its 'DataSet_...' identifier from the
# perturbation label (label minus its first '_'-separated token).
# The print that emitted the attribute rows is commented out, so as written the
# loop only rebinds `tfpert` and `id` (note: `id` shadows the builtin).
for tfpert in attributes.index:
    tfpert = attributes.loc[tfpert]
    id = 'DataSet_'+'_'.join(tfpert['tfpert'].split('_')[1:])
    #print((tfpert['attribute_fk'], tfpert['tfpert'], id, tfpert['desc'], 'http://www.licpathway.net/KnockTF/search/search_sample_result.php?sample_id='+id, 101), end=',\n')

### Gene Sets To Add

In [None]:
# Same traversal as the attribute loop, intended for the gene-set rows (the
# commented-out print uses gene_set_fk and also references attribute_fk).
# With the print disabled the loop only rebinds `tfpert` and `id`
# (note: `id` shadows the builtin).
for tfpert in attributes.index:
    tfpert = attributes.loc[tfpert]
    id = 'DataSet_'+'_'.join(tfpert['tfpert'].split('_')[1:])
    #print((tfpert['gene_set_fk'], tfpert['tfpert'], id, tfpert['desc'], 'http://www.licpathway.net/KnockTF/search/search_sample_result.php?sample_id='+id, 131, 37, tfpert['attribute_fk']), end=',\n')

### Associations To Add

In [None]:
# Lookup tables from labels to foreign keys.
genefk = genes.set_index('Gene')['Gene FK'].to_dict()
genesetfk = attributes.set_index('tfpert')['gene_set_fk'].to_dict()

# One association per edge; the row position offset by 11000000 becomes its id.
associations = edgelist.copy()
associations['gene_fk'] = associations['Gene'].map(genefk.__getitem__)
associations['gene_set_fk'] = associations['TF'].map(genesetfk.__getitem__)
associations.index = associations.index + 11000000
associations = associations.rename_axis('id', axis=0).reset_index()
associations = associations[['id', 'gene_fk', 'gene_set_fk', 'FC', 'Log2FC', 'threshold']]
associations.to_csv('harmonizome-update/knocktf.csv')
associations

## Download Files

In [None]:
# Directory prefix for the download files written below. File names are
# appended by plain string concatenation, so the trailing slash is required.
output_path = 'newdata/knockTF/downloads/'

### Gene-Attribute Ternary Matrix

In [None]:
# Gene x TF-perturbation matrix of threshold values; gene/perturbation pairs
# that do not occur in the edge list are filled with 0.
ternary_matrix = pd.crosstab(
    index=edgelist['Gene'],
    columns=edgelist['TF'],
    values=edgelist['threshold'].values,
    aggfunc=np.max,
)
ternary_matrix = ternary_matrix.fillna(0).astype(int)

# Perturbation x gene orientation, reused by the attribute-level outputs.
ternary_matrix_T = ternary_matrix.T

matrix_file = output_path + 'gene_attribute_matrix.txt.gz'
ternary_matrix.to_csv(matrix_file, sep='\t', compression='gzip')
ternary_matrix

### Gene-Attribute Edge List

In [None]:
# Gzipped, tab-separated dump of the edge list. index= is not passed, so
# to_csv's default of also writing the row index applies.
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Gzipped, tab-separated dump of the gene table (Gene, Gene ID, Gene FK).
genes.to_csv(output_path+'gene_list_terms.txt.gz', sep='\t', compression='gzip')
genes

### Attribute List

In [None]:
# Gzipped, tab-separated dump of the attribute table built in the
# "Attributes To Add" section.
attributes.to_csv(output_path+'attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributes

### Gene-Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between gene rows: pdist yields condensed cosine
# distances, squareform expands them to a square matrix, and 1 - distance
# converts distance to similarity.
gene_vectors = ternary_matrix.to_numpy(dtype=np.int_)
similarity_matrix = 1 - dist.squareform(dist.pdist(gene_vectors, 'cosine'))

gene_labels = ternary_matrix.index
gene_similarity_matrix = pd.DataFrame(similarity_matrix, index=gene_labels, columns=gene_labels)
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None

gene_similarity_matrix.to_csv(
    output_path + 'gene_similarity_matrix_cosine.txt.gz',
    sep='\t',
    compression='gzip',
)
gene_similarity_matrix

### Attribute-Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between TF-perturbation rows of the transposed
# matrix (1 - cosine distance).
attribute_vectors = ternary_matrix_T.to_numpy(dtype=np.int_)
similarity_matrix = 1 - dist.squareform(dist.pdist(attribute_vectors, 'cosine'))

attribute_labels = ternary_matrix_T.index
attribute_similarity_matrix = pd.DataFrame(similarity_matrix, index=attribute_labels, columns=attribute_labels)
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None

attribute_similarity_matrix.to_csv(
    output_path + 'attribute_similarity_matrix_cosine.txt.gz',
    sep='\t',
    compression='gzip',
)
attribute_similarity_matrix

### Up Gene Set Library

In [None]:
arr = ternary_matrix.reset_index(drop=True).to_numpy(dtype=np.int_)

# One tab-separated line per TF perturbation: its label followed by every gene
# whose matrix entry is 1.
with open(output_path + 'gene_set_library_up_crisp.gmt', 'w') as f:
    for i, term in enumerate(ternary_matrix.columns):
        members = ternary_matrix.index[arr[:, i] == 1]
        print(term, *members, sep='\t', file=f)

### Down Gene Set Library

In [None]:
arr = ternary_matrix.reset_index(drop=True).to_numpy(dtype=np.int_)

# One tab-separated line per TF perturbation: its label followed by every gene
# whose matrix entry is -1.
with open(output_path + 'gene_set_library_dn_crisp.gmt', 'w') as f:
    for i, term in enumerate(ternary_matrix.columns):
        members = ternary_matrix.index[arr[:, i] == -1]
        print(term, *members, sep='\t', file=f)

### Up Attribute Set Library

In [None]:
arr = ternary_matrix_T.reset_index(drop=True).to_numpy(dtype=np.int_)

# One tab-separated line per gene: its symbol followed by every TF perturbation
# whose matrix entry for that gene is 1.
with open(output_path + 'attribute_set_library_up_crisp.gmt', 'w') as f:
    for i, term in enumerate(ternary_matrix_T.columns):
        members = ternary_matrix_T.index[arr[:, i] == 1]
        print(term, *members, sep='\t', file=f)

### Down Attribute Set Library

In [None]:
arr = ternary_matrix_T.reset_index(drop=True).to_numpy(dtype=np.int_)

# One tab-separated line per gene: its symbol followed by every TF perturbation
# whose matrix entry for that gene is -1.
with open(output_path + 'attribute_set_library_dn_crisp.gmt', 'w') as f:
    for i, term in enumerate(ternary_matrix_T.columns):
        members = ternary_matrix_T.index[arr[:, i] == -1]
        print(term, *members, sep='\t', file=f)

### Gene-Attribute Cleaned Matrix

In [None]:
# Gene x TF-perturbation matrix of FC values (maximum per cell); pairs absent
# from the edge list are filled with 0.
cleaned_matrix = pd.crosstab(
    index=edgelist['Gene'],
    columns=edgelist['TF'],
    values=edgelist['FC'].values,
    aggfunc=np.max,
)
cleaned_matrix = cleaned_matrix.fillna(0)

cleaned_matrix = cleaned_matrix.rename_axis('Gene Symbol', axis='index')
cleaned_matrix = cleaned_matrix.rename_axis('TF', axis='columns')

cleaned_file = output_path + 'gene_attribute_matrix_cleaned.txt.gz'
cleaned_matrix.to_csv(cleaned_file, sep='\t', compression='gzip')
cleaned_matrix

### Gene-Attribute Standardized Matrix

In [None]:
# Gene x TF-perturbation matrix of Log2FC values (maximum per cell); pairs
# absent from the edge list are filled with 0.
standardized_matrix = pd.crosstab(
    index=edgelist['Gene'],
    columns=edgelist['TF'],
    values=edgelist['Log2FC'].values,
    aggfunc=np.max,
)
standardized_matrix = standardized_matrix.fillna(0)

standardized_matrix = standardized_matrix.rename_axis('Gene Symbol', axis='index')
standardized_matrix = standardized_matrix.rename_axis('TF', axis='columns')

standardized_file = output_path + 'gene_attribute_matrix_standardized.txt.gz'
standardized_matrix.to_csv(standardized_file, sep='\t', compression='gzip')
standardized_matrix

### Knowledge Graph Serialization

In [None]:
nodes = {}
edges = []

# Index the lookup frames by label. The guards make the cell safe to re-run:
# once a frame is indexed, the column no longer exists and an unconditional
# set_index would raise KeyError.
if 'Gene' in genes.columns:
    genes = genes.set_index('Gene')
if 'tfpert' in attributes.columns:
    attributes = attributes.set_index('tfpert')

# Plain dictionaries give constant-time lookups for the per-edge loop below
# instead of a DataFrame .loc call per row.
gene_ids = {symbol: int(gene_id) for symbol, gene_id in genes['Gene ID'].items()}
sample_ids = {tfpert: 'DataSet_' + sample for tfpert, sample in attributes['sample'].items()}

# Gene nodes are keyed by their integer Gene ID.
for symbol, gene_id in gene_ids.items():
    nodes[gene_id] = {
        "type": "gene",
        "properties": {
            "id": gene_id,
            "label": symbol
        }
    }

# Perturbation nodes are keyed by their 'DataSet_...' identifier.
for tfpert, sample_id in sample_ids.items():
    nodes[sample_id] = {
        "type": "transcription factor perturbation",
        "properties": {
            "id": sample_id,
            "label": tfpert
        }
    }

# One directed "regulates" edge per edge-list row, perturbation -> gene.
edge_columns = zip(
    edgelist['TF'],
    edgelist['Gene'],
    edgelist['FC'],
    edgelist['Log2FC'],
    edgelist['threshold'],
)
for source, target, fc, log2fc, threshold in edge_columns:
    sourceid = sample_ids[source]
    targetid = gene_ids[target]
    edges.append({
        "source": sourceid,
        "relation": "regulates",
        "target": targetid,
        "properties": {
            "id": sourceid+":"+str(targetid),
            "source_label": source,
            "target_label": target,
            "directed":True,
            "fc":fc,
            "log2fc":log2fc,
            "threshold":int(threshold)
        }
    })


RDF

In [None]:
# Writes the edges as prefixed-name triples (Turtle syntax).
# Each @prefix directive needs its IRI wrapped in <...> and a terminating ' .';
# without them the file cannot be parsed.
with open(output_path+'serializations/knocktf.rdf', 'w') as f:
    print('@prefix tfpert: <http://www.licpathway.net/KnockTF/search/search_sample_result.php?sample_id=> .', file=f)
    print('@prefix RO: <http://purl.obolibrary.org/obo/RO_> .', file=f)
    print('@prefix gene: <http://ncbi.nlm.nih.gov/gene/> .', file=f)
    print('', file=f)
    # One triple per edge: perturbation, RO:0002211, gene.
    for edge in edges:
        print('tfpert:'+edge['source'], 'RO:0002211', 'gene:'+str(edge['target']), end=' .\n', file=f)

JSON

In [None]:
# Serialize the node and edge collections to a single JSON document.
# json.dump writes to the file and returns None, so its result is not bound to
# a name.
with open(output_path+'serializations/knocktf.json', 'w') as f:
    json.dump(
        {
            "Version":"1",
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

TSV

In [None]:
# nodes maps node id -> {"type", "properties"}; building a frame from it puts
# node ids in the columns, so transpose to get one row per node.
nodeframe = pd.DataFrame(nodes)
nodeframe = nodeframe.T
# Collapse the properties dict to just its label.
nodeframe['properties'] = nodeframe['properties'].apply(lambda x: x['label'])
nodeframe = nodeframe.reset_index()
nodeframe.columns = ['id','type','label']
# Namespace = the text after the first '[' inside the first whitespace-delimited
# token of the label's last '_'-separated piece. When there is no '[' the
# lookup yields NaN, which the next line replaces with 'NCBI Entrez'.
# NOTE(review): the labels built in this notebook (gene symbols and TF_xx_yyy
# names) do not appear to contain '[', so presumably every node -- including
# TF perturbations -- ends up as 'NCBI Entrez'; confirm that is intended.
nodeframe['namespace'] = nodeframe['label'].apply(str.split,sep='_').str[-1].apply(str.split).str[0].apply(str.split,sep='[').str[1]
nodeframe['namespace'] = nodeframe['namespace'].replace(np.nan, 'NCBI Entrez')
nodeframe = nodeframe.get(['namespace', 'id', 'type', 'label'])
nodeframe.to_csv(output_path+'serializations/knocktf_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Flatten the edge dictionaries: the nested properties are replaced by three
# value columns taken from fc, log2fc and threshold.
edgeframe = pd.DataFrame(edges)
edge_properties = edgeframe['properties']
edgeframe['cleaned_value'] = edge_properties.map(lambda props: props['fc'])
edgeframe['standardized_value'] = edge_properties.map(lambda props: props['log2fc'])
edgeframe['threshold_value'] = edge_properties.map(lambda props: props['threshold'])
edgeframe = edgeframe.drop(columns=['properties'])

edges_file = output_path + 'serializations/knocktf_tsv/edges.tsv'
edgeframe.to_csv(edges_file, sep='\t')
edgeframe

## Visualizations

### Gene Attribute Clustered Heatmap

In [None]:
# Clustered heatmap of the gene x perturbation threshold matrix; center=0 puts
# the midpoint of the diverging 'seismic' colormap at 0.
seaborn.clustermap(ternary_matrix, cmap='seismic', center=0)

### Gene Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the gene-gene cosine similarity matrix.
seaborn.clustermap(gene_similarity_matrix,cmap='seismic',center=0)

### Attribute Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the perturbation-perturbation cosine similarity matrix.
seaborn.clustermap(attribute_similarity_matrix, cmap='seismic', center=0)

### UMAP

In [None]:
# Route bokeh output to the notebook so show() below renders inline.
output_notebook()

In [None]:
# Build one space-joined "document" of gene symbols per perturbation and
# direction; the keys get an '_up' or '_dn' suffix.
is_up = edgelist['threshold'] == 1
is_dn = edgelist['threshold'] == -1

upgenesets = edgelist.loc[is_up].groupby('TF')['Gene'].agg(' '.join).to_frame()
upgenesets.index = upgenesets.index + '_up'

dngenesets = edgelist.loc[is_dn].groupby('TF')['Gene'].agg(' '.join).to_frame()
dngenesets.index = dngenesets.index + '_dn'

genesets = pd.concat([upgenesets, dngenesets])['Gene'].to_dict()

In [None]:
# Treat every gene set as a document of gene symbols and embed it with TF-IDF.
# max_df=0.5 / min_df=10 restrict the vocabulary by document frequency.
vec = TfidfVectorizer(max_df=0.5, min_df=10)
X = vec.fit_transform(genesets.values())
adata = anndata.AnnData(X, dtype='float32')
adata.obs.index = genesets.keys()

# Neighbour graph on the raw TF-IDF matrix, then Leiden clustering and UMAP layout.
sc.pp.neighbors(adata, n_neighbors=190, use_rep='X')
sc.tl.leiden(adata)
sc.tl.umap(adata, min_dist=0.00025, spread=0.4)

# Reorder observations by cluster label and prefix the labels with 'Cluster '.
new_order = adata.obs.sort_values(by='leiden').index.tolist()
adata = adata[new_order,:]
adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

# Plotting frame: UMAP coordinates plus cluster and gene-set name per point.
mapped_df = pd.DataFrame(adata.obsm['X_umap'])
mapped_df.columns = ['x', 'y']

mapped_df['cluster'] = adata.obs['leiden'].values
mapped_df['term'] = adata.obs.index

# Category20 entries at even positions first, then odd positions; colours wrap
# around after 20 clusters.
clusters = pd.unique(mapped_df['cluster']).tolist()
colors = list(Category20[20])[::2] + list(Category20[20])[1::2]
color_mapper = {clusters[i]:colors[i%20] for i in range(len(clusters))}

mapped_df['color'] = mapped_df['cluster'].apply(lambda x: color_mapper[x])

xlabel = 'UMAP 1'
ylabel = 'UMAP 2'

source2 = ColumnDataSource(
        data=dict(
            x = mapped_df.x,
            y = mapped_df.y,
            alpha = [0.7] * mapped_df.shape[0],
            colors = mapped_df['color'], 
            size = [6] * mapped_df.shape[0],
            gene_set = mapped_df['term'],
            cluster = mapped_df['cluster']
        )
    )

# Hover tooltip bound to the glyph named "df" (see plot_emb.circle below).
# NOTE(review): HoverTool(names=...) and figure(plot_width=..., plot_height=...)
# look like Bokeh 2.x-era arguments -- confirm against the installed Bokeh version.
hover_emb = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
            <span style="font-size: 12px">@gene_set</span>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
            <span style="font-size: 12px">(@x,@y)</span>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
            <span style="font-size: 12px">@cluster</span>
        </div>
    </div>
    """)

tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']
title_emb = 'Gene Sets in knockTF Library'
plot_emb = figure(plot_width=1000, plot_height=700, tools=tools_emb, title=title_emb, x_axis_label=xlabel, y_axis_label=ylabel)
# Scatter of the gene sets; size, alpha and fill colour are read per point from source2.
plot_emb.circle( 'x', 'y', source = source2, size='size',
                alpha='alpha', line_alpha=0, line_width=0.01, name="df", 
                fill_color = 'colors', 
                line_color="black", hover_fill_color="firebrick")
plot_emb.xaxis.axis_label_text_font_style = 'normal'
plot_emb.xaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_style = 'normal'
plot_emb.title.align = 'center'
plot_emb.title.text_font_size = '18px'

show(plot_emb)