# Benchmarking a TF-TF network against literature-reported TF-TF interactions
Use this notebook to benchmark the filtered networks produced from `filter_assertions.ipynb`.

### Setup

In [None]:
import pandas as pd
import numpy as np
import glob
import tqdm
import random
import statistics 
from scipy import stats
import os

Set a random seed to ensure reproducibility.

In [None]:
# Seed Python's built-in `random` module, which `benchmark_edges` draws from
# (via `random.choices`) when it builds its random comparison networks.
random.seed(314)

Add a subdirectory to hold the libraries used for benchmarking and one to store the results. 

In [None]:
# directory holding the benchmarking libraries (input) and directory receiving the results (output)
lib_dir = './raw_data/benchmarking_libraries'
output_dir = './benchmarking_results'

# makedirs also creates a missing parent directory (e.g. ./raw_data), which os.mkdir would not,
# and exist_ok=True makes re-running this cell harmless when the directories already exist
os.makedirs(lib_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

Define a function to load the benchmarking files as source-target pairs. This function finds all such pairs in a GMT-formatted benchmarking file, where each GMT term names the source TF and the genes in that term's set are its targets. Only targets found in the set of high-rank TFs are included, since these are the targets with which we constructed the original network. Every source TF named by a term is also added to that set as the file is read.

In [None]:
def find_library_edges(lib, tflib: list) -> pd.DataFrame:

    '''
    builds a table of source-target edges from a GMT-formatted benchmarking file
    each term names the source TF, targets are the known TFs in that term's set

    input:
        lib: path to a tab-delimited GMT file; every line is a term followed by
            its gene set, optionally with an empty description field in between
        tflib: list of TF symbols that count as valid targets; targets are
            upper-cased before the lookup, so entries should be upper-case
            NOTE: this list is extended IN PLACE with every source TF named in
            the file, so a source only becomes a valid target for the lines
            (and for any later calls sharing the list) that come after it
    return: a dataframe indexed by (source, target) with a single integer
        'count' column holding how many times that edge occurs in the file
    '''

    # set copy of tflib so that each target lookup is O(1) rather than a list scan
    known_tfs = set(tflib)
    edges = {}

    # for each line in that library file,
    with open(lib, 'r') as f:
        for line in f:
            # extract the gmt term and gene set
            items = line.strip().split("\t")
            term = items[0].strip()

            # a blank line carries neither a term nor a gene set
            if not term:
                continue

            # some GMT files keep an empty description field between the term
            # and the gene set; a term-only line simply has no targets
            if len(items) > 1 and items[1] == "":
                targets = items[2:]
            else:
                targets = items[1:]

            # extract the source TF from the term name: the first space-separated
            # word, or the text before the first underscore for one-word terms
            parts = term.split(" ")
            if len(parts) == 1:
                source = term.split("_")[0].upper()
            else:
                source = parts[0].upper()

            # a term always names a TF, so register it if it is not known yet
            if source not in known_tfs:
                known_tfs.add(source)
                tflib.append(source)

            # count the edge once per occurrence of a known target
            for target in targets:
                target = target.upper()
                if target in known_tfs:
                    edge = (source, target)
                    edges[edge] = edges.get(edge, 0) + 1

    # build the dataframe in one step; passing names keeps the index well-formed
    # even when no edge was found
    edgeidx = pd.MultiIndex.from_tuples(list(edges), names = ['source', 'target'])
    edgedf = pd.DataFrame({'count': list(edges.values())}, index = edgeidx)

    return edgedf.astype({'count': 'int64'})

This function compares a specified network edge list against every benchmarking library passed to it. The significance of the overlap between the input network and each library is determined by generating 100 random networks of the same size (drawing source and target labels at random from the input network), then performing a one-sided z-test using the expected overlap and standard deviation from those random networks. The results are saved to a CSV file for easy inspection.

In [None]:
def _undirected_edges(edges):

    '''
    rewrites an edge list so that edge direction is ignored
    input: a dataframe with 'source' and 'target' columns
    return: a new dataframe in which the two labels of every edge are sorted,
        so that A->B and B->A are both stored as the row (A, B)
    '''

    return pd.DataFrame(
        [sorted(edge) for edge in edges[['source', 'target']].to_numpy()],
        columns = ['source', 'target']
    )


def _shared_edge_fraction(library_edges, candidate_edges, n_edges):

    '''
    input: library edges (unique rows), candidate edges, and the normalising edge count
    return: number of candidate rows that also appear in the library, divided by n_edges
    '''

    shared = pd.merge(library_edges, candidate_edges, how = 'inner', on = ['source', 'target'])
    return len(shared) / n_edges


def _upper_tail_p_value(observed, null_mean, null_stdev):

    '''
    one-sided p-value for observing at least `observed` under a normal null
    input: observed value, mean and standard deviation of the null distribution
    return: P(null >= observed)
    '''

    # a null with no spread cannot be standardised; it is exceeded either
    # certainly (p = 0) or not at all (p = 1)
    if null_stdev == 0:
        return 0.0 if observed > null_mean else 1.0

    z_score = (observed - null_mean) / null_stdev
    # sf is the survival function 1 - cdf, but keeps precision for large z
    # where 1 - cdf(z) rounds to exactly 0
    return stats.norm.sf(z_score)


def benchmark_edges(network_type, libraries, n_trials=100):

    '''
    Compares an input network against a set of benchmarking libraries

    For every library, the fraction of network edges also present in the library
    is computed with and without edge direction. Each observed fraction is then
    compared against `n_trials` random networks of the same size, built by
    drawing a source and a target for every edge uniformly (with replacement)
    from the unique source / target labels of the input network.

    input:
        network_type: name of the network; the edges are read from
            ./filtered_edge_list/<network_type>/edge_list_filtered.csv, whose
            first two columns must be named 'source' and 'target'
        libraries: dict mapping a library name to a dataframe indexed by
            (source, target), as returned by find_library_edges
        n_trials: number of random networks per library (at least 2)
    return: a dataframe containing the results organized by benchmarking library
    output: .csv file of the results for a single network, written to the
        module-level `output_dir`
    raises: ValueError if n_trials < 2 or the network has no edges
    '''

    if n_trials < 2:
        raise ValueError("n_trials must be at least 2 to estimate a standard deviation")

    # open network edge file and extract source,target tuples
    network_edge_file = f'./filtered_edge_list/{network_type}/edge_list_filtered.csv'
    network_edges = pd.read_csv(network_edge_file, usecols=[0, 1])
    n_ntwrk_edges = len(network_edges)
    if n_ntwrk_edges == 0:
        raise ValueError(f"no edges found in {network_edge_file}")

    network_edges_undirected = _undirected_edges(network_edges)

    # sorted, because the iteration order of a set of strings changes between
    # Python sessions and would otherwise make the seeded draws irreproducible
    ntwrk_source_labels = sorted(set(network_edges['source']))
    ntwrk_target_labels = sorted(set(network_edges['target']))

    results = {}

    # for each benchmarking library
    for benchmarklib, library_counts in libraries.items():
        library_edgelist = library_counts.index.to_frame(index=False)

        # A->B and B->A collapse to the same undirected row; keep it once so the
        # inner merge matches every network edge at most one time
        library_edges_undirected = _undirected_edges(library_edgelist).drop_duplicates()

        # observed overlap: fraction of network edges found in the library
        overlap = _shared_edge_fraction(library_edgelist, network_edges, n_ntwrk_edges)
        undirected_overlap = _shared_edge_fraction(
            library_edges_undirected, network_edges_undirected, n_ntwrk_edges
        )

        # overlap of random networks with the same library gives the null distribution
        trial_results = {
            'directed': np.zeros(n_trials),
            'undirected': np.zeros(n_trials)
        }

        for i in tqdm.tqdm(range(n_trials)):
            # draw each endpoint uniformly, with replacement, from the unique network labels
            rand_sources = random.choices(ntwrk_source_labels, k=n_ntwrk_edges)
            rand_targets = random.choices(ntwrk_target_labels, k=n_ntwrk_edges)

            random_network = pd.DataFrame({'source': rand_sources, 'target': rand_targets})
            random_network_undirected = _undirected_edges(random_network)

            trial_results['directed'][i] = _shared_edge_fraction(
                library_edgelist, random_network, n_ntwrk_edges
            )
            trial_results['undirected'][i] = _shared_edge_fraction(
                library_edges_undirected, random_network_undirected, n_ntwrk_edges
            )

        expected_overlap = statistics.mean(trial_results['directed'])
        overlap_stdev = statistics.stdev(trial_results['directed'])

        expected_overlap_undir = statistics.mean(trial_results['undirected'])
        overlap_stdev_undir = statistics.stdev(trial_results['undirected'])

        # one-sided test: probability of an overlap at least as large as observed
        p_value = _upper_tail_p_value(overlap, expected_overlap, overlap_stdev)
        undir_p = _upper_tail_p_value(undirected_overlap, expected_overlap_undir, overlap_stdev_undir)

        # row labels are kept exactly as before (including the comma-less
        # 'p-value directed') so existing result files stay comparable
        results[benchmarklib] = {
            'library network size': len(library_counts),
            'input network size': n_ntwrk_edges,
            'observed overlap, directed': overlap,
            'observed overlap, undirected': undirected_overlap,
            'expected overlap, directed': expected_overlap,
            'expected overlap, undirected': expected_overlap_undir,
            'stdev, directed': overlap_stdev,
            'stdev, undirected': overlap_stdev_undir,
            'p-value directed': p_value,
            'p-value, undirected': undir_p
            }

    resultsdf = pd.DataFrame(results)

    resultsdf.to_csv(f"{output_dir}/{network_type}.csv")
    return resultsdf

### Benchmarking

Load the full list of high-rank TFs from `mean_ranks_matrix.csv` (produced by `construct_edge_list.ipynb`). This will be our library of TFs.

In [None]:
# the first column of the mean ranks matrix contains all of the TFs identified by ChEA3;
# read just that column and convert every symbol to upper case
path_to_mean_ranks = './edge_constructing_files/mean_ranks_matrix.csv'
tf_library = [
    symbol.upper()
    for symbol in pd.read_csv(path_to_mean_ranks, usecols=[0]).iloc[:, 0].tolist()
]

We'll use eight libraries to perform the benchmarking. Libraries 1–2 are Enrichr libraries, and libraries 3–8 are the six ChEA3 primary libraries used in `construct_edge_list.ipynb`:
1. TRANSFAC and JASPAR PWMs
2. TRRUST Transcription Factors 2019
3. ARCHS4 Coexpression
4. ENCODE ChIP-seq
5. Enrichr Queries
6. Literature ChIP-seq
7. ReMap ChIP-seq
8. GTEx coexpression

Download the two Enrichr libraries by navigating here and clicking on their names: https://maayanlab.cloud/Enrichr/#libraries.\
Move both downloaded libraries into `raw_data/benchmarking_libraries`, and copy the six ChEA3 libraries there from `raw_data/chea3libs`.

Parse the benchmarking library names from their file paths. 

In [None]:
# glob returns paths in arbitrary order, so sort each group to process the
# libraries in the same order on every run (.txt Enrichr files first, then
# .gmt ChEA3 files); the order matters because find_library_edges extends the
# shared TF list as it reads each file
enrichrfiles = sorted(glob.glob(f'{lib_dir}/*.txt'))
chea3files = sorted(glob.glob(f'{lib_dir}/*.gmt'))

allfiles = enrichrfiles + chea3files

# format filenames: drop the directory and everything from the first ".",
# then turn underscores into spaces; os.path.basename uses the platform's
# path separator instead of assuming "/"
filenames = [os.path.basename(file).split(".")[0].replace("_", " ") for file in allfiles]

Calculate the benchmarking results for all three network types. 

In [None]:
# parse every benchmarking file into its (source, target) edge table, keyed by file path
libraries = {}
for library_path in allfiles:
    libraries[library_path] = find_library_edges(library_path, tf_library)

# benchmark each filtered network against all of the parsed libraries
ntypes = ['signature_shuffling', 'edge_weighted', 'node_weighted']
network_results = []
for ntype in ntypes:
    network_results.append(benchmark_edges(ntype, libraries))