# Benchmarking a TF-TF network against literature-reported TF-TF interactions
### Use this notebook to benchmark the filtered network produced from filter_assertions.ipynb

## Generate an edge list from benchmark libraries
Three Enrichr libraries (https://maayanlab.cloud/Enrichr/#libraries) are used to perform benchmarking:
1. TRANSFAC and JASPAR PWMs
2. TRRUST Transcription Factors 2019
3. Rummagene Transcription Factors

In [None]:
import pandas as pd
import numpy as np
import glob
import tqdm
import random
import matplotlib.pyplot as plt
import statistics as stats
import scipy.stats as spstats
from collections import defaultdict

#### First, load the full list of transcription factors used in the initial transcription factor ranking (build_assertions.ipynb)


In [None]:
# The first column of the mean ranks matrix lists all of the TFs identified by ChEA3.
# Read only that column and upper-case every name -- the result is our TF library.
path_to_mean_ranks = '/Users/anna/Projects/KG_UI/build_TF_network/build_network_out/mean_ranks_matrix.csv'
tf_library = [
    name.upper()
    for name in pd.read_csv(path_to_mean_ranks, usecols=[0]).iloc[:, 0].tolist()
]

#### Next, load the GMT files as source-target pairs. Only include targets that are in the ChEA3 library list.

In [None]:
benchmark_dir = './raw_data/benchmarking_libraries'

# glob.glob returns matches in arbitrary (filesystem-dependent) order, so sort
# them to get a fixed, reproducible processing order for the library files
files = sorted(glob.glob(f'{benchmark_dir}/*.txt'))

# format filenames: drop the directory, keep the text before the first ".",
# and replace underscores with spaces (same order as `files`)
filenames = [(file.split("/")[-1]).split(".")[0].replace("_", " ") for file in files]

In [None]:
def find_library_edges(lib):
    """Count the TF-TF edges (source TF -> target TF) found in one library file.

    Every non-blank line is split on tabs: field 0 is the term, field 1 is
    skipped, and fields 2+ are the targets. The source TF is the last
    space-separated token of the term for 'Rummagene_transcription_factors.txt'
    and the first token for every other library. Sources and targets are
    upper-cased before they are compared.

    Side effect: every source that is not yet in the module-level
    ``tf_library`` list is appended to it (a source is a TF by construction).

    NOTE(review): because sources are added while the file is being read, a
    target is only kept if it is already known at that moment. The result can
    therefore depend on line order and on which libraries were processed
    earlier -- confirm whether that is intended.

    Args:
        lib: path to a tab-delimited library (GMT) file.

    Returns:
        DataFrame indexed by a ('source', 'target') MultiIndex with a single
        integer column 'count' holding how often each pair was seen.
    """
    # the library name does not change per line, so decide the term layout once
    libname = lib.split("/")[-1]
    source_is_last_token = libname == 'Rummagene_transcription_factors.txt'

    # set mirror of tf_library for O(1) membership tests; kept in sync below
    known_tfs = set(tf_library)
    edges = defaultdict(int)

    with open(lib, 'r') as f:
        for line in f:
            # extract the gmt term and gene set
            items = line.strip().split("\t")
            term = items[0]

            # skip blank lines so an empty string is never registered as a TF
            if not term:
                continue

            tokens = term.split(" ")
            source = (tokens[-1] if source_is_last_token else tokens[0]).upper()

            # a source is a TF by definition -- register it if it is new
            if source not in known_tfs:
                known_tfs.add(source)
                tf_library.append(source)

            # count every (source, target) pair whose target is a known TF
            for target in items[2:]:
                target = target.upper()
                if target in known_tfs:
                    edges[(source, target)] += 1

    # build the frame in one step instead of assigning row by row
    edgeidx = pd.MultiIndex.from_tuples(list(edges), names=['source', 'target'])
    edgedf = pd.DataFrame({'count': list(edges.values())}, index=edgeidx)

    return edgedf.astype({'count': 'int64'})

# 
def benchmark_edges(network_type, n_trials=100, seed=None):
    """Benchmark one filtered network against every library in ``files``.

    For each library the function counts how many network edges also occur in
    the library, both as directed (source, target) pairs and as undirected
    pairs. It then compares those counts with ``n_trials`` random networks of
    the same size, and derives a one-sided p-value from a normal approximation
    of the random overlaps.

    Args:
        network_type: name of the sub-directory holding 'edge_list_filtered.csv';
            also used as the name of the results CSV.
        n_trials: number of random networks per library (at least 2, because a
            sample standard deviation is computed).
        seed: optional seed for the random networks. When None the module-level
            ``random`` generator is used, as before.

    Returns:
        DataFrame with one column per library file and one row per statistic.
        The same table is written to './benchmarking_results/{network_type}.csv'.

    Raises:
        ValueError: if the network has no edges or ``n_trials`` is below 2.
    """
    import os  # local import: only needed to create the output directory

    # open network edge file and extract source,target tuples
    network_edge_file = f'/Users/anna/Projects/KG_UI/build_TF_network/filter_assertions_out/benchmarking/{network_type}/edge_list_filtered.csv'
    network_edges = pd.read_csv(network_edge_file, usecols=[0, 1])
    n_ntwrk_edges = len(network_edges)

    if n_ntwrk_edges == 0:
        raise ValueError(f"no edges found in {network_edge_file}")
    if n_trials < 2:
        raise ValueError("n_trials must be at least 2")

    rng = random if seed is None else random.Random(seed)

    # these depend only on the network, so build them once for all libraries;
    # labels are sorted so a seeded run does not depend on set iteration order
    ntwrk_pairs = list(zip(network_edges['source'], network_edges['target']))
    ntwrk_source_labels = sorted(set(network_edges['source']))
    ntwrk_target_labels = sorted(set(network_edges['target']))

    results = {file: None for file in files}

    # for each benchmarking library
    for benchmarklib in results.keys():
        # find the edges in the library
        library_counts = find_library_edges(benchmarklib)

        # sets give O(1) lookups, and each network edge is counted at most once
        # (an inner merge on duplicated undirected pairs multiplies matches)
        library_directed = set(library_counts.index)
        library_undirected = {tuple(sorted(edge)) for edge in library_directed}

        # fraction of network edges that are also library edges
        overlap = _count_edge_hits(ntwrk_pairs, library_directed, True) / n_ntwrk_edges
        undirected_overlap = _count_edge_hits(ntwrk_pairs, library_undirected, False) / n_ntwrk_edges

        trial_results = {
            'directed': np.zeros(n_trials),
            'undirected': np.zeros(n_trials)
        }

        for i in tqdm.tqdm(range(n_trials)):
            # random network of the same size: sources and targets are drawn
            # uniformly, with replacement, from the network's own labels
            # NOTE(review): an earlier comment described sampling weighted by the
            # library's node distribution, but no weights were ever applied
            rand_sources = rng.choices(ntwrk_source_labels, k=n_ntwrk_edges)
            rand_targets = rng.choices(ntwrk_target_labels, k=n_ntwrk_edges)
            random_pairs = list(zip(rand_sources, rand_targets))

            # calculate the overlap and save
            trial_results['directed'][i] = _count_edge_hits(random_pairs, library_directed, True) / n_ntwrk_edges
            trial_results['undirected'][i] = _count_edge_hits(random_pairs, library_undirected, False) / n_ntwrk_edges

        expected_overlap = stats.mean(trial_results['directed'])
        overlap_stdev = stats.stdev(trial_results['directed'])

        expected_overlap_undir = stats.mean(trial_results['undirected'])
        overlap_stdev_undir = stats.stdev(trial_results['undirected'])

        # only calculated for cases where there is more than expected overlap
        p_value = _upper_tail_p_value(overlap, expected_overlap, overlap_stdev)
        undir_p = _upper_tail_p_value(undirected_overlap, expected_overlap_undir, overlap_stdev_undir)

        # NOTE: the overlap rows are edge counts (fraction * network size),
        # while the stdev rows stay in overlap-fraction units
        results[benchmarklib] = {
            'library network size': len(library_counts),
            'input network size': n_ntwrk_edges,
            'observed overlap, directed': overlap * n_ntwrk_edges,
            'observed overlap, undirected': undirected_overlap * n_ntwrk_edges,
            'expected overlap, directed': expected_overlap * n_ntwrk_edges,
            'expected overlap, undirected': expected_overlap_undir * n_ntwrk_edges,
            'stdev, directed': overlap_stdev,
            'stdev, undirected': overlap_stdev_undir,
            'p-value directed': p_value,
            'p-value, undirected': undir_p
            }

    resultsdf = pd.DataFrame(results)

    # make sure the output directory exists so the finished results are not lost
    os.makedirs("./benchmarking_results", exist_ok=True)
    resultsdf.to_csv(f"./benchmarking_results/{network_type}.csv")
    return resultsdf


def _count_edge_hits(edges, reference, directed):
    """Count how many (source, target) pairs in `edges` occur in the set `reference`.

    When `directed` is False each pair is sorted first, so `reference` must
    hold sorted tuples.
    """
    if directed:
        return sum(edge in reference for edge in edges)
    return sum(tuple(sorted(edge)) in reference for edge in edges)


def _upper_tail_p_value(observed, expected, stdev):
    """Return the one-sided normal p-value of `observed`, or None if it is not above `expected`."""
    if not observed > expected:
        return None
    if stdev == 0:
        # every random trial gave the same overlap: the z-score is unbounded
        return 0.0
    # the survival function keeps precision where 1 - cdf(z) rounds to 0
    return spstats.norm.sf((observed - expected) / stdev)

In [None]:
# run the directed and undirected benchmark once per network type;
# network_results holds one results table per entry of ntypes, in the same order
ntypes = ['signature_shuffling', 'edge_weighted', 'node_weighted']
network_results = list(map(benchmark_edges, ntypes))