# Set-up

In [178]:
import os
import sys
import glob
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from igraph import Graph
import celloracle as co

In [81]:
from celloracle.network_analysis.links_object import _thresholding
from sklearn.linear_model import LinearRegression as lr

def read_links_dict_from_metadata(metadata_df, id_col="grn_name", verbose=True):
    """Load one links table per metadata row.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        Must contain the column named by ``id_col`` and an
        ``"output_table_path"`` column holding the path of a tab-separated file.
    id_col : str
        Column whose value is used as the key of the returned dictionary.
    verbose : bool
        If True, print the number of links read for every table.

    Returns
    -------
    dict
        Maps each ``id_col`` value to the DataFrame read from its file.
    """
    names = metadata_df[id_col]
    paths = metadata_df["output_table_path"]
    links_dict = {}
    for name, links_file in tqdm(zip(names, paths), total=len(metadata_df)):
        links_dict[name] = pd.read_csv(links_file, sep="\t")
        if verbose:
            print(f"Finished reading {name} that has {len(links_dict[name])} links")
    return links_dict

def filter_links_dict(links_dict, p=None, weight="weight_unsigned", threshold_number=2000, verbose=True):
    """Apply celloracle's ``_thresholding`` to every links table in a dictionary.

    ``p``, ``weight`` and ``threshold_number`` are forwarded unchanged to
    ``_thresholding``. Returns a new dictionary with the same keys holding the
    thresholded tables; when ``verbose`` is True the before/after link counts
    are printed for each table.
    """
    filtered_links_dict = {}
    for name, links_df in tqdm(links_dict.items(), total=len(links_dict)):
        kept = _thresholding(links_df, p=p, weight=weight, threshold_number=threshold_number)
        if verbose:
            print(f"{name} went from {len(links_df)} to {len(kept)}")
        filtered_links_dict[name] = kept
    return filtered_links_dict

def create_igraph_from_links(links):
    """Build a directed igraph ``Graph`` from a links table.

    Parameters
    ----------
    links : pandas.DataFrame
        Needs ``"source"``, ``"target"`` and ``"weight_minmax_normalized"``
        columns. The frame is NOT modified.

    Returns
    -------
    igraph.Graph
        Directed graph whose vertices are named after source/target values and
        whose edges carry a ``"weight"`` attribute.

    Notes
    -----
    If the smallest weight is <= 0, 1e-6 is added to every edge weight of the
    graph. The shift is applied to a local copy only, so the caller's table
    (and anything already written to disk from it) stays unchanged.
    """
    g = Graph.DataFrame(links[["source", "target"]], directed=True, use_vids=False)
    weights = links["weight_minmax_normalized"]
    if weights.min() <= 0:
        print("Non-positive value detected in weight_minmax_normalized. Adding 1e-6 to all values")
        # `+` returns a new Series; the caller's column is left untouched.
        weights = weights + 1e-6
    g.es["weight"] = weights.values.copy()
    return g

def node_scores(g):
    """Compute per-vertex degree and centrality scores for graph ``g``.

    For each degree mode ("all", "in", "out") the table gets a ``degree_<mode>``
    column and a ``degree_centrality_<mode>`` column (degree divided by the
    number of vertices minus one). Betweenness is computed on the directed
    graph and eigenvector centrality with ``directed=False``; both use the
    ``"weight"`` edge attribute.

    Returns a DataFrame indexed by vertex name (index name cleared).
    """
    scores = g.get_vertex_dataframe()
    n_other_vertices = scores.shape[0] - 1
    for mode in ("all", "in", "out"):
        degree_col = f"degree_{mode}"
        scores[degree_col] = g.degree(mode=mode)
        scores[f"degree_centrality_{mode}"] = scores[degree_col] / n_other_vertices
    scores["betweenness_centrality"] = g.betweenness(directed=True, weights="weight")
    scores["eigenvector_centrality"] = g.eigenvector_centrality(directed=False, weights="weight")
    scores = scores.set_index("name")
    scores.index.name = None
    return scores

def node_scores_from_links_dict(links_dict, id_col="grn_name", verbose=True):
    """Compute node scores for every links table and stack them into one frame.

    Parameters
    ----------
    links_dict : dict
        Maps a network name to its links DataFrame.
    id_col : str
        Name of the column added to each per-network score table to record
        which network the row came from (default ``"grn_name"``).
    verbose : bool
        If True, print the name of each network as it is processed.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-network ``node_scores`` tables.

    Raises
    ------
    ValueError
        From ``pd.concat`` when ``links_dict`` is empty.
    """
    node_scores_lst = []
    for name, links in tqdm(links_dict.items(), total=len(links_dict)):
        if verbose:
            print(f"Processing {name}")
        df = node_scores(create_igraph_from_links(links))
        # Honour id_col instead of a hard-coded column name.
        df[id_col] = name
        node_scores_lst.append(df)
    return pd.concat(node_scores_lst, axis=0)

def scale_free_topology_score(g):
    """Return R^2 of a linear fit of log(degree frequency) on log(degree).

    Degrees are taken with ``mode="all"``; the frequency of each distinct
    degree value is its count divided by the total count.
    """
    degrees = pd.DataFrame(g.degree(mode="all"), columns=["degree"])
    counts = degrees.degree.value_counts()
    dist = counts / counts.sum()
    dist.index = dist.index.astype(int)
    log_degree = np.log(dist.index.values).reshape([-1, 1])
    log_frequency = np.log(dist.values).reshape([-1, 1])
    model = lr()
    model.fit(log_degree, log_frequency)
    return model.score(log_degree, log_frequency)

def mean_connectivity(g):
    """Return the mean vertex degree of ``g`` (degree taken with mode="all")."""
    degrees = g.degree(mode="all")
    return np.mean(degrees)

def median_connectivity(g):
    """Return the median vertex degree of ``g`` (degree taken with mode="all")."""
    degrees = g.degree(mode="all")
    return np.median(degrees)

def max_connectivity(g):
    """Return the largest vertex degree of ``g`` (degree taken with mode="all")."""
    degrees = g.degree(mode="all")
    return np.max(degrees)

def network_scores(g):
    """Return a Series of whole-network metrics for graph ``g``.

    The Series holds, in order: scale_free_topology_score, mean_connectivity,
    median_connectivity and max_connectivity.
    """
    metric_functions = {
        "scale_free_topology_score": scale_free_topology_score,
        "mean_connectivity": mean_connectivity,
        "median_connectivity": median_connectivity,
        "max_connectivity": max_connectivity,
    }
    return pd.Series({label: compute(g) for label, compute in metric_functions.items()})

def network_scores_from_links_dict(links_dict, verbose=True):
    """Compute whole-network metrics for every links table.

    Parameters
    ----------
    links_dict : dict
        Maps a network name to its links DataFrame.
    verbose : bool
        If True, print the name of each network as it is processed.

    Returns
    -------
    pandas.DataFrame
        One row per network (indexed by network name), one column per metric
        returned by ``network_scores``.
    """
    # Collect the per-network Series first and build the frame once; inserting
    # one column per network into a DataFrame fragments it as the count grows.
    scores_by_name = {}
    for name, links in tqdm(links_dict.items(), total=len(links_dict)):
        if verbose:
            print(f"Calculating network scores for {name}")
        scores_by_name[name] = network_scores(create_igraph_from_links(links))
    return pd.DataFrame(scores_by_name).T

In [11]:
# Run configuration: later cells join these as results_dir/dataset_name/in_date
# to locate the input metadata and to write every output of this notebook.
# NOTE(review): results_dir is an absolute, user-specific path; adjust before re-running elsewhere.
results_dir = "/cellar/users/aklie/projects/igvf/topic_grn_links/eval/results"
dataset_name = "meta_analysis"
in_date = "03Sep23"

# Load in the metadata

In [198]:
# Read the GRN metadata table (one row per network) for this run
metadata_df = pd.read_csv(os.path.join(results_dir, dataset_name, in_date, "grn_metadata.tsv"), sep="\t")
len(metadata_df)

120

In [199]:
# Preview the grn_name column: it is the per-network ID used below to key the
# links dictionaries and to name each network's output directory
metadata_df["grn_name"]

0      igvf_b01_LeftCortex_aracne_balanced_genotype_m...
1      igvf_b01_LeftCortex_aracne_balanced_genotype_m...
2      igvf_b01_LeftCortex_aracne_balanced_genotype_m...
3      igvf_b01_LeftCortex_aracne_balanced_genotype_m...
4      igvf_b01_LeftCortex_aracne_balanced_genotype_m...
                             ...                        
115    Bridge_Satpathy_wgcna_balanced_genotype_microg...
116    Bridge_Satpathy_wgcna_balanced_genotype_microg...
117    Bridge_Satpathy_wgcna_balanced_genotype_microg...
118    Bridge_Satpathy_wgcna_balanced_genotype_microg...
119    Bridge_Satpathy_wgcna_balanced_genotype_microg...
Name: grn_name, Length: 120, dtype: object

In [23]:
# Create a directory for each GRN (exist_ok makes the cell safe to re-run and
# avoids the separate exists() check)
for grn_name in metadata_df["grn_name"]:
    os.makedirs(os.path.join(results_dir, dataset_name, in_date, "grns", grn_name), exist_ok=True)

# Load in the GRNs

In [45]:
# Read in all the links: one DataFrame per row of metadata_df, keyed by grn_name
# (verbose=False suppresses the per-network "Finished reading ..." messages)
links_dict = read_links_dict_from_metadata(metadata_df, id_col="grn_name", verbose=False)

Finished reading igvf_b01_LeftCortex_aracne_balanced_genotype_microglia_B6J_0.05_raw that has 21163 links
Finished reading igvf_b01_LeftCortex_aracne_balanced_genotype_microglia_CASTJ_0.05_raw that has 1860 links
Finished reading igvf_b01_LeftCortex_aracne_balanced_genotype_microglia_both_0.05_raw that has 19138 links
Finished reading igvf_b01_LeftCortex_aracne_balanced_genotype_microglia_B6J_0.05_log1p_cp10k that has 57597 links
Finished reading igvf_b01_LeftCortex_aracne_balanced_genotype_microglia_CASTJ_0.05_log1p_cp10k that has 37427 links
Finished reading igvf_b01_LeftCortex_aracne_balanced_genotype_microglia_both_0.05_log1p_cp10k that has 56634 links
Finished reading igvf_b01_LeftCortex_aracne_balanced_genotype_microglia_B6J_0.05_pf_log1p_pf that has 54815 links
Finished reading igvf_b01_LeftCortex_aracne_balanced_genotype_microglia_CASTJ_0.05_pf_log1p_pf that has 36845 links
Finished reading igvf_b01_LeftCortex_aracne_balanced_genotype_microglia_both_0.05_pf_log1p_pf that has 51

In [46]:
# Threshold the links of every network with celloracle's _thresholding.
# NOTE(review): presumably this keeps the top 2000 links ranked by
# weight_unsigned (p=None disables the p-value filter) — confirm against celloracle.
filtered_links_dict = filter_links_dict(links_dict, p=None, weight="weight_unsigned", threshold_number=2000, verbose=False)

In [50]:
# Write each thresholded links table, sorted by descending
# weight_minmax_normalized, into its network's directory and remember the
# paths in metadata_df row order.
filtered_links_file_lst = []
for name in tqdm(metadata_df["grn_name"], total=len(metadata_df)):
    file = os.path.join(results_dir, dataset_name, in_date, "grns", name, "filterd_links.tsv")
    links_df = filtered_links_dict[name].sort_values("weight_minmax_normalized", ascending=False)
    links_df.to_csv(file, sep="\t", index=False)
    filtered_links_file_lst.append(file)

  0%|          | 0/120 [00:00<?, ?it/s]

# Network metrics

In [64]:
# Whole-network metrics for every thresholded GRN (rows: GRN name, columns: metric)
network_metric_df = network_scores_from_links_dict(links_dict=filtered_links_dict, verbose=False)

  0%|          | 0/120 [00:00<?, ?it/s]

In [65]:
# Save the network metrics table (the GRN-name index is written as the first column)
network_metric_df.to_csv(os.path.join(results_dir, dataset_name, in_date, "network_metrics.tsv"), sep="\t")

# TF rankings

In [None]:
# Per-node scores of every thresholded GRN stacked into one table; rows are
# indexed by node name and tagged with their network in the "grn_name" column
merged_node_scores = node_scores_from_links_dict(filtered_links_dict, verbose=False)

## Get all unique TFs and subset to those

In [183]:
# Reference TF lists. The SCENIC and ARACNe files have no header row and the
# first column is used; the literature list is read from the "regulator" column.
scenic_tfs = pd.read_csv("/cellar/users/aklie/opt/igvf-ucsd/grnboost2_pipeline/data/tf_lists/allTFs_mm.txt", header=None)[0].values
aracne_tfs = pd.read_csv("/cellar/users/aklie/opt/igvf-ucsd/aracne_pipeline/data/tf_cotf_signalling_list/mouse/tf_mus_symbol.txt", header=None)[0].values
literature_tfs = pd.read_csv("/cellar/users/aklie/projects/igvf/topic_grn_links/data/refs/microglia_regulators.csv")["regulator"].values
# NOTE(review): assumes the first two columns of the base GRN are not TFs and
# every remaining column is named after a TF — confirm against celloracle docs.
celloracle_tfs = co.data.load_mouse_scATAC_atlas_base_GRN().columns[2:]

In [184]:
# Union of all "source" genes across the thresholded GRNs, plus the number of
# distinct sources per GRN (kept in metadata_df row order).
tfs = set()
number_tfs_per_grn = []
for name in tqdm(metadata_df["grn_name"], total=len(metadata_df)):
    grn_tfs = filtered_links_dict[name]["source"].unique()
    tfs.update(grn_tfs)
    number_tfs_per_grn.append(len(grn_tfs))
len(tfs)

  0%|          | 0/120 [00:00<?, ?it/s]

709

In [185]:
# Number of collected TFs that also appear in scenic_tfs
len(set(scenic_tfs) & tfs)

555

In [186]:
# How many tfs in aracne_tfs
len(set(aracne_tfs).intersection(tfs))

456

In [188]:
# Number of collected TFs that also appear in celloracle_tfs
len(set(celloracle_tfs) & tfs)

223

In [189]:
# Number of collected TFs that also appear in literature_tfs
len(set(literature_tfs) & tfs)

9

In [190]:
# Literature TFs that are not a source in any thresholded GRN
set(literature_tfs) - tfs

{'Usf1'}

In [193]:
# One row per collected TF, with a boolean in_<source> column for each
# reference TF list recording whether the TF appears in that list
tf_df = pd.DataFrame(index=list(tfs))
for list_name, tf_list in (
    ("scenic", scenic_tfs),
    ("aracne", aracne_tfs),
    ("celloracle", celloracle_tfs),
    ("literature", literature_tfs),
):
    tf_df[f"in_{list_name}"] = tf_df.index.isin(tf_list)

In [194]:
# Keep only the rows of nodes that act as a source (TF) in at least one GRN.
# A list is passed because pandas no longer accepts a set as a .loc indexer.
tf_only_scores = merged_node_scores.loc[list(tfs)]

In [195]:
# Save the TF-only node scores (the TF-name index is written as the first column)
tf_only_scores.to_csv(os.path.join(results_dir, dataset_name, in_date, "tf_only_network_scores.tsv"), sep="\t")

In [196]:
# Count the rows each TF has in tf_only_scores (grouping by the index).
# NOTE(review): this equals the number of GRNs containing the TF only if each
# TF has exactly one row per GRN — confirm.
tf_df["number_grns"] = tf_only_scores.groupby(tf_only_scores.index).size()

In [197]:
# Save the per-TF metadata table (the TF-name index is written as the first column)
tf_df.to_csv(os.path.join(results_dir, dataset_name, in_date, "tf_metadata.tsv"), sep="\t")

## Network based measures

In [116]:
# Node-score columns used to rank TFs; each one yields a <metric>.tsv per GRN below
metrics = ["degree_centrality_out", "eigenvector_centrality", "betweenness_centrality"]

In [121]:
# For every GRN and every metric: rank the TFs (highest value gets rank 1, ties
# share the lowest rank), expose the metric as "rank_weight", and write the
# two-column table to the GRN's directory as <metric>.tsv
for grn, group_df in tf_only_scores.groupby("grn_name"):
    grn_dir = os.path.join(results_dir, dataset_name, in_date, "grns", grn)
    for metric in metrics:
        ranked = group_df.copy()
        ranked["rank"] = ranked[metric].rank(ascending=False, method="min")
        ranked = ranked.sort_values("rank").rename(columns={metric: "rank_weight"})
        ranked = ranked[["rank_weight", "rank"]]
        ranked.to_csv(os.path.join(grn_dir, f"{metric}.tsv"), sep="\t")

## Expression and network based measures

In [124]:
from scipy import sparse
import decoupler as dc

In [128]:
def _save_ranked_activities(adata, obsm_key, out_file):
    """Rank sources by mean activity and write the ranking to ``out_file``.

    Activities are read from ``adata.obsm[obsm_key]`` through ``dc.get_acts``.
    The written table has the source name as index and two columns:
    ``rank_weight`` (mean of the activity matrix over axis 0) and ``rank``
    (descending rank of ``rank_weight``).
    """
    acts = dc.get_acts(adata, obsm_key=obsm_key)
    acts.var["rank_weight"] = acts.X.mean(axis=0)
    acts.var["rank"] = acts.var["rank_weight"].rank(ascending=False)
    acts_ranked = acts.var.sort_values("rank")[["rank_weight", "rank"]]
    acts_ranked.index.name = None
    acts_ranked.to_csv(out_file, sep="\t")


for i, row in tqdm(metadata_df.iterrows(), total=len(metadata_df)):

    # Read in the expression data (the .h5ad sits next to the GRN input file)
    h5ad_file = row["in_file_path"].replace(".tsv", ".h5ad").replace(".loom", ".h5ad")
    adata = sc.read_h5ad(h5ad_file)

    # Get the run name and its output directory
    grn_name = row["grn_name"]
    grn_dir = os.path.join(results_dir, dataset_name, in_date, "grns", grn_name)

    # Get the run specific GRN
    grn = filtered_links_dict[grn_name]

    # Make sure the adata uses the normalized counts as a dense matrix.
    # .toarray() replaces the .A shorthand, which newer SciPy no longer provides.
    adata.raw = adata.copy()
    if sparse.issparse(adata.layers["normalized_counts"]):
        adata.X = adata.layers["normalized_counts"].toarray()
    else:
        adata.X = adata.layers["normalized_counts"]

    # Run decoupler with ULM and save the ranked TFs
    dc.run_ulm(
        mat=adata,
        net=grn,
        source='source',
        target='target',
        weight='weight_minmax_normalized',
        verbose=False,
        use_raw=False
    )
    _save_ranked_activities(adata, 'ulm_estimate', os.path.join(grn_dir, "ulm.tsv"))

    # VIPER scoring is disabled in this notebook, so this cell writes no viper.tsv.

    # Run decoupler with AUCell and save the ranked TFs
    dc.run_aucell(
        mat=adata,
        net=grn,
        source='source',
        target='target',
        verbose=False,
        use_raw=False
    )
    _save_ranked_activities(adata, 'aucell_estimate', os.path.join(grn_dir, "aucell.tsv"))

  0%|          | 0/120 [00:00<?, ?it/s]

# Add everything to metadata table

In [152]:
# Keep only the descriptive columns of the metadata, as an independent copy
cleaned_metadata_df = metadata_df[
    ["grn_name", "dataset", "method", "cells", "genes", "normalization", "genotype"]
].copy()

In [153]:
# Attach the whole-network metrics, matching grn_name against the index of network_metric_df
cleaned_metadata_df = cleaned_metadata_df.merge(network_metric_df, left_on="grn_name", right_index=True)

In [154]:
# Both lists were built by looping over metadata_df in row order and are
# assigned positionally here.
# NOTE(review): assumes the merge above kept every row in the original order — confirm.
cleaned_metadata_df["number_tfs"] = number_tfs_per_grn
cleaned_metadata_df["filtered_links_path"] = filtered_links_file_lst

In [155]:
# Record, for every GRN, the path of each ranking table as a <ranking>_path column.
# NOTE(review): no visible cell writes viper.tsv (VIPER is disabled above), so
# viper_path likely points at files that do not exist — confirm.
for ranking in (
    "degree_centrality_out",
    "eigenvector_centrality",
    "betweenness_centrality",
    "ulm",
    "viper",
    "aucell",
):
    cleaned_metadata_df[f"{ranking}_path"] = cleaned_metadata_df["grn_name"].apply(
        lambda x, ranking=ranking: os.path.join(results_dir, dataset_name, in_date, "grns", x, f"{ranking}.tsv")
    )

In [175]:
# Save the final metadata table (the row index is written as the first column)
cleaned_metadata_df.to_csv(os.path.join(results_dir, dataset_name, in_date, "cleaned_grn_metadata.tsv"), sep="\t")

# DONE!

---