# GRNBoost2 output adapter

In [11]:
import os
import sys
import glob
import pandas as pd
import numpy as np
from scipy.stats import ttest_1samp

In [2]:
example_out_dir = "/cellar/users/aklie/projects/igvf/topic_grn_links/grn_inference/grnboost2/results/Bridge_Satpathy/balanced_genotype_microglia/0.05/raw/B6J"

In [9]:
adj_tsvs = sorted(glob.glob(os.path.join(example_out_dir, "*.tsv")))
adj_tsvs

['/cellar/users/aklie/projects/igvf/topic_grn_links/grn_inference/grnboost2/results/Bridge_Satpathy/balanced_genotype_microglia/0.05/raw/B6J/run1_adj.tsv',
 '/cellar/users/aklie/projects/igvf/topic_grn_links/grn_inference/grnboost2/results/Bridge_Satpathy/balanced_genotype_microglia/0.05/raw/B6J/run2_adj.tsv',
 '/cellar/users/aklie/projects/igvf/topic_grn_links/grn_inference/grnboost2/results/Bridge_Satpathy/balanced_genotype_microglia/0.05/raw/B6J/run3_adj.tsv',
 '/cellar/users/aklie/projects/igvf/topic_grn_links/grn_inference/grnboost2/results/Bridge_Satpathy/balanced_genotype_microglia/0.05/raw/B6J/run4_adj.tsv',
 '/cellar/users/aklie/projects/igvf/topic_grn_links/grn_inference/grnboost2/results/Bridge_Satpathy/balanced_genotype_microglia/0.05/raw/B6J/run5_adj.tsv']

In [13]:
all_edges = pd.DataFrame()
for adj_tsv in adj_tsvs:
    adj_df = pd.read_csv(adj_tsv, sep="\t")
    adj_df["run"] = os.path.basename(adj_tsv).split(".")[0]
    all_edges = pd.concat([all_edges, adj_df])
all_edges.head()

Unnamed: 0,TF,target,importance,run
0,Eno1b,G630055G22Rik,872.974893,run1_adj
1,Zc3h11a,Zbed6,816.344353,run1_adj
2,Zbed6,Zc3h11a,815.475599,run1_adj
3,Atf7,Gm28047,796.05167,run1_adj
4,Gm28047,Atf7,777.347937,run1_adj


In [14]:
# Group by source and target and filter edges that appear in only one run
grouped = all_edges.groupby(['TF', 'target'])
filtered = grouped.filter(lambda x: len(x) > 1)
filtered

Unnamed: 0,TF,target,importance,run
0,Eno1b,G630055G22Rik,8.729749e+02,run1_adj
1,Zc3h11a,Zbed6,8.163444e+02,run1_adj
2,Zbed6,Zc3h11a,8.154756e+02,run1_adj
3,Atf7,Gm28047,7.960517e+02,run1_adj
4,Gm28047,Atf7,7.773479e+02,run1_adj
...,...,...,...,...
1451605,Ywhaz,Frmd4b,3.789460e-17,run5_adj
1451608,Trerf1,Pard3b,2.460765e-17,run5_adj
1451609,Etv6,Xist,2.163018e-17,run5_adj
1451610,Cnot4,Clint1,1.303673e-17,run5_adj


In [19]:
print(f"{len(all_edges) - len(filtered)} edges dropped")

650068 edges dropped


In [20]:
# Calculate mean importance for each edge
mean_importance = filtered.groupby(['TF', 'target'])['importance'].mean()

In [21]:
mean_importance

TF             target       
2010315B03Rik  0610012G03Rik    0.223045
               0610030E20Rik    0.266081
               1110004F10Rik    0.036324
               1110019D14Rik    0.159079
               1500004A13Rik    0.316529
                                  ...   
Zzz3           mt-Atp6          0.109764
               mt-Co1           0.053422
               mt-Co2           0.063543
               mt-Rnr1          0.098127
               mt-Rnr2          0.076723
Name: importance, Length: 1851654, dtype: float64

In [24]:
from tqdm.auto import tqdm
tqdm.pandas()

In [23]:
def calc_p_value(importances):
    _, p_value = ttest_1samp(importances, 0)
    return p_value

# Empirical p-value calculation
p_values_series = filtered.groupby(['TF', 'target'])['importance'].progress_apply(calc_p_value)
p_values = p_values_series.values

In [26]:
len(p_values)

1851654

In [27]:
# Calculate -log10 for the p-values
neg_log_p = -np.log10(p_values)

In [28]:
# Normalize the importance score using Min-Max normalization
normalized_importance = (mean_importance - mean_importance.min()) / (mean_importance.max() - mean_importance.min())

In [37]:
# Create the consolidated dataframe
consolidated = pd.DataFrame({
    'source': mean_importance.index.get_level_values('TF'),
    'target': mean_importance.index.get_level_values('target'),
    'weight_signed': np.nan,
    'weight_unsigned': mean_importance.values,
    'weight_minmax_normalized': normalized_importance.values,
    'p': p_values,
    '-logp': neg_log_p,
    'description': np.nan
}).reset_index(drop=True)

In [40]:
consolidated.sort_values('weight_unsigned', ascending=False)

Unnamed: 0,source,target,weight_signed,weight_unsigned,weight_minmax_normalized,p,-logp,description
909278,Nnt,Nnt.1,,8.984965e+02,1.000000e+00,2.827386e-03,2.548615,
1525662,Zbed6,Zc3h11a,,8.932668e+02,9.941794e-01,1.463887e-03,2.834493,
1798186,Zfp969,Zfp968,,8.785599e+02,9.778111e-01,8.300859e-04,3.080877,
1794411,Zfp968,Zfp969,,8.667597e+02,9.646778e-01,7.436492e-03,2.128632,
313560,Eno1b,G630055G22Rik,,8.287519e+02,9.223764e-01,6.773661e-07,6.169177,
...,...,...,...,...,...,...,...,...
1256722,Smarca1,Ap1m1,,4.197186e-09,4.671326e-12,4.999951e-01,0.301034,
678671,Lcor,Ftl1-ps1,,2.943033e-09,3.275491e-12,1.337898e-01,0.873577,
13661,Adarb1,Pik3cg,,1.526685e-09,1.699138e-12,4.999993e-01,0.301031,
989196,Phf21a,Slc7a2,,1.119726e-09,1.246204e-12,2.884945e-01,0.539862,


In [39]:
consolidated[consolidated["source"] == "Spi1"].sort_values('weight', ascending=False)

KeyError: 'weight'