# Set-up

In [1]:
import os
import sys
import glob
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
import seaborn as sns

In [13]:
# Output location settings. The cells below write their files under
# <results_dir>/<dataset_name>/<in_date>/.
results_dir = "/cellar/users/aklie/projects/igvf/topic_grn_links/eval/results"  # root of the results tree
dataset_name = "meta_analysis"  # sub-directory for this analysis
in_date = "03Sep23"  # run-specific sub-directory (date label)

# Load in the topic results

In [2]:
# Wide gene-weight table read from the Topyfic output directory: the first CSV
# column becomes the index, and every remaining column is handled as one topic
# by the export loop below.
microglia_topics = pd.read_csv("/cellar/users/aklie/opt/IGVF_Topyfic/bridge_samples/microglia_reg_gene/6/tables/gene_weights.csv", index_col=0)

In [33]:
# Split the wide gene-by-topic table into one ranked table per topic and write
# each one to
#   <results_dir>/<dataset_name>/<in_date>/topics/<topic>/topic_gene_weights.tsv
# Each per-topic table keeps the gene name as its index and has two columns:
# the weight (exported under the header "rank_weight") and its rank.
result_files = []  # path of every TSV written, in column order
tfs = set()  # every index label (gene name) seen in the per-topic tables
for column in microglia_topics.columns:
    # Topic identifier = the last two "_"-separated fields of the column name.
    topic_num = "_".join(column.split("_")[-2:])

    # rank 1 = largest weight; tied weights share their average rank (pandas
    # default). Rows are ordered by ascending rank before export.
    df = (
        microglia_topics[[column]]
        .rename(columns={column: "rank_weight"})
        .assign(rank=lambda weights: weights["rank_weight"].rank(ascending=False))
        .sort_values(by="rank")
    )

    topic_dir = os.path.join(results_dir, dataset_name, in_date, "topics", topic_num)
    # exist_ok=True replaces the exists()-then-makedirs() pair with one call
    # that is safe to re-run.
    os.makedirs(topic_dir, exist_ok=True)

    out_tsv = os.path.join(topic_dir, "topic_gene_weights.tsv")
    df.to_csv(out_tsv, sep="\t")
    result_files.append(out_tsv)
    # Every per-topic frame shares the index of microglia_topics, so this set
    # ends up holding that index's labels.
    tfs.update(df.index)

In [35]:
# Number of distinct index labels (gene names) collected from the per-topic tables.
len(tfs)

2685

In [11]:
# NOTE(review): `df` is whatever the per-topic loop left behind (its last topic).
# This re-sorts that frame in place with the largest rank value (smallest weight)
# first. No later cell in this notebook reads `df`, and the TSVs were already
# written inside the loop, so this looks like leftover exploration - confirm
# whether the cell is still needed.
df.sort_values(by="rank", ascending=False, inplace=True)

In [25]:
# One metadata row per exported topic table. The run-level descriptors are
# scalars, so every row carries the same value for each of them.
metadata_df = pd.DataFrame({"gene_weight_path": result_files}).assign(
    dataset="igvf_b01_LeftCortex",
    method="topyfic",
    genotype="both",
    genes="reg_genes",
    cells="microglia",
    normalization="pf_log1p_pf",
)
metadata_df.head()

Unnamed: 0,gene_weight_path,dataset,method,genotype,genes,cells,normalization
0,/cellar/users/aklie/projects/igvf/topic_grn_li...,igvf_b01_LeftCortex,topyfic,both,reg_genes,microglia,pf_log1p_pf
1,/cellar/users/aklie/projects/igvf/topic_grn_li...,igvf_b01_LeftCortex,topyfic,both,reg_genes,microglia,pf_log1p_pf
2,/cellar/users/aklie/projects/igvf/topic_grn_li...,igvf_b01_LeftCortex,topyfic,both,reg_genes,microglia,pf_log1p_pf
3,/cellar/users/aklie/projects/igvf/topic_grn_li...,igvf_b01_LeftCortex,topyfic,both,reg_genes,microglia,pf_log1p_pf
4,/cellar/users/aklie/projects/igvf/topic_grn_li...,igvf_b01_LeftCortex,topyfic,both,reg_genes,microglia,pf_log1p_pf


In [28]:
# Label each row by joining the descriptive fields with "_" in the order
# dataset, method, cells, genotype, genes, normalization.
# NOTE(review): all six fields are constant across rows, so every topic receives
# the same grn_name - confirm that is intended.
metadata_df["grn_name"] = metadata_df["dataset"].str.cat(
    [
        metadata_df["method"],
        metadata_df["cells"],
        metadata_df["genotype"],
        metadata_df["genes"],
        metadata_df["normalization"],
    ],
    sep="_",
)
metadata_df.head()

Unnamed: 0,gene_weight_path,dataset,method,genotype,genes,cells,normalization,grn_name
0,/cellar/users/aklie/projects/igvf/topic_grn_li...,igvf_b01_LeftCortex,topyfic,both,reg_genes,microglia,pf_log1p_pf,igvf_b01_LeftCortex_topyfic_microglia_both_reg...
1,/cellar/users/aklie/projects/igvf/topic_grn_li...,igvf_b01_LeftCortex,topyfic,both,reg_genes,microglia,pf_log1p_pf,igvf_b01_LeftCortex_topyfic_microglia_both_reg...
2,/cellar/users/aklie/projects/igvf/topic_grn_li...,igvf_b01_LeftCortex,topyfic,both,reg_genes,microglia,pf_log1p_pf,igvf_b01_LeftCortex_topyfic_microglia_both_reg...
3,/cellar/users/aklie/projects/igvf/topic_grn_li...,igvf_b01_LeftCortex,topyfic,both,reg_genes,microglia,pf_log1p_pf,igvf_b01_LeftCortex_topyfic_microglia_both_reg...
4,/cellar/users/aklie/projects/igvf/topic_grn_li...,igvf_b01_LeftCortex,topyfic,both,reg_genes,microglia,pf_log1p_pf,igvf_b01_LeftCortex_topyfic_microglia_both_reg...


In [31]:
# Column order for export: identifier first, descriptive fields next, file path last.
cleaned_metadata_df = metadata_df.loc[
    :,
    [
        "grn_name",
        "dataset",
        "method",
        "cells",
        "genes",
        "normalization",
        "genotype",
        "gene_weight_path",
    ],
]

In [32]:
# Write the reordered metadata as a tab-separated file in the run directory;
# index=False keeps the integer row index out of the file.
cleaned_metadata_df.to_csv(os.path.join(results_dir, dataset_name, in_date, "cleaned_topic_metadata.tsv"), sep="\t", index=False)

In [38]:
# Load the existing TF metadata table from the run directory. The path is built
# from the config variables (results_dir / dataset_name / in_date) rather than
# repeated as a literal, so it stays in step with the directory the output
# files are written to. The first column of the file becomes the index.
grn_tf_metadata = pd.read_csv(
    os.path.join(results_dir, dataset_name, in_date, "tf_metadata.tsv"),
    sep="\t",
    index_col=0,
)

In [42]:
# Display the loaded table: rows are indexed by gene symbol, with boolean in_*
# flag columns and a number_grns count column (see the output below).
grn_tf_metadata

Unnamed: 0,in_scenic,in_aracne,in_celloracle,in_literature,number_grns
Runx1t1,False,True,False,False,41
Prdm15,True,True,False,False,27
Zfp445,True,True,False,False,26
Gm14308,True,False,False,False,21
Gas7,False,True,False,False,31
...,...,...,...,...,...
Gm6710,True,False,False,False,22
Suclg1,True,False,False,False,6
Nfia,True,True,True,False,88
Hcfc2,True,False,False,False,25


In [46]:
# Flag each row whose index label is present in `tfs`, the set of gene names
# collected from the per-topic weight tables.
grn_tf_metadata["in_topics"] = grn_tf_metadata.index.isin(tfs)

In [50]:
# Spot-check a single row (Maf) to confirm the new in_topics column is populated.
grn_tf_metadata.loc["Maf"]

in_scenic         True
in_aracne         True
in_celloracle     True
in_literature    False
number_grns         80
in_topics         True
Name: Maf, dtype: object

In [51]:
# Save the table, now including in_topics, under a new file name in the run
# directory; the index is written as the first column.
grn_tf_metadata.to_csv(os.path.join(results_dir, dataset_name, in_date, "tf_metadata_topics.tsv"), sep="\t")

In [30]:
# Can add more topic info to this dataframe

# DONE!

---