In [1]:
# load packages
import pickle
import numpy as np
from pyscenic.cli.utils import load_signatures
import os
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from pyscenic.genesig import Regulon

In [12]:
# Load the regulons of each of the 10 runs. Run i is read from
# results<i>/epi_hopper.regulons.dat (the ctx output) with pyscenic's load_signatures.
sig = [
    load_signatures(
        os.path.join('/home/yilin/hopper/results'+str(i), "epi_hopper.regulons.dat")
    )
    for i in range(10)
]

In [42]:
# `sig` holds one entry per SCENIC run; each entry contains the regulons loaded for that run.
# Index first by run, then by position within the run, to inspect a single regulon:
sig[0][0]

Regulon(name='AHRR(+)', gene2weight=frozendict({'USP32': 2.2412686592663893, 'ZNF438': 3.2716864712349847, 'N6AMT1': 2.850528071410117, 'MFN2': 3.114829130579938, 'MAGI3': 2.7997305972297664, 'C4orf33': 2.5130866145896693, 'PLEKHH2': 3.242807042852561}), gene2occurrence=frozendict({}), transcription_factor='AHRR', context=frozenset({'activating', 'flyfactorsurvey__ss_tgo_SANGER_10_FBgn0003513.png'}), score=0.7949965679744551, nes=0.0, orthologous_identity=0.0, similarity_qvalue=0.0, annotation='')

In [41]:
# Collect the transcription-factor (TF) name of every regulon, run by run.
# `regulons_name` starts as an empty placeholder and is reassigned once get_name is defined.
regulons_name = []
def get_name(sig):
    """Return the ``transcription_factor`` of each regulon in ``sig``, in order."""
    return [regulon.transcription_factor for regulon in sig]
# Flatten the per-run TF names into one list covering all runs.
regulons_name = [tf_name for run in sig for tf_name in get_name(run)]

In [19]:
# Count how often each TF name occurs in `regulons_name`, i.e. across all runs.
# The result maps TF name -> number of regulons found for that TF.
regulons_count = Counter(regulons_name)

In [24]:
# Histogram of the per-TF counts: maps an occurrence count to the number of TFs
# that occur exactly that many times across the runs.
count = Counter(regulons_count.values())
count

Counter({2: 51,
         9: 28,
         5: 25,
         10: 145,
         1: 109,
         7: 17,
         8: 18,
         6: 24,
         3: 59,
         4: 36})

In [29]:
# Generic dictionary filter; used below to apply the 80% occurrence threshold.
def filterTheDict(dictObj, callback):
    """Return a new dict holding the items of ``dictObj`` accepted by ``callback``.

    ``callback`` is called once per item with a ``(key, value)`` tuple; the item
    is kept when the return value is truthy. ``dictObj`` is not modified and the
    original item order is preserved.
    """
    return {item[0]: item[1] for item in dictObj.items() if callback(item)}

# Keep only the TFs whose regulon recurs in at least 80% of the runs.
# The threshold is expressed as a fraction of the number of loaded runs, so it
# stays at 80% if the number of runs changes (for 10 runs this is count >= 8).
MIN_RUN_FRACTION = 0.8
n_runs = len(sig)
# The result is a new dictionary (TF name -> count) with the TFs above the threshold.
filter_regulon_dict = filterTheDict(
    regulons_count, lambda elem: elem[1] / n_runs >= MIN_RUN_FRACTION
)

In [35]:
# TF names that passed the occurrence threshold, in the dictionary's insertion order
regulons_filter = list(filter_regulon_dict.keys())
len(regulons_filter)

191

In [36]:
# NOTE(review): `filter_sig` is never read in the cells shown here; the filtered
# runs end up in `sig_filter`. It looks like a leftover name -- confirm before removing.
filter_sig = []
def filter_regulon(sig):
    """Keep the regulons of ``sig`` whose TF name is listed in ``regulons_filter``.

    ``regulons_filter`` is read from the notebook namespace at call time.
    The relative order of the kept regulons is unchanged.
    """
    return [regulon for regulon in sig if regulon.transcription_factor in regulons_filter]
# Apply the TF filter to every run, keeping the per-run grouping.
sig_filter = [filter_regulon(run) for run in sig]

In [51]:
def get_gene2weight(regulon,sig):
    """Return ``gene2weight`` of the first regulon in ``sig`` whose TF is ``regulon``.

    ``sig`` is the collection of regulons of a single run. Raises ``IndexError``
    when no regulon in ``sig`` has that transcription factor.
    """
    matches = [candidate for candidate in sig if candidate.transcription_factor == regulon]
    first_match = matches[0]
    return first_match.gene2weight

def get_occurrence(regulon,sig):
    """Return ``{gene: 1}`` for every target gene of the first regulon of TF ``regulon``.

    ``sig`` is the collection of regulons of a single run. The result marks each
    gene in that regulon's ``gene2weight`` as seen once, so it can be summed
    across runs. Raises ``IndexError`` when the TF has no regulon in ``sig``.
    """
    matches = [candidate for candidate in sig if candidate.transcription_factor == regulon]
    target_genes = matches[0].gene2weight
    return dict.fromkeys(target_genes, 1)

# Aggregate one TF's regulon across pySCENIC runs and compute the average gene weight.
# Returns a dictionary with target gene as key and average gene2weight as value.
def aggregate_regulon(regulon, runs=None):
    """Average the target-gene weights of one TF's regulon over several runs.

    Parameters
    ----------
    regulon : str
        Transcription-factor name, compared with each regulon's
        ``transcription_factor``.
    runs : iterable of iterables of regulons, optional
        One collection of regulons per run. Defaults to the notebook-level
        ``sig`` so existing single-argument calls keep working.

    Returns
    -------
    dict
        ``{target gene: mean weight}``. For each gene the mean is taken over
        the runs in which the gene is part of this TF's regulon, not over all
        runs. An empty dict is returned when the TF has no regulon in any run.
    """
    if runs is None:
        runs = sig

    weight_sum = {}   # gene -> sum of weights over the runs containing it
    n_runs_seen = {}  # gene -> number of runs containing it
    for run in runs:
        # Only the first regulon of this TF within a run is used; any further
        # regulon of the same TF in that run is ignored.
        match = next((r for r in run if r.transcription_factor == regulon), None)
        if match is None:
            continue
        for gene, gene_weight in match.gene2weight.items():
            weight_sum[gene] = weight_sum.get(gene, 0) + gene_weight
            n_runs_seen[gene] = n_runs_seen.get(gene, 0) + 1

    # Computed once after all runs are accumulated, and always defined.
    return {gene: weight_sum[gene] / n_runs_seen[gene] for gene in weight_sum}

In [53]:
# Build the aggregated Regulon object for one TF
def create_agg_regulon(regulon):
    """Return a ``Regulon`` for TF ``regulon`` carrying its run-averaged gene weights.

    The TF name is used both as the regulon ``name`` and as
    ``transcription_factor``; ``gene2weight`` comes from ``aggregate_regulon``
    and ``gene2occurrence`` is left empty. No other Regulon field is passed.

    NOTE(review): the per-run regulons shown earlier in this notebook are named
    like 'AHRR(+)', whereas the aggregated one is named with the bare TF
    symbol -- confirm that downstream steps do not rely on the suffix.
    """
    averaged_weights = aggregate_regulon(regulon)
    return Regulon(
        name=regulon,
        transcription_factor=regulon,
        gene2weight=averaged_weights,
        gene2occurrence={},
    )

In [54]:
# One aggregated regulon per TF that passed the occurrence threshold
regulons = [create_agg_regulon(tf_name) for tf_name in regulons_filter]

In [55]:
# Persist the aggregated regulons as a pickle file (path is relative to the working directory)
with open('agg.regulons.dat', 'wb') as out_file:
    pickle.dump(regulons, out_file)