In [1]:
import pandas as pd 
import os
import numpy as np
from tqdm import tqdm
# Move the working directory one level up so that the relative paths used below
# ('./src/...' here, 'results/...' in a later cell) resolve from there.
# NOTE(review): the move is relative to the current directory, so re-running this
# cell without restarting the kernel climbs one more level each time.
os.chdir("./..")
import sys
# Extend the module search path before the `utils` import that follows;
# presumably `utils` lives in this directory -- confirm.
sys.path.append('./src/sena_discrepancy_vae')
from utils import Norman2019DataLoader, Wessel2023HEK293DataLoader
import json

In [2]:
def get_datahandler(dataset_name, batch_size=32):
    """Build the data loader that matches ``dataset_name``.

    Args:
        dataset_name: Dataset identifier. Any name containing ``'Norman2019'``
            selects ``Norman2019DataLoader``; the exact name ``'wessel_hefk293'``
            selects ``Wessel2023HEK293DataLoader``.
        batch_size: Forwarded to the loader's constructor.

    Returns:
        The constructed data loader.

    Raises:
        ValueError: If ``dataset_name`` matches none of the supported datasets.
    """
    if 'Norman2019' in dataset_name:
        return Norman2019DataLoader(batch_size=batch_size)

    # NOTE(review): 'wessel_hefk293' looks like a misspelling of 'hek293'. It is kept
    # verbatim because existing config files may contain this exact string -- confirm.
    if dataset_name == 'wessel_hefk293':
        return Wessel2023HEK293DataLoader(batch_size=batch_size)

    # Fail loudly instead of falling through to an unbound local variable.
    raise ValueError(
        f"Unknown dataset_name {dataset_name!r}: expected a name containing "
        "'Norman2019' or exactly 'wessel_hefk293'."
    )

In [3]:
def generate_differential_activation_scores(df, data_handler, layer='fc1'):
    """Compute per-knockout differential activation scores for one layer.

    For every perturbation label other than ``'ctrl'`` and every column of the
    layer's activity table, the score is the absolute difference between the mean
    activation of the control rows and the mean activation of that knockout's rows.

    Args:
        df: Mapping from layer name to a DataFrame of activations whose index
            holds the perturbation label of each row (``'ctrl'`` for controls).
        data_handler: Object exposing ``gene_go_dict`` and
            ``ensembl_genename_mapping_rev``. Both are only consulted when
            ``layer == 'fc1'``.
        layer: Key of ``df`` to analyse. For ``'fc1'`` the columns are treated as
            gene sets and an extra ``affected`` flag is produced.

    Returns:
        DataFrame with columns ``knockout``, ``geneset``, ``score`` (plus
        ``affected`` for ``'fc1'``). Knockouts appear in order of first appearance
        in the index. The frame is empty, but still has its columns, when no rows
        are produced.

    Raises:
        KeyError: If ``layer`` is not in ``df``, or (for ``'fc1'``) a mapped
            knockout identifier is missing from ``gene_go_dict``.
    """
    is_fc1 = layer == 'fc1'
    column_names = ['knockout', 'geneset', 'score']
    if is_fc1:
        column_names.append('affected')

    #get activity score df of specific layer
    activity_score = df[layer]
    gos = activity_score.columns

    #control cells: their per-column means do not depend on the knockout, compute once
    ctrl_cells = activity_score[activity_score.index == 'ctrl'].to_numpy()
    ctrl_means = [ctrl_cells[:, i].mean() for i in range(len(gos))]

    #perturbation targets in first-appearance order (a set would make row order vary between runs)
    ptb_targets = [x for x in activity_score.index.unique() if x != 'ctrl']

    #get gene go dict
    gene_go_dict = data_handler.gene_go_dict
    genename_ensemble_dict = data_handler.ensembl_genename_mapping_rev

    analysis_df_l = []

    for knockout in tqdm(ptb_targets):

        if is_fc1:
            #some knockouts may not have any targeted gene set
            if knockout not in genename_ensemble_dict:
                continue

            #gene sets (among the columns) the knocked-out gene belongs to, resolved once per knockout
            target_genesets = gene_go_dict[genename_ensemble_dict[knockout]]
            affected_genesets = {go for go in gos if go in target_genesets}

        #knockout cells are the same for every gene set, so select them once
        knockout_cells = activity_score[activity_score.index == knockout].to_numpy()

        for i, geneset in enumerate(gos):

            #compute differential activation score
            score = abs(ctrl_means[i] - knockout_cells[:, i].mean())

            if is_fc1:
                analysis_df_l.append([knockout, geneset, score, geneset in affected_genesets])
            else:
                analysis_df_l.append([knockout, geneset, score])

    #passing the names to the constructor also works when no rows were collected,
    #whereas assigning .columns on an empty frame raises a length-mismatch error
    return pd.DataFrame(analysis_df_l, columns=column_names)

In [4]:
#directory that holds the artifacts of the run under analysis
folder_path = os.path.join('results', 'example')

#activation scores stored by that run (indexed by layer name further down)
activation_scores_path = os.path.join(folder_path, 'activation_scores.pickle')
activation_scores_dict = pd.read_pickle(activation_scores_path)

#the run's config file records the dataset name
config_path = os.path.join(folder_path, 'config.json')
with open(config_path, 'r') as file:
    config_file = json.load(file)
dataset_name = config_file["dataset_name"]

#build the data handler that corresponds to that dataset
data_handler = get_datahandler(dataset_name)

#the scoring function works for any layer; here it is applied to 'fc_mean'
activation_scores = generate_differential_activation_scores(
    activation_scores_dict,
    data_handler,
    layer='fc_mean',
)

100%|██████████| 3/3 [00:00<00:00, 17.41it/s]


In [5]:
#bare last expression of the cell: the notebook renders the resulting table as the cell output
activation_scores

Unnamed: 0,knockout,geneset,score
0,CD46,0,0.000164
1,CD46,1,0.000080
2,CD46,2,0.000030
3,CD46,3,0.000098
4,CD46,4,0.000082
...,...,...,...
310,CD71,100,0.000238
311,CD71,101,0.000285
312,CD71,102,0.000009
313,CD71,103,0.000090
