In [None]:
import os
import pathlib
import numpy as np
import pandas as pd

from joblib import Parallel, delayed


# Root directory of the parsed results. load_results() joins this with a
# TCGA project name and reads every file found directly in that sub-directory.
DATA_PATH = '../cluster_compute/parsed_results/'

def read_results(file: str):
    """Parse a single result file as CSV and hand back the resulting DataFrame.

    Args:
        file: Location of the CSV file; forwarded unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    results = pd.read_csv(file)
    return results

def load_results(tcga_project: str):
    """Load the test result table and all permutation tables of one project.

    Every regular file directly inside ``DATA_PATH/<tcga_project>`` is treated
    as a result file. Files are ordered by the integer found between the last
    underscore and the first following dot of their name (for example
    ``task_12.csv.gz`` -> 12). The file with the smallest index is returned as
    the test table; all remaining files are read in parallel and returned as
    the permutation tables, in index order.

    Args:
        tcga_project: Name of the project sub-directory below ``DATA_PATH``.

    Returns:
        A tuple ``(df, dfs_perm)`` where ``df`` is the DataFrame of the
        lowest-indexed file and ``dfs_perm`` is a list with one DataFrame per
        remaining file (empty if there is only one file).

    Raises:
        FileNotFoundError: If the project directory does not exist or does not
            contain any file.
        ValueError: If a file name does not carry a parseable integer index.
    """
    project_dir = pathlib.Path(DATA_PATH) / tcga_project
    file_names = [f for f in project_dir.iterdir() if f.is_file()]
    if not file_names:
        # Fail with the offending path instead of a bare "list index out of range".
        raise FileNotFoundError(f'No result files found in {project_dir}')

    # Numeric sort so that e.g. task_10 comes after task_2.
    file_names.sort(key=lambda f: int(f.name.split('_')[-1].split('.')[0]))
    test, *permutations = file_names

    df = read_results(test)
    dfs_perm = Parallel(n_jobs=-1)(delayed(read_results)(f) for f in permutations)

    return df, dfs_perm


tcga_projects = ['TCGA-BLCA','TCGA-BRCA', 'TCGA-COAD', 
                 'TCGA-GBM', 'TCGA-HNSC', 'TCGA-KIRC', 
                 'TCGA-LGG', 'TCGA-LIHC', 'TCGA-LUAD', 
                 'TCGA-LUSC', 'TCGA-OV', 'TCGA-SKCM', 
                 'TCGA-STAD']

# Interaction types evaluated for every project. The result columns are named
# '<type>_interaction_score' and '<type>_rmst'.
INTERACTION_TYPES = ['additive', 'competing', 'xor']

# Percentile of the pooled permutation values that serves as the cut-off.
THRESHOLD_PERCENTILE = 99.99


def _pooled_permutation_values(dfs_perms, column_label):
    """Concatenate one column of all permutation tables into a single 1-D array."""
    return np.concatenate([df_perm[column_label].values for df_perm in dfs_perms])


def _exceedance_stats(interaction_type, test_values, perm_values, threshold):
    """Count test and permutation values strictly above ``threshold``.

    Returns a dict with the keys ``<type>``, ``<type>_perm`` and
    ``<type>_threshold``, i.e. the columns written to the per-metric CSV files.
    """
    return {
        f'{interaction_type}': int(np.count_nonzero(test_values > threshold)),
        f'{interaction_type}_perm': int(np.count_nonzero(perm_values > threshold)),
        f'{interaction_type}_threshold': threshold,
    }


interaction_scores_stats = []
rmst_stats = []
top_hits_stats = []

for tcga_project in tcga_projects:
    df_test, dfs_perms = load_results(tcga_project)

    # One summary row per project and output table.
    interaction_scores_top_hits = {'tcga': tcga_project}
    rmst_top_hits = {'tcga': tcga_project}
    top_hits = {'tcga': tcga_project}

    for interaction_type in INTERACTION_TYPES:
        score_column = f'{interaction_type}_interaction_score'
        rmst_column = f'{interaction_type}_rmst'

        # Pool each column once and reuse it for the per-metric and the joint
        # counts below (pooling is the expensive step).
        score_perm_values = _pooled_permutation_values(dfs_perms, score_column)
        rmst_perm_values = _pooled_permutation_values(dfs_perms, rmst_column)
        score_threshold = np.percentile(score_perm_values, THRESHOLD_PERCENTILE)
        rmst_threshold = np.percentile(rmst_perm_values, THRESHOLD_PERCENTILE)

        score_test_values = df_test[score_column].values
        rmst_test_values = df_test[rmst_column].values

        interaction_scores_top_hits.update(
            _exceedance_stats(interaction_type, score_test_values, score_perm_values, score_threshold))
        rmst_top_hits.update(
            _exceedance_stats(interaction_type, rmst_test_values, rmst_perm_values, rmst_threshold))

        # Joint hits: rows that exceed both thresholds. The pooled arrays of the
        # two columns are built from the same tables in the same order, so they
        # are row-aligned and can be combined element-wise.
        joint_test = (score_test_values > score_threshold) & (rmst_test_values > rmst_threshold)
        joint_perm = (score_perm_values > score_threshold) & (rmst_perm_values > rmst_threshold)
        top_hits.update({
            f'{interaction_type}': int(np.count_nonzero(joint_test)),
            f'{interaction_type}_perm': int(np.count_nonzero(joint_perm)),
        })

    interaction_scores_stats.append(interaction_scores_top_hits)
    rmst_stats.append(rmst_top_hits)
    top_hits_stats.append(top_hits)

    print('Finished', tcga_project)


# Make sure the output directory exists so the run does not fail at the very
# end, after all projects have been processed.
os.makedirs('results', exist_ok=True)

pd.DataFrame(interaction_scores_stats).to_csv('results/top_hits_interaction_scores.csv', index=False)
pd.DataFrame(rmst_stats).to_csv('results/top_hits_rmst.csv', index=False)
pd.DataFrame(top_hits_stats).to_csv('results/top_hits_count.csv', index=False)


In [None]:
# Per-project thresholds written by the previous cell; the first CSV column
# ('tcga') becomes the index. 'round_trip' parsing keeps the thresholds exactly
# equal to the values that were written, so the strict '>' comparisons below
# use the same cut-offs as the counts in results/top_hits_count.csv.
df_thresholds_scores = pd.read_csv('results/top_hits_interaction_scores.csv', index_col=0, float_precision='round_trip')
df_thresholds_rmst = pd.read_csv('results/top_hits_rmst.csv', index_col=0, float_precision='round_trip')

# Output directory is the same for every project, so create it once.
os.makedirs('results/top_hits', exist_ok=True)

for tcga_project in tcga_projects:
    thresholds_scores = df_thresholds_scores.loc[tcga_project].to_dict()
    thresholds_rmst = df_thresholds_rmst.loc[tcga_project].to_dict()

    # task_0 is the test result table of the project.
    df = pd.read_csv(os.path.join(DATA_PATH, tcga_project, 'task_0.csv.gz'))

    hit_counts = {}
    for interaction_type in ('additive', 'competing', 'xor'):
        score_column = f'{interaction_type}_interaction_score'
        rmst_column = f'{interaction_type}_rmst'
        threshold_key = f'{interaction_type}_threshold'

        # A feature pair is a hit only if it exceeds both thresholds.
        is_hit = (df[score_column] > thresholds_scores[threshold_key]) & (df[rmst_column] > thresholds_rmst[threshold_key])
        df_hits = df.loc[is_hit, ['feature1', 'feature2', rmst_column, score_column]].sort_values(score_column, ascending=False)

        df_hits.to_csv(f'results/top_hits/{tcga_project}_{interaction_type}.csv', index=False)
        hit_counts[interaction_type] = len(df_hits)

    print(f"{tcga_project}: additive {hit_counts['additive']}, competing {hit_counts['competing']}, xor {hit_counts['xor']}")