In [41]:
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import seaborn as sns
from joblib import Parallel, delayed
import scipy
import re
# Seaborn styling applied globally for the notebook.
sns.set_style("whitegrid")
sns.set_context('paper')

# Target dataset tag. It is interpolated into file-name patterns and column names,
# and the notebook has explicit branches for 'TCGA' and 'HMF'.
data_type = 'TCGA'
# Grouping tag interpolated into the aggregated-results and summary CSV file names.
unvariable_grouping = 'median'
# Statistical-test tag interpolated into the pattern of the *_ustat.txt files to collect.
test = 'Mann-Whitney-ls'
# Folder that is walked for per-method result sub-folders and that receives the output CSV.
results_folder = './figures/'

## Import ElasticNet, PRECISE and TRANSACT results (non-variable methods)

In [42]:
# Result folder -> method label for the non-variable methods.
# The './figures/' entry itself carries an empty label.
unvariable_folder_labels = dict([
    ('./figures/', ''),
    ('./figures/baseline_A', 'baseline_A'),
    ('./figures/linear', 'PRECISE'),
    ('./figures/rbf_gamma_0_0005', 'TRANSACT'),
])
# Every non-empty label, in declaration order.
unvariable_methods = [label for label in unvariable_folder_labels.values() if label]

# Variable methods live in a sub-folder named exactly like the method.
variable_methods = ['baseline_B', 'baseline_C']
variable_folder_labels = {'./figures/' + name: name for name in variable_methods}

In [43]:
# Collect the per-drug test statistics written by every non-variable method into one table:
# rows are (GDSC drug, <data_type> drug) pairs, columns are (method, metric) pairs.

# File-name pattern of the statistic files. Raw string so that `\w` reaches the regex engine
# unchanged; the configuration values are escaped and the extension dot is matched literally.
regex_ustat_match = r'\w*predicted_AUC(_n_pv_[0-9]+)*_GDSC_(5-)?\w*_%s_\w*_*[0-9]*_%s_ustat\.txt' % (
    re.escape(data_type), re.escape(test)
)
r_ustat = re.compile(regex_ustat_match)

# Patterns used to pull the two drug names out of a statistic file name (compiled once).
drug_pair_pattern = re.compile(r'GDSC_(5-)?\w*_%s_\w*_%s' % (re.escape(data_type), re.escape(test)))
gdsc_drug_pattern = re.compile(r'GDSC_(5-)?\w*_%s' % re.escape(data_type))
target_drug_pattern = re.compile(r'%s_[A-Za-z]*' % re.escape(data_type))

# Folder -> statistic files, restricted to the labelled method folders (the root is skipped).
# File names are sorted so that the row order does not depend on the file-system listing order.
all_ustat_files = {}
for dirpath, _, filenames in os.walk(results_folder):
    if dirpath == results_folder or dirpath not in unvariable_folder_labels:
        continue
    all_ustat_files[dirpath] = sorted(name for name in filenames if r_ustat.match(name) is not None)

# TCGA files are read without a header row (value in column 1); otherwise the 'PR-PD' column is used.
csv_header = None if data_type == 'TCGA' else 0
value_column = 1 if data_type == 'TCGA' else 'PR-PD'

# Gather plain dictionaries first and build the frame once: `DataFrame.append` no longer exists
# in pandas >= 2.0, and growing a frame row by row is quadratic.
records = {}      # (GDSC_drug, target_drug) -> {(setting, metric): value}
column_keys = {}  # insertion-ordered set of (setting, metric) column keys

for folder, files in all_ustat_files.items():
    setting = unvariable_folder_labels[folder]
    # Guarantee the three metrics needed downstream exist for every method; cells that no file
    # fills stay NaN (the previous `np.empty` pre-fill left uninitialized memory there).
    for metric in ('pval', 'ustat', 'product_samples'):
        column_keys.setdefault((setting, metric))

    for f in files:
        # Read data
        df = pd.read_csv(os.path.join(folder, f), header=csv_header, index_col=0)

        # Extract drug
        drugs = drug_pair_pattern.search(f).group().replace(test, '')
        GDSC_drug = gdsc_drug_pattern.search(drugs).group().replace('GDSC_', '').replace('_%s' % data_type, '')
        target_drug = target_drug_pattern.search(drugs).group().replace('%s_' % data_type, '').replace('_', '')

        drug_index = (GDSC_drug, target_drug)
        row = records.setdefault(drug_index, {})
        for m in df.index:
            column_keys.setdefault((setting, m))
            row[(setting, m)] = df.at[m, value_column]

if not records:
    # Without this guard the failure would be an opaque error from MultiIndex.from_tuples.
    raise FileNotFoundError(
        'No statistic file matching %r was found in the method folders under %r'
        % (regex_ustat_match, results_folder)
    )

global_ustat_df = pd.DataFrame(
    [[row.get(key, np.nan) for key in column_keys] for row in records.values()],
    index=pd.MultiIndex.from_tuples(list(records), names=['GDSC_drug', '%s_drug' % data_type]),
    columns=pd.MultiIndex.from_tuples(list(column_keys)),
)

# AUC of each method: the U statistic divided by the product of the sample counts.
for s in unvariable_methods:
    global_ustat_df[(s, 'AUC')] = global_ustat_df[(s, 'ustat')] / global_ustat_df[(s, 'product_samples')]



In [44]:
# Add the aggregated results of every variable method as extra (method, metric) columns.
# The merge is an inner join on the (GDSC drug, <data_type> drug) index.
file_suffix = '/%s_results_%s_GDSC_MSE.csv'%(data_type, unvariable_grouping)

for folder, method in variable_folder_labels.items():
    aggregated_file = folder + file_suffix
    aggregated_df = pd.read_csv(aggregated_file, index_col=0)

    # Prefix each metric column with the method label.
    aggregated_df.columns = pd.MultiIndex.from_tuples([(method, c) for c in aggregated_df.columns])

    # Rebuild the row labels as (GDSC drug, <data_type> drug) pairs parsed from the original label.
    drug_pairs = []
    for idx in aggregated_df.index:
        gdsc_name = re.search('GDSC_(5-)?[A-Za-z]*', idx).group(0).replace('GDSC_', '')
        target_name = re.search('%s_(5-)?[A-Za-z]*'%(data_type), idx).group(0).replace('%s_'%(data_type), '')
        drug_pairs.append((gdsc_name, target_name))
    aggregated_df.index = pd.MultiIndex.from_tuples(drug_pairs,
                                                    names=['GDSC_drug', '%s_drug'%(data_type)])

    global_ustat_df = global_ustat_df.merge(aggregated_df, left_index=True, right_index=True)

In [45]:
# Save the merged table. The grouping tag comes from `unvariable_grouping` (it was hard-coded
# to 'median'), and the path is joined properly instead of producing a double slash.
global_ustat_df.to_csv(
    os.path.join(results_folder,
                 '%s_ustat_summary_aggregat_%s.csv'%(data_type, unvariable_grouping))
)

## Write the summary in a readable format

In [46]:
# Row order of the readable summary: (GDSC drug, <data_type> drug) pairs per supported dataset.
order_by_data_type = {
    'TCGA': [
        ('Afatinib', 'Trastuzumab'),
        ('Bleomycin', 'Bleomycin'),
        ('Cetuximab', 'Cetuximab'),
        ('Cisplatin', 'Cisplatin'),
        ('Cisplatin', 'Carboplatin'),
        ('Cyclophosphamide', 'Cyclophosphamide'),
        ('Docetaxel', 'Docetaxel'),
        ('Doxorubicin', 'Doxorubicin'),
        ('Etoposide', 'Etoposide'),
        ('5-Fluorouracil', 'Fluorouracil'),
        ('Gemcitabine', 'Gemcitabine'),
        ('Irinotecan', 'Irinotecan'),
        ('Oxaliplatin', 'Oxaliplatin'),
        ('Paclitaxel', 'Paclitaxel'),
        ('Pemetrexed', 'Pemetrexed'),
        ('Temozolomide', 'Temozolomide'),
        ('Vinorelbine', 'Vinorelbine')
    ],
    'HMF': [
        ('Afatinib', 'Trastuzumab'),
        ('Irinotecan', 'Irinotecan'),
        ('Cisplatin', 'Carboplatin'),
        ('5-Fluorouracil', 'Fluorouracil'),
        ('Paclitaxel', 'Paclitaxel'),
        ('Gemcitabine', 'Gemcitabine')
    ],
}

# Fail here for an unsupported dataset: otherwise `order` would stay undefined, or keep a
# stale value from an earlier run of this cell in the same kernel.
if data_type not in order_by_data_type:
    raise ValueError(
        'No drug ordering defined for data_type=%r; expected one of %s'
        % (data_type, sorted(order_by_data_type))
    )
order = order_by_data_type[data_type]

In [47]:
# Build the readable summary: one "<p-value> [<AUC>]" string per drug pair and method.
global_df = global_ustat_df.copy()
methods = unvariable_methods + variable_methods
for m in methods:
    # p-value in scientific notation, followed by the AUC with two decimals in brackets.
    global_df[(m, 'summary')] = global_df[(m, 'pval')].apply(lambda x: '%1.2E'%(x)) + ' ' +\
                                global_df[(m, 'AUC')].apply(lambda x: '[%1.2f]'%(x))

# Keep only the summary columns and put the rows in the order listed in `order`.
global_df = global_df[[(m, 'summary') for m in methods]]
global_df = global_df.loc[order]

# Write next to the other outputs: use `results_folder` rather than a hard-coded './figures/'.
global_df.to_csv(
    os.path.join(results_folder,
                 '%s_results_summary_aggregat_%s.csv'%(data_type, unvariable_grouping))
)