# Plot ROC AUC curves for TCGA
In this notebook, we compute the ROC curves of PRECISE, ComBat+DL, DL and TRANSACT for the TCGA drugs. The ROC curves are saved as figures, and the ROC AUCs are saved for later processing.

In [None]:
from tcga_imports import *

## Parameters

In [None]:
from tcga_settings import *

In [None]:
# General data folder
# Figures for this run go to ./figures/<kernel_surname>.
figure_folder = './figures/'
kernel_subfolder = figure_folder + kernel_surname

# Test the full path directly instead of listing figure_folder: listing raises
# when './figures/' does not exist yet and never matches a nested kernel_surname.
if os.path.isdir(kernel_subfolder):
    # The folder already exists, so figures from a previous run may be overwritten.
    print('BEWARE: ALREADY COMPUTATION IN FIGURE FILE')
else:
    # makedirs also creates './figures/' itself when it is missing.
    os.makedirs(kernel_subfolder)

## Read data

In [None]:
# Collect the read_data arguments in one place; data_types is passed as a fresh list.
read_data_kwargs = {
    'tissues': tissues,
    'data_types': list(data_types),
    'projects': projects,
    'data_sources': data_sources,
    'folder_basis': '../data/',
}
data_df = read_data(**read_data_kwargs)

# reformat_df returns the two keys used below to index data_df for the source
# and the target dataset.
source_data_key, target_data_key = reformat_df(data_df, source, target)

# Removing the healthy samples
# Flag target samples whose index label contains '-10A-' or '-11A-'.
# A non-capturing group is used so that pandas does not warn about unused
# match groups in str.contains; the set of matched labels is unchanged.
healthy_samples_index = data_df[target_data_key].index.str.contains(r'-(?:10A|11A)-')
# Keep only the rows that were not flagged.
data_df[target_data_key] = data_df[target_data_key].loc[~healthy_samples_index]

# Library size correction
# Every dataset is TMM-normalised, rescaled so that the mean of the axis-1 sums
# equals average_depth_global, and finally log(x + 1)-transformed.
average_depth_global = 10**5
for dataset_key in list(data_df.keys()):
    raw_frame = data_df[dataset_key]
    tmm_values = np.array(
        library_size_normalization.TMM_normalization(raw_frame.values.astype(float))
    )
    mean_axis1_total = np.mean(np.sum(tmm_values, 1))
    rescaled_values = tmm_values / mean_axis1_total * average_depth_global
    # Rebuild a DataFrame carrying the original row and column labels.
    data_df[dataset_key] = pd.DataFrame(np.log(rescaled_values + 1),
                                        columns=raw_frame.columns,
                                        index=raw_frame.index)


# Normalize data
# Each dataset gets its own StandardScaler (configured by with_mean / with_std);
# the scaled values are wrapped back into a DataFrame with the original labels.
normalized_data_df = {}
for dataset_key, frame in data_df.items():
    scaler = StandardScaler(with_mean=with_mean, with_std=with_std)
    normalized_data_df[dataset_key] = pd.DataFrame(scaler.fit_transform(frame),
                                                   index=frame.index,
                                                   columns=frame.columns)

### Response data

In [None]:
# Load the fitted dose-response table of each GDSC release (GDSC2 first, then
# GDSC1) and collect the drug names found in each of them.
GDSC_drug_response_frames = {}
drug_names_per_release = []
for release in ['GDSC2', 'GDSC1']:
    GDSC_drug_response_file = '../data/GDSC/response/%s_fitted_dose_response_25Feb20.xlsx'%(release)
    GDSC_drug_response_frames[release] = pd.read_excel(GDSC_drug_response_file)
    drug_names_per_release.append(
        np.unique(GDSC_drug_response_frames[release]['DRUG_NAME'])
    )

# NOTE(review): names are unique within each release only; a drug present in
# both releases appears twice in unique_drugs.
unique_drugs = np.concatenate(drug_names_per_release)

# TCGA
TCGA_drug_response_file = '../data/TCGA/response/response.csv'

## Alignment
### TRANSACT

In [None]:
# TRANSACT instance using the kernel, kernel parameters and number of
# components taken from the notebook settings.
TRANSACT_clf = TRANSACT(
    kernel=kernel_name,
    kernel_params=kernel_param,
    n_components=number_pc,
    n_jobs=20,
    verbose=10,
)

# Fit on the normalised source and target datasets, with interpolation enabled.
transact_fit_options = {
    'n_pv': n_pv,
    'step': n_interpolation,
    'with_interpolation': True,
}
TRANSACT_clf.fit(
    normalized_data_df[source_data_key],
    normalized_data_df[target_data_key],
    **transact_fit_options
)

### PRECISE

In [None]:
# PRECISE is run through the same TRANSACT class, with a linear kernel and no
# kernel parameters.
PRECISE_clf = TRANSACT(
    kernel='linear',
    kernel_params={},
    n_components=number_pc,
    n_jobs=20,
    verbose=10,
)

# Same datasets as for TRANSACT; no `step` argument is passed here.
precise_fit_options = {
    'n_pv': n_pv,
    'with_interpolation': True,
}
PRECISE_clf.fit(
    normalized_data_df[source_data_key],
    normalized_data_df[target_data_key],
    **precise_fit_options
)

## Read drug response

In [None]:
# Candidate drug pairs. Each entry is unpacked below as
# (GDSC_drug_name, GDSC_drug_id, TCGA_drug_name).
# NOTE(review): GDSC_drug_id is None for most entries; presumably None means
# "do not filter on a specific GDSC drug ID" -- confirm in read_GDSC_response.
drug_list =[
    ('Cisplatin', None, 'Cisplatin'),
    ('Cisplatin', None, 'Carboplatin'),
    ('Oxaliplatin', 1806, 'Oxaliplatin'),
    ('Afatinib', None, 'Trastuzumab'),
    ('Gemcitabine', None, 'Gemcitabine'),
    ('Paclitaxel', None, 'Paclitaxel'),
    ('Vinorelbine', None, 'Vinorelbine'),
    ('5-Fluorouracil', None, 'Fluorouracil'),
    ('Temozolomide', None, 'Temozolomide'),
    ('Doxorubicin', 133, 'Doxorubicin'),
    ('Docetaxel', 1819, 'Docetaxel'),
    ('Cyclophosphamide', None, 'Cyclophosphamide'),
    ('Etoposide', None, 'Etoposide'),
    ('Bleomycin', None, 'Bleomycin'),
    ('Pemetrexed', None, 'Pemetrexed'),
    ('Irinotecan', None, 'Irinotecan'),
    ('Cetuximab', None, 'Cetuximab'),
]

# Select the drug pair to process: the index is hard-coded to the first entry.
GDSC_drug_name, GDSC_drug_id, TCGA_drug_name = drug_list[0]
# Bare expression: shows the selected GDSC drug name as the cell output.
GDSC_drug_name

In [None]:
# Source side: the helper receives a copy of the normalised source data, so the
# stored frame is not handed over directly.
source_features = normalized_data_df[source_data_key].copy()
X_source, y_source = read_GDSC_response(
    GDSC_drug_response_frames, GDSC_drug_name, source_features, GDSC_drug_id
)

# Target side: same pattern with the normalised target data and the TCGA
# response file.
target_features = normalized_data_df[target_data_key].copy()
X_target, y_target = read_TCGA_response(
    TCGA_drug_name, target_features, TCGA_drug_response_file
)

## Compute predictions

### Domain adaptation methods

In [None]:
# Train predictor using TRANSACT consensus features.
# The source response is passed as a flat array (y_source.values.flatten()).
TRANSACT_clf.fit_predictor(X_source, y_source.values.flatten())

In [None]:
# Train predictor using PRECISE consensus features.
# The source response is passed as a flat array (y_source.values.flatten()).
PRECISE_clf.fit_predictor(X_source, y_source.values.flatten())

In [None]:
# Predict value
def predict_tcga(clf, X=None, y=None):
    """Predict the response of the target samples and attach it to their labels.

    Parameters
    ----------
    clf
        Fitted object exposing ``predict``.
    X
        Features handed to ``clf.predict``. Defaults to the notebook-level
        ``X_target`` when omitted.
    y
        DataFrame with a ``measure_of_response`` column. Defaults to the
        notebook-level ``y_target`` when omitted. It is copied, never modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``y`` with three columns set: ``predicted`` (predictions cast
        to float), ``RECIST`` (the untouched response labels) and
        ``measure_of_response`` (labels merged into 'Responder' /
        'Non Responder'; any other label is left as is).
    """
    # Fall back on the notebook globals so existing one-argument calls still work.
    if X is None:
        X = X_target
    if y is None:
        y = y_target

    predictions = clf.predict(X)
    y_t = y.copy()
    y_t['predicted'] = np.asarray(predictions).astype(float)
    # Keep the fine-grained labels before they are merged below.
    y_t['RECIST'] = y_t['measure_of_response']

    # Merge response data
    response_groups = {
        'Clinical Progressive Disease': 'Non Responder',
        'Stable Disease': 'Non Responder',
        'Partial Response': 'Responder',
        'Complete Response': 'Responder',
    }
    y_t['measure_of_response'] = y_t['measure_of_response'].replace(response_groups)

    return y_t

# Apply predict_tcga to both predictors; each result is a labelled frame with
# the 'predicted', 'RECIST' and merged 'measure_of_response' columns.
y_target_transact_predicted = predict_tcga(TRANSACT_clf)
y_target_precise_predicted = predict_tcga(PRECISE_clf)

### ComBat + DL methods

In [None]:
from tcga_dl_imports import read_dl_results

In [None]:
type_agg = 'median'

# Both baselines store their results in a sub-folder named after the drug pair.
drug_pair_subfolder = 'GDSC_%s_TCGA_%s/'%(GDSC_drug_name, TCGA_drug_name)

# ComBat + DL results are read from baseline_C.
output_folder = './output/baseline_C/' + drug_pair_subfolder
combat_dl_prediction, combat_dl_rank_scores = read_dl_results(output_folder)

# DL results are read from baseline_B.
output_folder = './output/baseline_B/' + drug_pair_subfolder
dl_prediction, dl_rank_scores = read_dl_results(output_folder)

## Compute ROC AUCs

In [None]:
from roc_auc_imports import compute_ROC_curve, compute_significance

In [None]:
# ROC curve and significance for each method, computed in the order
# TRANSACT, ComBat + DL, DL, PRECISE.
tr_fpr, tr_tpr, tr_thresholds, tr_AUC = compute_ROC_curve(y_target_transact_predicted)
transact_sign = compute_significance(y_target_transact_predicted)

combat_dl_fpr, combat_dl_tpr, combat_dl_thresholds, combat_dl_AUC = compute_ROC_curve(combat_dl_prediction)
combat_dl_sign = compute_significance(combat_dl_prediction)

dl_fpr, dl_tpr, dl_thresholds, dl_AUC = compute_ROC_curve(dl_prediction)
dl_sign = compute_significance(dl_prediction)

precise_fpr, precise_tpr, precise_thresholds, precise_AUC = compute_ROC_curve(y_target_precise_predicted)
precise_sign = compute_significance(y_target_precise_predicted)

# Summary of the four AUCs.
print('PERFORMANCE: \n TRANSACT \t %s'%(tr_AUC))
print(' ComBat + DL \t %s'%(combat_dl_AUC))
print(' PRECISE \t %s'%(precise_AUC))
print(' DL \t \t %s'%(dl_AUC))

# Store the significance results under the key of the current drug pair.
# NOTE(review): the ROC_*_p_val dictionaries are not created in this cell;
# presumably they are initialised elsewhere -- confirm before a fresh run.
drug_pair_key = 'GDSC_%s_TCGA_%s'%(GDSC_drug_name, TCGA_drug_name)
for p_val_store, significance in (
    (ROC_combat_dl_p_val, combat_dl_sign),
    (ROC_transact_p_val, transact_sign),
    (ROC_precise_p_val, precise_sign),
    (ROC_dl_p_val, dl_sign),
):
    p_val_store[drug_pair_key] = pd.DataFrame(significance)

## Compute CIs using pROC

In [None]:
# Binary outcome handed to R: 'Non Responder' becomes 1, 'Responder' becomes 0.
# NOTE(review): y_target_df is not assigned in this cell; presumably it holds
# the merged 'Responder' / 'Non Responder' labels -- confirm where it is defined.
binary_response = y_target_df['measure_of_response'].replace('Non Responder', 1).replace('Responder', 0).values
robjects.r.assign("response", binary_response)

# One R vector of predicted values per method.
for r_variable, prediction_frame in (
    ("transact_predict", y_target_transact_predicted),
    ("precise_predict", y_target_precise_predicted),
    ("combat_dl_predict", combat_dl_prediction),
    ("dl_predict", dl_prediction),
):
    robjects.r.assign(r_variable, prediction_frame['predicted'].values)

<b>EXPLANATION NOTE:</b> Here, for each method, we compute the ROC AUC together with its bootstrap confidence interval using the R package pROC. We interface our code with R using rpy2.

In [None]:
# R graphics device package, used to redirect the R plots to a PNG file.
grdevices = importr('grDevices')

# Open a 512x512 PNG in the figure folder of this run, named after the drug pair.
grdevices.png(file="%s/R_ROC_combined_GDSC_%s_TCGA_%s.png"%(kernel_subfolder, GDSC_drug_name, TCGA_drug_name),
              width=512, height=512)
# The R snippet below builds one roc object per method from the variables
# assigned to the R session (response, *_predict), computes ci.se for each and
# plots them. The pROC_*_obj objects stay in the R session after this call.
# Only the TRANSACT, ComBat + DL and PRECISE intervals are drawn with
# type="shape"; DL gets the type="bars" plot only.
# NOTE(review): roc / ci.se are not loaded in this cell; presumably pROC is
# attached to the R session elsewhere -- confirm.
robjects.r('''
    pROC_transact_obj <- roc(response, transact_predict,smoothed = FALSE,
                            ci=TRUE, ci.alpha=0.95, stratified=FALSE,direction="<",
                            plot=TRUE, auc.polygon=TRUE, max.auc.polygon=TRUE, grid=TRUE,
                            print.auc=TRUE, show.thres=TRUE,cex.lab=2.0, cex.axis=2.0, cex.main=2.0, cex.sub=2.0)
    pROC_combat_dl_obj <- roc(response, combat_dl_predict,smoothed = FALSE,direction="<",
                            ci=TRUE, ci.alpha=0.95, stratified=FALSE,
                            plot=TRUE, auc.polygon=TRUE, max.auc.polygon=TRUE, grid=TRUE,
                            print.auc=TRUE, show.thres=TRUE,cex.lab=2.0, cex.axis=2.0, cex.main=2.0, cex.sub=2.0)
    pROC_precise_obj <- roc(response,precise_predict,smoothed = FALSE,direction="<",
                            ci=TRUE, ci.alpha=0.95, stratified=FALSE,
                            plot=TRUE, auc.polygon=TRUE, max.auc.polygon=TRUE, grid=TRUE,
                            print.auc=TRUE, show.thres=TRUE,cex.lab=2.0, cex.axis=2.0, cex.main=2.0, cex.sub=2.0)
    pROC_dl_obj <- roc(response, dl_predict,smoothed = FALSE,direction="<",
                        ci=TRUE, ci.alpha=0.95, stratified=FALSE,
                        plot=TRUE, auc.polygon=TRUE, max.auc.polygon=TRUE, grid=TRUE,
                        print.auc=TRUE, show.thres=TRUE,cex.lab=2.0, cex.axis=2.0, cex.main=2.0, cex.sub=2.0)
                    
    sens_transact.ci <- ci.se(pROC_transact_obj)
    sens_combat_dl.ci <- ci.se(pROC_combat_dl_obj)
    sens_precise.ci <- ci.se(pROC_precise_obj)
    sens_dl.ci <- ci.se(pROC_dl_obj)
    
    plot(sens_transact.ci, type="bars")
    plot(sens_combat_dl.ci, type="bars")
    plot(sens_precise.ci, type="bars")
    plot(sens_dl.ci, type="bars")

    ## Make new color using input color as base and alpha set by transparency
    rgb.val <- col2rgb("blue")
    transact.col <- rgb(rgb.val[1], rgb.val[2], rgb.val[3],
                     max = 255,
                     alpha = 70)
                 
    rgb.val <- col2rgb("orange")
    combat_dl.col <- rgb(rgb.val[1], rgb.val[2], rgb.val[3],
                         max = 255,
                         alpha = 70)
    
    rgb.val <- col2rgb("grey")
    precise.col <- rgb(rgb.val[1], rgb.val[2], rgb.val[3],
                     max = 255,
                     alpha = 70)
                 
    plot(sens_transact.ci, type="shape", col=transact.col)
    plot(sens_combat_dl.ci, type="shape", col=combat_dl.col)
    plot(sens_precise.ci, type="shape", col=precise.col)
''')
# Close the PNG device so that the figure is written to disk.
grdevices.dev_off()

# Paired, one-sided ('greater') bootstrap comparison of the TRANSACT and
# ComBat + DL predictors, run inside the R session with roc.test.
# NOTE(review): the result is only stored in the R variable auc_test; it is not
# read back into Python in this cell.
robjects.r('''
    auc_test <- roc.test(response=response,
                        predictor1=transact_predict,
                        predictor2=combat_dl_predict,
                        alternative='greater',
                        paired=TRUE,
                        boot.stratified=TRUE, 
                        method="bootstrap")
''')

# Bootstrap confidence interval of the AUC for each method, computed in R on
# the roc objects living in the R session and stored under the drug-pair key.
# The calls are issued in the order ComBat + DL, DL, TRANSACT, PRECISE.
# NOTE(review): the ROC_*_ci dictionaries are not created in this cell;
# presumably they are initialised elsewhere -- confirm before a fresh run.
drug_pair_key = 'GDSC_%s_TCGA_%s'%(GDSC_drug_name, TCGA_drug_name)
for ci_store, roc_object_name in (
    (ROC_combat_dl_ci, 'pROC_combat_dl_obj'),
    (ROC_dl_ci, 'pROC_dl_obj'),
    (ROC_transact_ci, 'pROC_transact_obj'),
    (ROC_precise_ci, 'pROC_precise_obj'),
):
    ci_store[drug_pair_key] = np.array(
        robjects.r('''ci.auc(%s, conf.level=0.95, method='b')'''%(roc_object_name))
    )

# Number of responders and non-responders (in that order) for the drug pair.
# NOTE(review): y_target_df and response_size are not assigned in this cell;
# presumably they are defined elsewhere -- confirm before a fresh run.
response_labels = y_target_df['measure_of_response']
response_size['GDSC_%s_TCGA_%s'%(GDSC_drug_name, TCGA_drug_name)] = [
    np.sum(response_labels == label) for label in ('Responder', 'Non Responder')
]

## Save results
Once the ROC AUCs have been computed for the desired drugs, you can save all results using the cells below.

### Confidence intervals

In [None]:
def _with_header(values_by_drug, header, sub_headers):
    """Return one row per drug, with two-level columns (header, sub_header)."""
    frame = pd.DataFrame(values_by_drug).T
    frame.columns = pd.MultiIndex.from_tuples([(header, sub) for sub in sub_headers])
    return frame


# The three values stored per drug are labelled min / median / max.
ci_bounds = ['min', 'median', 'max']

ROC_CI_combat_dl_df = _with_header(ROC_combat_dl_ci, 'ComBat + Deep Learning', ci_bounds)
ROC_CI_dl_df = _with_header(ROC_dl_ci, 'Deep Learning', ci_bounds)
ROC_CI_transact_df = _with_header(ROC_transact_ci, 'TRANSACT', ci_bounds)
ROC_CI_precise_df = _with_header(ROC_precise_ci, 'PRECISE', ci_bounds)

# Responder / non-responder counts per drug pair.
response_size_df = _with_header(response_size, 'size', ['Responders', 'Non Responders'])

# Side-by-side table of all methods plus the group sizes, written to the
# figure folder of this run.
ROC_CI_df = pd.concat(
    [
        ROC_CI_combat_dl_df,
        ROC_CI_transact_df,
        ROC_CI_precise_df,
        ROC_CI_dl_df,
        response_size_df,
    ],
    axis=1,
)
ROC_CI_df.to_csv('%s/bootstrap_CI_TCGA.csv'%(kernel_subfolder))

### AUC and p-value (associated to Mann-Whitney test)

In [None]:
# Stack the per-drug significance frames of every method; the outer index
# level is the method name, the next one the drug-pair key.
p_val_by_method = {
    'ComBat + Deep Learning': ROC_combat_dl_p_val,
    'Deep Learning': ROC_dl_p_val,
    'TRANSACT': ROC_transact_p_val,
    'PRECISE': ROC_precise_p_val,
}
ROC_pval_df = pd.concat(
    {method: pd.concat(per_drug) for method, per_drug in p_val_by_method.items()}
)
ROC_pval_df.to_csv('%s/p_val_TCGA.csv'%(kernel_subfolder))

In [None]:
# Flatten the index into columns and keep only the rows whose innermost index
# level is 'label'.
flat_pval_df = ROC_pval_df.reset_index()
flat_pval_df = flat_pval_df.loc[flat_pval_df['level_2'] == 'label']

# Rename the five columns, then drop the former innermost level ('x').
flat_pval_df.columns = ['method', 'drug', 'x', 'one-sided', 'two-sided']
# ROC_pval_df is overwritten: it no longer has a 'level_2' index level after
# this cell.
ROC_pval_df = flat_pval_df.drop(columns='x')

processed_csv_path = '%s/p_val_TCGA_processed.csv'%(kernel_subfolder)
ROC_pval_df.sort_values('drug').to_csv(processed_csv_path)