# Fig 2C-I : Drug response prediction from cell lines to PDX.
This notebook supports Fig. 2, panels C to I, and corresponds to the prediction of PDX drug response from cell-line drug response.

In [None]:
# All imports are listed in module_import.py
from module_import import *
from src_utils import *

In [None]:
# All data settings are in data_settings.py
from data_settings import *

## Read data

In [None]:
# Read the requested datasets; the result is keyed by dataset identifier.
data_df = read_data(
    tissues=tissues,
    data_types=list(data_types),
    projects=projects,
    data_sources=data_sources,
    folder_basis='../data/',
)

# Keys under which the source and the target datasets are stored in data_df.
source_data_key, target_data_key = reformat_df(data_df, source, target)

# Independent copy, processed separately below for the ComBat baseline.
data_df_combat = deepcopy(data_df)

In [None]:
# Library size normalization.
# Each dataset is TMM-normalized once, and two log-transformed versions are kept:
#   - data_df_combat: log(TMM + 1), without any depth rescaling;
#   - data_df: TMM rescaled so that the mean row sum equals
#     average_depth_global, then log(x + 1).
average_depth_global = 10**5
for ds in tuple(data_df):
    tmm_counts = np.array(
        library_size_normalization.TMM_normalization(data_df[ds].values.astype(float))
    )
    # Mean, over rows, of the row totals.
    average_depths = tmm_counts.sum(axis=1).mean()

    data_df_combat[ds] = pd.DataFrame(
        np.log(tmm_counts + 1),
        columns=data_df_combat[ds].columns,
        index=data_df_combat[ds].index,
    )

    GE_normalized = np.log(tmm_counts / average_depths * average_depth_global + 1)
    data_df[ds] = pd.DataFrame(
        GE_normalized,
        columns=data_df[ds].columns,
        index=data_df[ds].index,
    )

In [None]:
# Reducing genes for ComBat: keep the genes (columns) that rank among the
# number_top_genes most variable ones in BOTH the source and the target data.
number_top_genes = 1700

top_source_variable_genes = (
    pd.DataFrame(np.var(data_df[source_data_key]), columns=['variance'])
    .sort_values('variance', ascending=False)
    .head(number_top_genes)
    .index
)
top_target_variable_genes = (
    pd.DataFrame(np.var(data_df[target_data_key]), columns=['variance'])
    .sort_values('variance', ascending=False)
    .head(number_top_genes)
    .index
)
top_variable_genes = np.intersect1d(top_source_variable_genes, top_target_variable_genes)
print(top_variable_genes.shape)

# Restrict every ComBat dataset to the shared set of highly variable genes.
data_df_combat.update({
    d: data_df_combat[d][top_variable_genes]
    for d in data_df
})

In [None]:
# Scale each dataset independently (centering / standardization are controlled
# by with_mean / with_std) and wrap the scaler output back into a DataFrame
# carrying the original index and columns.
normalized_data_df = {
    ds: pd.DataFrame(
        StandardScaler(with_mean=with_mean, with_std=with_std).fit_transform(data_df[ds]),
        index=data_df[ds].index,
        columns=data_df[ds].columns,
    )
    for ds in data_df
}

### Drug response

In [None]:
# GDSC
# Load the fitted dose-response table of each GDSC release, keyed by release name.
GDSC_drug_response_frames = {}
for x in ['GDSC2', 'GDSC1']:
    GDSC_drug_response_file = '../data/GDSC/response/%s_fitted_dose_response_25Feb20.xlsx'%(x)
    GDSC_drug_response_frames[x] = pd.read_excel(GDSC_drug_response_file)

# Drug names found in at least one release. np.unique is applied to the union
# so that a drug present in both tables is listed only once (concatenating the
# per-release unique lists would repeat it).
unique_drugs = np.unique(np.concatenate([
    frame['DRUG_NAME'].values
    for frame in GDSC_drug_response_frames.values()
]))

In [None]:
# PDX: response table of the PDXE dataset; the first CSV column becomes the index.
PDX_drug_response_df = pd.read_csv(
    filepath_or_buffer='../data/PDXE/response/response.csv',
    index_col=0,
)

## Alignment settings
### Different similarity functions to test

In [None]:
# All experimental settings are in expt_settings.py
from expt_settings import *

### Load drug data

In [None]:
# Potential pairs:
#     ('Erlotinib', 'erlotinib'),
#     ('Cetuximab', 'cetuximab'),
#     ('Gemcitabine', 'gemcitabine-50mpk'),
#     ('Afatinib', 'trastuzumab'),
#     ('Paclitaxel', 'paclitaxel'),
#     ('Trametinib', 'trametinib'),
#     ('Ruxolitinib', 'INC424'),
GDSC_drug_name, PDXE_drug_name = ('Erlotinib', 'erlotinib')

# Output folder for this drug pair. makedirs(exist_ok=True) also creates
# './figures/' when it is missing and does nothing when the folder already
# exists, whereas listing './figures/' first fails on a fresh checkout.
drug_folder_name = './figures/' + 'response_GDSC_%s_PDXE_%s'%(GDSC_drug_name, PDXE_drug_name)
os.makedirs(drug_folder_name, exist_ok=True)


# Features / response pairs built from the scaled data (used by TRANSACT,
# ElasticNet and the uncorrected network).
X_target_response, y_target = read_PDXE_response(PDX_drug_response_df,
                                                 PDXE_drug_name,
                                                 normalized_data_df[target_data_key])
X_source_response, y_source = read_GDSC_response(GDSC_drug_response_frames,
                                                 GDSC_drug_name,
                                                 normalized_data_df[source_data_key])

# Same pairs built from the gene-reduced ComBat data (used by the ComBat network).
X_target_response_combat, y_target_combat = read_PDXE_response(PDX_drug_response_df,
                                                               PDXE_drug_name,
                                                               data_df_combat[target_data_key])
X_source_response_combat, y_source_combat = read_GDSC_response(GDSC_drug_response_frames,
                                                               GDSC_drug_name,
                                                               data_df_combat[source_data_key])

# Folders holding the cross-validation results of each network for this drug.
combat_cv_folder = output_combat_cv_folder + GDSC_drug_name
uncorrected_cv_folder = GDSC_drug_name + ('_centered' if with_mean else '') + ('_standardized' if with_std else '')
uncorrected_cv_folder = output_uncorrected_cv_folder + uncorrected_cv_folder

## Test for various values of similarities and baselines
### Import CV deep network architecture

In [None]:
# Load the stored best parameters of each network from its CV result file,
# then set 'n_input' to the number of columns of the matching source dataset.
uncorrected_param = read_best_param(uncorrected_cv_folder, random_state, 'uncorrected_cv_results.csv')
combat_param = read_best_param(combat_cv_folder, random_state, 'combat_cv_results.csv')

uncorrected_param['n_input'] = len(data_df[source_data_key].columns)
combat_param['n_input'] = len(data_df_combat[source_data_key].columns)

In [None]:
# Uncorrected baseline: a scaler followed by the skorch-wrapped network.
uncorrected_network = Pipeline(steps=[
    ('scaler', StandardScaler(with_mean=with_mean, with_std=with_std)),
    ('regression', make_skorch_network(make_network(uncorrected_param), uncorrected_param)),
])

# ComBat baseline: skorch-wrapped network only, without a scaler step.
combat_network = make_skorch_network(make_network(combat_param), combat_param)

In [None]:
def predict_PDX_spearman_cor(n_jobs=20, verbose=0, return_clf=False):
    """
    Train each model on GDSC, predict on PDXE and score the predictions.

    Every model is fitted on the source (GDSC) response data and applied to the
    target (PDXE) samples; its predictions are then compared to the measured
    'BestAvgResponse' with a Spearman correlation. Three families are run:
        - Domain adaptation: TRANSACT, one model per similarity function listed
          in kernel_surnames / kernel_names.
        - Native scikit-learn: ElasticNet, tuned by 10-fold grid search.
        - Skorch: DL (uncorrected_network) and ComBat+DL (combat_network).

    The data and settings are read from module-level variables
    (normalized_data_df, X_source_response, y_source, X_target_response,
    y_target, their *_combat counterparts, kernel_param, number_pc, n_pv,
    n_interpolation, with_mean, with_std). The module-level uncorrected_network
    and combat_network objects are fitted in place.

    Parameters:
        n_jobs: number of parallel jobs given to TRANSACT and GridSearchCV.
        verbose: verbosity level given to TRANSACT and GridSearchCV.
        return_clf: when True, also return the fitted models.

    Returns:
        target_spearman, a dict mapping each method name to the result of
        scipy.stats.spearmanr (correlation and p-value). When return_clf is
        True, the tuple (target_spearman, classifiers) is returned instead,
        where classifiers maps method names to the fitted models.
    """
    target_spearman = {}
    # Always created: the baselines below store their model unconditionally,
    # which raised a NameError when this dict only existed for return_clf=True.
    classifiers = {}

    for sim_surname, sim_name in zip(kernel_surnames, kernel_names):
        # For each kernel:
        #    - align source and target data with TRANSACT,
        #    - train the predictor on the source response,
        #    - predict on target and save the Spearman correlation.
        print(sim_surname)
        clf = TRANSACT(kernel=sim_name,
                       kernel_params=kernel_param[sim_surname],
                       n_components=number_pc,
                       n_jobs=n_jobs,
                       verbose=verbose)

        clf.fit(normalized_data_df[source_data_key],
                normalized_data_df[target_data_key],
                n_pv=n_pv[sim_surname],
                step=n_interpolation,
                with_interpolation=True)

        clf.fit_predictor(X_source_response, y_source.values.flatten(), l1_ratio=0.)
        y_target_subsample_predicted = clf.predict(X_target_response)
        target_spearman[sim_surname] = scipy.stats.spearmanr(y_target_subsample_predicted,
                                                             y_target['BestAvgResponse'])
        if return_clf:
            # Copy only when the models are requested, to avoid needless work.
            classifiers[sim_surname] = deepcopy(clf)

    # Comparison to baseline: ElasticNet on the uncorrected data.
    print('raw')
    alpha_values = np.logspace(-5,10,16)
    # 11 evenly spaced values from 0.1 to 1.0.
    l1_ratio_values = np.linspace(1,10,11)/10
    param_grid = {
        'regression__alpha': alpha_values,
        'regression__l1_ratio': l1_ratio_values
    }
    grid_raw = GridSearchCV(Pipeline([
                            ('scaler', StandardScaler(with_mean=with_mean, with_std=with_std)),
                            ('regression', ElasticNet())
                            ]),
                            cv=10,
                            n_jobs=n_jobs,
                            param_grid=param_grid,
                            verbose=verbose,
                            scoring='neg_mean_squared_error')
    grid_raw.fit(X_source_response, y_source.values.flatten())
    y_target_subsample_predicted = grid_raw.predict(X_target_response)
    target_spearman['uncorrected_EN'] = scipy.stats.spearmanr(y_target_subsample_predicted,
                                                              y_target['BestAvgResponse'])
    classifiers['raw'] = grid_raw

    # Neural network without correction
    print('Neural network uncorrected')
    uncorrected_network.fit(X_source_response.values.astype(np.float32), y_source.values.astype(np.float32))
    y_target_subsample_predicted = uncorrected_network.predict(X_target_response.values.astype(np.float32)).flatten()
    target_spearman['uncorrected_network'] = scipy.stats.spearmanr(y_target_subsample_predicted,
                                                                   y_target['BestAvgResponse'].values.flatten())
    classifiers['uncorrected_network'] = uncorrected_network

    # Neural network with ComBat correction
    print('Neural network with ComBat')
    combat_network.fit(X_source_response_combat.values.astype(np.float32),
                       y_source_combat.values.astype(np.float32))
    y_target_subsample_predicted = combat_network.predict(X_target_response_combat.values.astype(np.float32)).flatten()
    # Scored against y_target_combat, the response returned together with
    # X_target_response_combat, so that predictions and labels stay paired.
    target_spearman['combat_network'] = scipy.stats.spearmanr(y_target_subsample_predicted,
                                                              y_target_combat['BestAvgResponse'].values.flatten())
    classifiers['combat_network'] = combat_network

    if return_clf:
        return target_spearman, classifiers
    return target_spearman

In [None]:
# Run the whole comparison, then store the correlations under a random
# 8-character identifier.
n_jobs = 30

correlations_per_sim, clfs = predict_PDX_spearman_cor(return_clf=True, verbose=0, n_jobs=n_jobs)
# First 8 hexadecimal characters of a random UUID.
saving_id = uuid.uuid4().hex[:8]
# NOTE(review): the file is written by dump(), so despite its '.csv' extension
# it is presumably a serialized object rather than text — confirm.
dump(correlations_per_sim, '%s/prediction_%s.csv'%(drug_folder_name, saving_id))

In [None]:
# Look for saved prediction files in the folder of the current drug pair.
potential_file = [p for p in os.listdir(drug_folder_name) if 'prediction' in p]
if len(potential_file) == 1:
    # Unambiguous: select the single saved result.
    file = potential_file[0]
elif not potential_file:
    # Reported separately from the ambiguous case, which it used to be mistaken for.
    print('NO PREDICTION FILE FOUND')
else:
    print('MORE THAN ONE FILE')
    print(potential_file)

In [None]:
# Result file to plot (hard-coded for this drug pair).
file = 'prediction_122e8b39.csv'
# Identifier between '_' and the extension; the dot is escaped so that only a
# literal '.csv' matches.
saving_id = re.search(r'_([0-9a-z]*)\.csv', file).group(1)
# The context manager closes the file handle once the results are loaded.
with open(drug_folder_name + '/' + file, 'rb') as prediction_handle:
    correlations_per_sim = load(prediction_handle)
del file

## Plot results

In [None]:
# One row per method (the keys of correlations_per_sim) and one column per
# entry of the stored result: correlation, then p-value.
to_plot_df = pd.DataFrame(correlations_per_sim).T
to_plot_df.columns = ['cor', 'p-val']
# Put the rows in the order given by `order`, then relabel them with `labels`.
to_plot_df = to_plot_df.loc[order]
to_plot_df.index = labels

In [None]:
# Y axis: ticks every 0.1 from 0.0 to 0.7.
yticks = np.arange(8) / 10
yticks_labels = list(map(str, yticks))
# Bar colors: gray for the first 4 bars, olive for up to 20 further bars.
colors = (
    [mpl.colors.TABLEAU_COLORS['tab:gray']] * 4
    + [mpl.colors.TABLEAU_COLORS['tab:olive']] * 20
)

# Bar plot of the Spearman correlation of each method, in the order of `labels`.
plt.figure(figsize=(8,9))
bplot = sns.barplot(
    data=to_plot_df.reset_index(),
    x='index',
    y='cor',
    order=labels,
    palette=colors,
    alpha=1.,
)

# Axis cosmetics.
plt.xlabel(None)
plt.xticks(fontsize=25, color='black', rotation=90, fontproperties=prop_label)
plt.ylim(0,0.7)
plt.yticks(yticks, yticks_labels, fontsize=25, fontproperties=prop_ticks, color='black')
plt.ylabel('Spearman correlation on PDXs', fontsize=25, color='black', fontproperties=prop_label)
plt.tight_layout()

# Save next to the prediction file, tagged with the same identifier.
plt.savefig('%s/results_%s.png'%(drug_folder_name, saving_id), dpi=300)