# Figure 5: Analysis of Afatinib, Gefitinib, Paclitaxel and Gemcitabine TRANSACT predictors
This notebook supports the four panels of Figure 5.

In [None]:
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from joblib import Parallel, delayed
import scipy
from datetime import date
from adjustText import adjust_text
import ast
import uuid
# Global seaborn styling applied to every figure drawn afterwards.
sns.set_style("whitegrid")
sns.set_context('paper')

import matplotlib
from matplotlib import font_manager as fm, rcParams
# rcParams["datapath"] was deprecated and later removed from Matplotlib, so
# indexing it raises KeyError on current releases; get_data_path() returns the
# same directory.
# NOTE(review): assumes an arial.ttf file is present in that fonts/ttf folder
# -- confirm on the machine running the notebook.
fpath = os.path.join(matplotlib.get_data_path(), "fonts/ttf/arial.ttf")
# Font used for axis labels (size 30).
prop_label = fm.FontProperties(fname=fpath)
prop_label.set_size(30)
# Font used for tick labels (size 25).
prop_ticks = fm.FontProperties(fname=fpath)
prop_ticks.set_size(25)
# File name of the font, without its directory.
fname = os.path.split(fpath)[1]

from sklearn.metrics.pairwise import kernel_metrics
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import Ridge, ElasticNet, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from joblib import dump, load, Parallel, delayed
from statannot.statannot import add_stat_annotation
import umap
from pickle import load, dump

sys.path.insert(0, '../read_data/')
from read_data import read_data
from read_GDSC_response import read_GDSC_response
from read_TCGA_response import read_TCGA_response
import library_size_normalization

from transact.TRANSACT import TRANSACT

In [None]:
# Tissue selection per data source; forwarded to read_data.
tissues = {
    'GDSC': ['All'],
    'TCGA': ['TCGA']
}
# Project selection per data source; forwarded to read_data.
projects = {
    'GDSC': [None],
    'TCGA': ['all']
}

# Data sources to load; forwarded to read_data.
data_sources = ['GDSC', 'TCGA']

# Data types to load; forwarded to read_data.
data_types = ['rnaseq']
# Only recorded in params.txt within this notebook.
genes_filtering = 'mini_cancer'

# Substrings used to pick the source and target entries of data_df.
source = 'GDSC'
target = 'TCGA'

# NOTE(review): not referenced by the cells visible in this notebook -- confirm it is still needed.
test = 'Mann-Whitney-ls'

# Options passed to StandardScaler when standardising each dataset.
with_mean = True
with_std = True

# Root folder for figures and exported tables.
figure_folder = './figures/'

# Drug analysed; names are passed to read_GDSC_response / read_TCGA_response.
GDSC_drug_name = 'Gemcitabine'
GDSC_drug_id = None
TCGA_drug_name = 'Gemcitabine'

## Import data

### Read data

In [None]:
# Load every requested dataset; data_types is passed as a fresh list so the
# module-level configuration list is not shared with the reader.
data_df = read_data(
    folder_basis='../data/',
    data_sources=data_sources,
    projects=projects,
    data_types=list(data_types),
    tissues=tissues,
)

### Reformat data

In [None]:
# Flatten the two-level mapping returned by read_data: each data source holds
# exactly one data type, so it is re-keyed as '<source>_<data type>'.
for dataset in list(data_df):
    per_type = data_df[dataset]
    assert len(per_type.keys()) == 1
    only_type = list(per_type.keys())[0]
    new_key = ('%s_%s'%(dataset, only_type)).replace('fpkm', 'tpm')
    data_df[new_key] = per_type[only_type]
    print(new_key, data_df[new_key].shape)
    del data_df[dataset]

# Exactly one key must mention the source name ...
source_matches = [key for key in data_df if source in key]
assert len(source_matches) == 1
source_data_key = np.unique(source_matches)[0]

# ... and exactly one must mention the target name.
target_matches = [key for key in data_df if target in key]
assert len(target_matches) == 1
target_data_key = np.unique(target_matches)[0]

### Removing the healthy samples

In [None]:
# Rows whose index contains '-10A-' or '-11A-' are the healthy samples this
# section removes. The group is non-capturing: a capturing group makes pandas
# emit a "pattern has match groups" UserWarning without changing the matches.
healthy_samples_index = data_df[target_data_key].index.str.contains(r'-(?:10A|11A)-')
data_df[target_data_key] = data_df[target_data_key].loc[~healthy_samples_index]

### Library size correction

In [None]:
# Depth every dataset is rescaled to after TMM normalisation.
average_depth_global = 10**5
for ds in list(data_df.keys()):
    frame = data_df[ds]
    tmm_counts = np.array(library_size_normalization.TMM_normalization(frame.values.astype(float)))
    # Mean over samples of the per-sample total (sum along axis 1).
    mean_depth = np.mean(np.sum(tmm_counts, 1))
    rescaled = tmm_counts / mean_depth * average_depth_global
    # log(x + 1) transform, then restore the original row/column labels.
    data_df[ds] = pd.DataFrame(np.log(np.array(rescaled)+1),
                               columns=frame.columns,
                               index=frame.index)
    # Drop the temporaries so they do not linger between iterations.
    del frame, tmm_counts, rescaled

### Normalize data

In [None]:
# Standardise each dataset independently with its own freshly fitted scaler.
normalized_data_df = {}
for ds, frame in data_df.items():
    scaler = StandardScaler(with_mean=with_mean, with_std=with_std)
    normalized_data_df[ds] = scaler.fit_transform(frame)

## TRANSACT

### Kernel params

In [None]:
# Kernel used by TRANSACT and its hyper-parameters.
kernel_surname = 'TRANSACT'
kernel_name = 'rbf'
kernel_param = {
    'gamma': 0.0005
}
kernel = kernel_metrics()[kernel_name]

# Number of principal components kept for each domain.
number_pc = {
    'source': 70,
    'target': 150
}

# Make one subfolder for the kernel analysis
kernel_subfolder = '%s_%s'%(kernel_surname,
                           '_'.join(['%s_%s'%(ds, number_pc[ds]) for ds in number_pc]))
kernel_subfolder = os.path.join(figure_folder, kernel_subfolder)
if os.path.isdir(kernel_subfolder):
    print('BEWARE: ALREADY COMPUTATION IN FIGURE FILE')
# exist_ok also creates figure_folder itself when it is missing; listing a
# non-existent figure_folder used to raise FileNotFoundError here.
os.makedirs(kernel_subfolder, exist_ok=True)
# Trailing separator kept because later cells concatenate file names onto it.
kernel_subfolder = kernel_subfolder + '/'

In [None]:
# Record the run configuration next to the figures it produced.
run_parameters = {
    'source': source,
    'target': target,
    'tissues': tissues,
    'genes_filtering': genes_filtering,
    'kernel_params': kernel_param,
    'number_pc': number_pc
}
with open(kernel_subfolder + 'params.txt', 'w') as param_file:
    param_file.write(str(run_parameters))

### Alignment

In [None]:
# Compute principal vectors
# Instantiate TRANSACT with the kernel settings and per-domain component
# counts defined above; `fit` is called separately in the next cell.
TRANSACT_clf = TRANSACT(kernel=kernel_name,
                        kernel_params=kernel_param,
                        n_components=number_pc,
                        n_jobs=30,
                        verbose=10)

In [None]:
# Final instance
# n_pv is forwarded as `n_pv` and n_interpolation as `step` to TRANSACT.fit,
# which is run on the standardised source and target matrices.
n_pv = 30
n_interpolation = 100

TRANSACT_clf.fit(normalized_data_df[source_data_key],
                 normalized_data_df[target_data_key],  
                 n_pv=n_pv,
                 step=n_interpolation,
                 with_interpolation=True)

In [None]:
# Project both standardised datasets with the fitted TRANSACT instance and
# label the rows with the sample index of the corresponding expression frame.
source_consensus_features = pd.DataFrame(
    TRANSACT_clf.transform(normalized_data_df[source_data_key]),
    index=data_df[source_data_key].index,
)
target_consensus_features = pd.DataFrame(
    TRANSACT_clf.transform(normalized_data_df[target_data_key]),
    index=data_df[target_data_key].index,
)

## Drug response prediction

In [None]:
# GDSC: load the fitted dose-response table of each release.
GDSC_drug_response_frames = {}
for x in ['GDSC2', 'GDSC1']:
    GDSC_drug_response_file = '../data/GDSC/response/%s_fitted_dose_response_25Feb20.xlsx'%(x)
    GDSC_drug_response_frames[x] = pd.read_excel(GDSC_drug_response_file)

# Drug names across both releases. Deduplicate after pooling: concatenating
# the per-release unique lists listed drugs present in both releases twice.
unique_drugs = np.unique(np.concatenate([
    np.unique(GDSC_drug_response_frames[x]['DRUG_NAME']) for x in GDSC_drug_response_frames
]))

# TCGA
TCGA_drug_response_file = '../data/TCGA/response/response.csv'

In [None]:
# Wrap copies of the standardised matrices in DataFrames that carry the
# sample index and gene columns of the original expression frames.
source_expression = pd.DataFrame(normalized_data_df[source_data_key].copy(),
                                 index=data_df[source_data_key].index,
                                 columns=data_df[source_data_key].columns)
X_source, y_source = read_GDSC_response(GDSC_drug_response_frames,
                                        GDSC_drug_name,
                                        source_expression,
                                        GDSC_drug_id)

target_expression = pd.DataFrame(normalized_data_df[target_data_key].copy(),
                                 index=data_df[target_data_key].index,
                                 columns=data_df[target_data_key].columns)
X_target, y_target = read_TCGA_response(TCGA_drug_name,
                                        target_expression,
                                        TCGA_drug_response_file)

### Prediction

In [None]:
# Fit the drug-response predictor on the source data; the response frame is
# flattened to a 1-D array before being handed over.
TRANSACT_clf.fit_predictor(X_source, y_source.values.flatten())

## Interpretability
This section analyses the direction of drug response by decomposing the predictor into offset, linear and interaction contributions.

In [None]:
# Coefficients of the 'regression' step of the best estimator found for the
# fitted predictor.
best_pipeline = TRANSACT_clf.predictive_clf.best_estimator_
predictive_coef = best_pipeline.named_steps['regression'].coef_

In [None]:
# Evaluate the source (gamma) and target (xi) interpolation coefficients at
# the optimal time and place each set on the diagonal of a square matrix.
optimal_time = TRANSACT_clf.optimal_time
interpolation = TRANSACT_clf.interpolation_
source_angular = np.diag(interpolation._gamma_interpolations(optimal_time))
target_angular = np.diag(interpolation._xi_interpolations(optimal_time))

### Offset

In [None]:
# Per-sample exp(-gamma * ||x||^2) for the source samples ...
source_norm = np.linalg.norm(normalized_data_df[source_data_key], axis=1)
source_norm = np.exp(- kernel_param['gamma'] * np.square(source_norm))

# ... and for the target samples.
target_norm = np.linalg.norm(normalized_data_df[target_data_key], axis=1)
target_norm = np.exp(- kernel_param['gamma'] * np.square(target_norm))

In [None]:
# Weight the per-sample offsets with each domain's gamma coefficients, scale
# them by the angular matrices, and add both domains together.
gamma_coef = TRANSACT_clf.principal_vectors_.gamma_coef
sigma_offset_source = source_angular.dot(gamma_coef['source'].dot(source_norm))
sigma_offset_target = target_angular.dot(gamma_coef['target'].dot(target_norm))

sigma_offset = sigma_offset_source + sigma_offset_target
# Element-wise squared offset term.
offset_contribution = np.square(sigma_offset)

In [None]:
# Squared projection of the offset term onto the predictive coefficients.
offset_along_predictor = predictive_coef.dot(sigma_offset)
offset_contribution_pred_direction = np.square(offset_along_predictor)

### Linear

In [None]:
def basis_function(x, i, rbf_gamma):
    """Evaluate the centred linear basis function of feature ``i``.

    For every row of ``x`` this computes
    ``sqrt(2 * rbf_gamma) * x[:, i] * exp(-rbf_gamma * ||row||^2)`` and then
    subtracts the mean over rows.

    Parameters
    ----------
    x : 2-D array, one row per sample.
    i : column index of the feature to evaluate.
    rbf_gamma : gamma parameter of the RBF kernel.

    Returns
    -------
    1-D array with one mean-centred value per row of ``x``.
    """
    squared_row_norms = np.square(np.linalg.norm(x, axis=1))
    damping = np.exp(- rbf_gamma * squared_row_norms)
    scaled_feature = np.sqrt(2*rbf_gamma) * x[:,i]
    values = np.multiply(scaled_feature, damping)
    return values - np.mean(values)

# Number of features, read from the source matrix and used for both domains.
p = normalized_data_df[source_data_key].shape[1]

# One parallel job per feature; each job returns a per-sample vector.
source_basis_eval = Parallel(n_jobs=30, verbose=1)(
    delayed(basis_function)(normalized_data_df[source_data_key], i, kernel_param['gamma'])
    for i in range(p)
)
target_basis_eval = Parallel(n_jobs=30, verbose=1)(
    delayed(basis_function)(normalized_data_df[target_data_key], i, kernel_param['gamma'])
    for i in range(p)
)

# Stack the per-feature vectors and transpose to samples x features.
target_basis_eval = np.array(target_basis_eval).T
source_basis_eval = np.array(source_basis_eval).T

In [None]:
# Same construction as the offset term, applied to the linear basis
# evaluations of each domain.
gamma_coef = TRANSACT_clf.principal_vectors_.gamma_coef
sigma_linear_source = source_angular.dot(gamma_coef['source'].dot(source_basis_eval))
sigma_linear_target = target_angular.dot(gamma_coef['target'].dot(target_basis_eval))

sigma_linear = sigma_linear_source + sigma_linear_target
# Squared Euclidean norm of every row of sigma_linear.
linear_row_norms = np.linalg.norm(sigma_linear, axis=1)
linear_contribution = np.square(linear_row_norms)

In [None]:
# Project the linear term onto the predictive coefficients and sum the
# squared entries.
sigma_linear_pred_direction = predictive_coef.dot(sigma_linear)
squared_linear_loadings = np.square(sigma_linear_pred_direction)
linear_contribution_pred_direction = np.sum(squared_linear_loadings)
# Gene names as strings, in the column order of the source expression frame.
genes = np.array(data_df[source_data_key].columns).astype(str)

### Interactions

In [None]:
def interaction_loading(i, j, rbf_gamma, norm_source, norm_target):
    """Loading of the interaction between features ``i`` and ``j``.

    Reads the notebook globals ``normalized_data_df``, ``source_data_key``,
    ``target_data_key``, ``source_angular``, ``target_angular`` and
    ``TRANSACT_clf``.

    Parameters
    ----------
    i, j : column indices of the two interacting features.
    rbf_gamma : gamma parameter of the RBF kernel.
    norm_source, norm_target : per-sample weights multiplied into the
        feature products of the source and target data respectively.

    Returns
    -------
    Sum of the source and target projections, scaled by ``2 * rbf_gamma``
    (halved when ``i == j``).
    """
    def _domain_term(data_key, domain, angular, sample_weights):
        # Product of the two feature columns, weighted per sample, then
        # mapped through the domain's gamma coefficients and angular matrix.
        data = normalized_data_df[data_key]
        product = np.multiply(data[:,i], data[:,j])
        product = np.multiply(product, sample_weights)
        return angular.dot(TRANSACT_clf.principal_vectors_.gamma_coef[domain].dot(product))

    scale = 2*rbf_gamma
    if i == j:
        scale /= 2

    source_term = _domain_term(source_data_key, 'source', source_angular, norm_source)
    target_term = _domain_term(target_data_key, 'target', target_angular, norm_target)
    return scale * (source_term + target_term)

def interaction_loadings_genes(i, rbf_gamma):
    """Interaction loadings of feature ``i`` with every feature ``j <= i``.

    Returns a list of ``i + 1`` entries, one per ``j`` in ``range(i + 1)``,
    each produced by ``interaction_loading``.
    """
    # Per-sample exp(-gamma * ||x||^2), computed once for all pairs of this feature.
    source_sq_norm = np.square(np.linalg.norm(normalized_data_df[source_data_key], axis=1))
    target_sq_norm = np.square(np.linalg.norm(normalized_data_df[target_data_key], axis=1))
    norm_source = np.exp(-rbf_gamma*source_sq_norm)
    norm_target = np.exp(-rbf_gamma*target_sq_norm)

    collected = []
    for j in range(i+1):
        # Sparse progress trace: roughly one iteration in a hundred prints j.
        if np.random.rand() < 0.01:
            print(j)
        collected.append(interaction_loading(i, j, rbf_gamma, norm_source, norm_target))

    return collected

In [None]:
# One job per gene i; each returns the loadings of the pairs (i, j), j <= i.
loadings = Parallel(n_jobs=40, verbose=1)(delayed(interaction_loadings_genes)(i, kernel_param['gamma'])
                                          for i in range(normalized_data_df[source_data_key].shape[1]))

# Keep a plain list of arrays: block i has i + 1 rows, so the blocks are
# ragged and wrapping them in a single np.array raises ValueError on
# NumPy >= 1.24. Indexing loadings[i][j, k] works unchanged on the list.
loadings = [np.array(e) for e in loadings]
# Project every pair loading onto the predictive coefficients.
interaction_loadings_pred_dir = [x.dot(predictive_coef) for x in loadings]

In [None]:
# Interaction coef between genes i and j in factor k
def interaction_coef(i, j, k):
    """Look up the loading of the pair (i, j) in factor k.

    ``loadings`` only stores pairs with the larger index first, so the two
    indices are ordered before the lookup.
    """
    low, high = (i, j) if i <= j else (j, i)
    return loadings[high][low, k]

# Interaction coefs with gene i in factor k
def gene_interaction_coef(i, k):
    """Return the factor-k loadings of gene i paired with every gene."""
    coefs = []
    for j, _ in enumerate(genes):
        coefs.append(interaction_coef(i, j, k))
    return np.array(coefs)

def batch_gene_interaction_coef(I, k):
    """Return ``gene_interaction_coef(i, k)`` for each gene index in ``I``."""
    return list(map(lambda i: gene_interaction_coef(i, k), I))

In [None]:
# Sum of squared projected loadings: first within each gene's block, then
# across all blocks.
per_gene_squared_sum = np.array([np.sum(np.square(block)) for block in interaction_loadings_pred_dir])
interaction_contribution_pred = np.sum(per_gene_squared_sum, axis=0)

In [None]:
# Single-row table holding the three contributions along the predictor.
contribution_values = np.array([
    offset_contribution_pred_direction,
    linear_contribution_pred_direction,
    interaction_contribution_pred,
])
contribution_df = pd.DataFrame(contribution_values).T
contribution_df.columns = ['Offset', 'Linear', 'Interaction']

In [None]:
# Rebuild the single-row contribution table ...
contribution_values = np.array([
    offset_contribution_pred_direction,
    linear_contribution_pred_direction,
    interaction_contribution_pred,
])
contribution_df = pd.DataFrame(contribution_values).T
contribution_df.columns = ['Offset', 'Linear', 'Interaction']
# ... express each term as a fraction of the squared norm of the predictive
# coefficients, and assign whatever remains to 'Higher order'.
coef_squared_norm = np.sum(np.square(predictive_coef))
contribution_df = contribution_df / coef_squared_norm
contribution_df['Higher order'] = 1 - np.sum(contribution_df, axis=1)
contribution_df.plot.barh(stacked=True, width=1.2)

# Percent ticks at 0, 25, 50, 75 and 100.
tick_positions = np.linspace(0,1,5)
tick_labels = ['%s%%'%(int(100*e)) for e in tick_positions]
plt.xlim(0,1)
plt.xticks(tick_positions, tick_labels, fontsize=20)
plt.yticks([], [])
plt.legend(bbox_to_anchor=(0.7, 1.05), loc=4, borderaxespad=0., fontsize=15, ncol=2)
plt.tight_layout()

contribution_figure = '%s/contribution_%s_%s.png'%(kernel_subfolder, GDSC_drug_name, TCGA_drug_name)
plt.savefig(contribution_figure,
           dpi=300)

In [None]:
# Flatten the per-gene blocks into one vector ordered (0,0), (1,0), (1,1), (2,0), ...
interaction_loadings_pred_dir = np.concatenate(interaction_loadings_pred_dir)
# Matching (gene i, gene j) names for every j <= i. np.tril_indices walks the
# lower triangle in that same row-major order, replacing the Python-level
# list of tuples with two vectorised lookups.
pair_rows, pair_cols = np.tril_indices(len(genes))
gene_list = np.column_stack((genes[pair_rows], genes[pair_cols]))
# Positions that would sort the loadings in increasing order.
sorted_index = np.argsort(interaction_loadings_pred_dir)

In [None]:
# Minimum absolute loading for a pair to be kept (0 keeps every pair).
threshold = 0.000

thresholded_index = (np.abs(interaction_loadings_pred_dir) >= threshold)
kept_pairs = gene_list[thresholded_index]
# One row per retained gene pair with its projected loading.
interactions_loadings_pred_df = pd.DataFrame({
    'Gene A': kept_pairs[:,0],
    'Gene B': kept_pairs[:,1],
    'Contribution': interaction_loadings_pred_dir[thresholded_index],
})
interactions_loadings_pred_df['Contribution'] = interactions_loadings_pred_df['Contribution'].astype(float)

In [None]:
# Number of gene pairs kept, ranked by absolute contribution.
top_interactions = 10**4

# Pick the row labels of the largest |contribution| values and build a new
# frame from them. This avoids adding and deleting a helper column and
# sorting a head() slice in place, which can trigger SettingWithCopyWarning.
abs_contribution = interactions_loadings_pred_df['Contribution'].abs()
top_index = abs_contribution.sort_values(ascending=False).index[:top_interactions]
interactions_loadings_pred_df = (interactions_loadings_pred_df
                                 .loc[top_index]
                                 .sort_values('Contribution', ascending=False))

# Written to figure_folder (instead of repeating the './figures/' literal)
# so the export follows the configured output root.
interactions_loadings_pred_df.to_csv('%s/interactions_top_%s_GDSC_%s_TCGA_%s.csv'%(figure_folder,
                                                                                   top_interactions,
                                                                                   GDSC_drug_name,
                                                                                   TCGA_drug_name))

In [None]:
# Sort ascending and renumber so that .loc[i] is the i-th smallest contribution.
interactions_loadings_pred_df = interactions_loadings_pred_df.sort_values('Contribution').reset_index(drop=True)

# Full sorted loading curve.
x = np.arange(interaction_loadings_pred_dir.shape[0])
y = np.sort(interaction_loadings_pred_dir)
plt.figure(figsize=(7,4.5))
plt.plot(x, y, '+')

# NOTE(review): the labels come from interactions_loadings_pred_df, which an
# earlier cell truncates to the top_interactions largest |contribution| rows;
# they line up with the curve only while at least `limit` rows of each sign
# survive that truncation -- confirm for new drugs.

# Bottom: label the `limit` smallest loadings.
ts = []
limit = 10
for i in range(limit):
    t = interactions_loadings_pred_df.loc[i]
    t = '(%s,%s)'%(t['Gene A'], t['Gene B'])
    ts.append(plt.text(x[i], y[i], t))
adjust_text(ts, x=x[:limit], y=y[:limit], force_points=(0.1,1.),
            arrowprops=dict(arrowstyle='-', color='red'),
            size=20, force_text=(-1., .5), precision=0.0001, lim=1000)

# Top: label the `limit` largest loadings.
ts = []
limit = 10
for i in range(1,limit+1):
    t = interactions_loadings_pred_df.loc[interactions_loadings_pred_df.shape[0]-i]
    t = '(%s,%s)'%(t['Gene A'], t['Gene B'])
    ts.append(plt.text(x[-i], y[-i], t))
# Pass exactly the `limit` labelled points; the slice used to start one
# element early and handed adjust_text limit + 1 points for limit labels.
adjust_text(ts, x=x[-limit:][::-1], y=y[-limit:][::-1],
            force_points=(.1,1.), force_text=(1.,.5),
            arrowprops=dict(arrowstyle='-', color='red'),
            size=20, precision=0.0001, lim=1000)

plt.xlabel('Gene pairs', fontsize=20, color='black')
plt.ylabel('Interaction loading', fontsize=20, color='black')
plt.xticks([0, interaction_loadings_pred_dir.shape[0]-1],
           [1, str(interaction_loadings_pred_dir.shape[0])], fontsize=15)
plt.yticks(fontsize=15, color='black')
plt.tight_layout()
plt.savefig('%s/interaction_loadings_%s_%s.png'%(kernel_subfolder,
                                                 TCGA_drug_name,
                                                 GDSC_drug_name),
           dpi=300)

del x,y,limit,ts

In [None]:
# Sorted loading curve with the positive area in red and the negative area in green.
x = np.arange(interaction_loadings_pred_dir.shape[0])
y = np.sort(interaction_loadings_pred_dir)
positive_part = y.clip(0)
negative_part = -(-y).clip(0)

plt.figure(figsize=(10,4.5))
plt.plot(x, y, color='black')

plt.fill_between(x, positive_part, color='red')
plt.fill_between(x, negative_part, color='green')
# Zero baseline plus a vertical segment at each end of the curve.
plt.hlines(0, x.shape[0], 0)
plt.vlines(0, 0, np.min(y))
plt.vlines(y.shape[0]-1, 0, np.max(y))
plt.xticks([])
plt.xlabel('')
plt.yticks(rotation=0, fontsize=20, color='black')
plt.legend().remove()
plt.tight_layout()
# NOTE(review): the file name is formatted with GDSC_drug_id, which is None in
# this configuration -- confirm this is intended.
fill_plot_path = './figures/interaction_plot_%s_%s.png'%(GDSC_drug_name, GDSC_drug_id)
plt.savefig(fill_plot_path, dpi=300)

In [None]:
# Values shared by the three export file names; the threshold is written with
# '_' in place of '.'.
threshold_tag = str(threshold).replace('.', '_')
name_fields = (kernel_subfolder, GDSC_drug_name, TCGA_drug_name, threshold_tag)

# Full table, sorted by increasing contribution.
interactions_loadings_pred_df = interactions_loadings_pred_df.sort_values('Contribution')
interactions_loadings_pred_df.to_csv('%s/interaction_loadings_%s_%s_threshold_%s.csv'%name_fields)

# Gene pairs only, tab-separated and without the index column.
gene_pairs = interactions_loadings_pred_df[['Gene A', 'Gene B']]
gene_pairs.to_csv('%s/interaction_pairs_%s_%s_threshold_%s.csv'%name_fields, sep='\t', index=False)

# Every gene that appears in at least one pair, one name per line.
unique_gene_interacted = np.unique(gene_pairs.values.flatten())
with open('%s/interaction_genes_%s_%s_threshold_%s.txt'%name_fields,
         'w') as f:
    f.write('\n'.join(unique_gene_interacted))