## Biomarker test
Certain biomarkers are known to be correlated with the drug response. This notebook investigates such associations. First, we predict the drug response of the tumors using our domain-adapted regression model. Then we correlate the predicted drug response with the biomarker status, be it a mutation, a copy number alteration or a translocation.
<br/>
We also provide a comparison of correlation with two other pipelines:
<ul>
    <li> Without any domain adaptation: the Ridge regression model (or ElasticNet) is transferred directly to the tumors.
    <li> Using ComBat, treating the pre-clinical models (cell lines, PDX) and the tumors as two batches: the transcriptomics data is first corrected with ComBat, then the Ridge regression model is applied directly to the human tumors.
</ul>
The ComBat implementation used here comes from: <a href="https://github.com/brentp/combat.py">https://github.com/brentp/combat.py</a>

In [None]:
# --- Notebook parameters ---------------------------------------------------
# Tumor type: passed to read_data and used (lower-cased) in the biomarker paths.
tumor_surname = 'Breast'
# Cell line type passed to read_data; the value 'All' is turned into None there.
cell_line_type = 'BRCA'
# Identifier of the drug handed to read_drug_response.
drug_id = 119

# Options forwarded to feature_engineering.
normalization = 'TMM'
transformation = 'log'
# Centering / scaling flags used by the regression pipelines and the predictor.
mean_center = True
std_unit = False
# Forwarded as last argument to read_data (spelling kept: the name is used below).
filter_mytochondrial = False
# NOTE(review): not referenced anywhere in the visible cells - confirm it is still needed.
protein_coding_only = True

In [None]:
import sys, os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy
from time import time
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet, Ridge
from sklearn.externals.joblib import Parallel, delayed
from sklearn.decomposition import PCA
plt.style.use('ggplot')

os.environ['OMP_NUM_THREADS'] = '1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'

import precise
from data_reader.read_data import read_data
from data_reader.read_drug_response import read_drug_response
from data_reader.read_cna_tumors import read_cna_tumors
from data_reader.read_mutations_tumors import read_mutations_tumors
from data_reader.read_translocations_tumors import read_translocations_tumors
from normalization_methods.feature_engineering import feature_engineering
from precise import DrugResponsePredictor, IntermediateFactors

sys.path.insert(0, './combat/')
from combat import combat

sys.path.insert(0, './statannot/')
from statannot.statannot import add_stat_annotation

## Data import

In [None]:
# Genomic data
# 'All' is the sentinel meaning "do not restrict the cell line type".
source_type_filter = None if cell_line_type == 'All' else cell_line_type

# First query: restricted to the requested cell line type.
(X_target_raw,
 X_source_raw,
 gene_names,
 source_sample_names,
 tumor_barcodes) = read_data('cell_line', 'tumor', 'count',
                             source_type_filter, tumor_surname,
                             filter_mytochondrial)

# Second query: same call without any cell line type restriction. Only the
# source matrix, the gene names and the source sample names are kept.
_, X_source_all_raw, gene_names, all_source_sample_names, _ = read_data(
    'cell_line', 'tumor', 'count', None, tumor_surname, filter_mytochondrial)

### Library size normalization

In [None]:
# Normalisation
# The same feature_engineering settings are applied to the three raw matrices.
# NOTE(review): the trailing True / False are hard-coded rather than taken from
# mean_center / std_unit - confirm what these two positional flags control.
X_target, X_source, X_source_all = (
    feature_engineering(raw_counts, normalization, transformation, True, False)
    for raw_counts in (X_target_raw, X_source_raw, X_source_all_raw)
)

In [None]:
## Normalize for total variance
# Every dataset is divided by the square root of its summed per-gene variance
# and multiplied by `total_variance`, so that the three matrices end up on a
# comparable overall scale.
total_variance = 10**3

target_total_variance = np.sqrt(np.sum(np.var(X_target, 0)))
X_target = X_target / target_total_variance
X_target *= total_variance

source_total_variance = np.sqrt(np.sum(np.var(X_source, 0)))
X_source = X_source / source_total_variance
X_source *= total_variance

# X_source_all must be scaled by its OWN factor; dividing it by
# source_total_variance (computed on X_source) left the factor below unused
# and put X_source_all on a different scale than intended.
source_all_total_variance = np.sqrt(np.sum(np.var(X_source_all, 0)))
X_source_all = X_source_all / source_all_total_variance
X_source_all *= total_variance

In [None]:
# Drug response
# read_drug_response is given the drug id together with the full source matrix
# and its sample names; the four returned values are unpacked below.
drug_response = read_drug_response(drug_id, X_source_all, all_source_sample_names, 'count')
X_source_response, y_source, response_sample_names, name = drug_response
print(name)

In [None]:
# Filter data
target_data = X_target
# Keep the source samples that have NO measured response for this drug.
# The rows of X_source_all are aligned with all_source_sample_names (both come
# from the same read_data call), so the mask has to be built from that list:
# using source_sample_names (the type-restricted list) picked unrelated rows.
has_no_response = ~np.isin(all_source_sample_names, response_sample_names)
source_data = X_source_all[has_no_response]

## Train regression model

### Domain-adapted method

In [None]:
# Settings of the domain-adapted predictor (also reused in the figure names).
regression_type = 'consensus'
n_samples = 100
n_pv = 40
n_factors = 70

predictor = DrugResponsePredictor(
    source_data=source_data,
    target_data=target_data,
    method=regression_type,
    n_representations=n_samples,
    n_pv=n_pv,
    n_factors=n_factors,
    l1_ratio=0.,
    dim_reduction='pca',
    dim_reduction_target='pca',
    mean_center=mean_center,
    std_unit=std_unit,
    n_jobs=10,
)

# Cross-validation settings are set as attributes before fitting.
predictor.alpha_values = list(np.logspace(-7,7,20))
predictor.cv_fold = 10
predictor.verbose = 5

# Fit on the source samples that have a measured drug response.
predictor.fit(X_source_response, y_source, use_data=True)

# Optional check, left disabled:
#predictor.compute_predictive_performance(X_source_response, y_source)

### Ridge regression on raw data

In [None]:
# Baseline: ridge regression fitted on the source data and applied as-is to
# the tumors (no domain adaptation).

# Parameters for the grid search
alpha_values = np.logspace(-10,10,30)
param_grid = {
    'regression__alpha': alpha_values
}

# Grid search setup
# The scaler honours both mean_center and std_unit, like the ComBat pipeline
# and the domain-adapted predictor, so that the '_standardized' suffix of the
# saved figures is truthful for this baseline too. With std_unit = False the
# behaviour is unchanged.
ridge_pipeline = Pipeline([
    ('normalization', StandardScaler(with_mean=mean_center, with_std=std_unit)),
    ('regression', Ridge())
])
grid_en = GridSearchCV(ridge_pipeline,
                       cv=10, n_jobs=30, param_grid=param_grid, verbose=1,
                       scoring='neg_mean_squared_error')

# Fit grid search
grid_en.fit(X_source_response, y_source)

### ComBat as domain adaptation

In [None]:
# Stack the two datasets row-wise: X_target rows first, then X_source_all rows.
X_total = np.concatenate([X_target, X_source_all])

# Filter genes to remove the potential 0: remain conservative
# (only the 15000 genes with the largest variance over the stacked data are kept).
gene_variance = np.var(X_total, 0)
# From here on gene_variance holds gene INDICES sorted by decreasing variance.
gene_variance = np.argsort(gene_variance)[::-1]
list_genes = gene_variance[:15000]
X_total = X_total[:, list_genes]

# Combat normalization
# Batch labels: 0 for every X_target row, 1 for every X_source_all row.
a = [0]*X_target.shape[0] + [1]*X_source_all.shape[0]
batch = pd.Series(a)
batch.index = np.concatenate([tumor_barcodes, all_source_sample_names])

# combat receives a (genes x samples) DataFrame whose columns match batch.index.
# NOTE(review): astype(int) truncates the normalised values to integers before
# the correction - confirm that this loss of precision is intended.
data = pd.DataFrame(X_total.transpose().astype(int))
data.columns = np.concatenate([tumor_barcodes, all_source_sample_names])
data.index = gene_names[list_genes]

batch_corrected_data = combat.combat(data, batch=batch)
# Split the corrected matrix back into its two parts and restore the
# (samples x genes) orientation.
X_target_corrected = np.array(batch_corrected_data)[:,:X_target.shape[0]].transpose()
X_source_corrected = np.array(batch_corrected_data)[:,X_target.shape[0]:].transpose()
# Corrected profiles of the source samples that have a drug response.
# NOTE(review): rows follow the order of all_source_sample_names; presumably
# y_source is in the same order - verify against read_drug_response.
X_source_response_corrected = X_source_corrected[np.where(np.isin(all_source_sample_names, response_sample_names))]

In [None]:
#Parameters for the grid search
alpha_values = np.logspace(-1,8,20)
param_grid ={
    'regression__alpha': alpha_values
}

#Grid search setup

grid_combat = GridSearchCV(Pipeline([
                        ('normalization', StandardScaler(with_mean=mean_center, with_std=std_unit)),
                        ('regression', Ridge())
                    ]),\
                    cv=10, n_jobs=20, param_grid=param_grid, verbose=1, scoring='neg_mean_squared_error')

#Fit grid search
grid_combat.fit(X_source_response_corrected, y_source)

## Biomarker test

In [None]:
# One row per tumor barcode; biomarker and prediction columns are added below.
df = pd.DataFrame(tumor_barcodes, columns=['NAME'])

### Read data
#### For CNA

In [None]:
# Gene whose copy-number status is used as biomarker.
gene_name = 'ERBB2'
# allele = None makes the filtering step further down use np.isnan on the
# continuous biomarker instead of comparing it with 'na'.
allele = None
data_location = './data/biomarkers/tcga_%s/data.txt'%(tumor_surname.lower())
cna_tumors = read_cna_tumors(gene_name, tumor_barcodes, data_location)

# WARNING: path to adapt (original note: "TO CHANGE")
data_location_linear = './data/biomarkers/tcga_%s/data_linear_CNA.txt'%(tumor_surname.lower())
cna_tumors_linear = read_cna_tumors(gene_name, tumor_barcodes, data_location_linear)

biomarkers_discrete = cna_tumors
biomarkers_continuous = cna_tumors_linear

# Discrete levels are stored as strings so they can serve as plot categories.
# NOTE(review): astype(int) assumes cna_tumors holds no NaN - confirm.
biomarkers_discrete = biomarkers_discrete.astype(int).astype(str)
# WARNING: range to adapt (magnitude of CNA); levels outside -1..3 become 'na'.
biomarkers_discrete[~np.isin(biomarkers_discrete, np.arange(-1,4).astype(str))] = 'na'

# The column names ('CNN_*') are read back by plot_cna_status.
df['CNN_discrete'] = biomarkers_discrete
df['CNN_continuous'] = biomarkers_continuous

#### For mutations data

In [None]:
# Gene whose mutation status is used as biomarker.
gene_name = 'BRCA1'
# '' -> plain mutated (MT) versus wild-type (WT) comparison.
# A non-empty string singles that allele out as a group of its own.
allele = ''
data_location = './data/biomarkers/tcga_%s/%s_mutation_status.csv'\
                %(tumor_surname.lower(), gene_name)
detail_data_location = './data/biomarkers/tcga_%s/%s_mutation_detailed.csv'\
                %(tumor_surname.lower(), gene_name)
mutations_tumors = read_mutations_tumors(gene_name, tumor_barcodes, data_location, detail_data_location)

if allele != '':
    # Collapse every entry mentioning the allele to the allele itself.
    # The result must stay an ndarray: with a plain list the comparison
    # `mutations_tumors == '-1.0'` below is a single False, and the samples
    # with a missing status would never be marked 'na'.
    mutations_tumors = np.array([e if allele not in e else allele for e in mutations_tumors])

biomarkers_discrete = np.copy(mutations_tumors)
biomarkers_continuous = np.copy(mutations_tumors)
# '-1.0' is the code replaced by 'na' (status not available).
biomarkers_continuous[np.where(mutations_tumors == '-1.0')] = 'na'
biomarkers_discrete[np.where(mutations_tumors == '-1.0')] = 'na'

if allele == '':
    # Anything that is neither 'na' nor '0.0' is a mutation.
    biomarkers_continuous[~np.isin(biomarkers_continuous, ['na', '0.0'])] = ''
    biomarkers_discrete[~np.isin(biomarkers_discrete, ['na', '0.0'])] = 'MT'
    # NOTE(review): this relabels 'na' as 'WT' together with '0.0', i.e.
    # samples without a status are counted as wild-type - confirm intended.
    biomarkers_discrete[np.isin(biomarkers_discrete, ['na', '0.0'])] = 'WT'
else:
    # Three groups: 'WT' ('0.0'), the selected allele, and 'MT' for any other
    # mutation; 'na' is kept as is.
    biomarkers_discrete = np.array(['WT' if e == '0.0' else \
                           (allele if e == allele else\
                            (e if e == 'na' else 'MT')) for e in biomarkers_discrete])
    # NOTE(review): entries are cut to len(allele) characters before the same
    # mapping is applied; with a short allele this also cuts '0.0' / 'na' -
    # verify the intent.
    biomarkers_continuous = np.array([e[:len(allele)] for e in biomarkers_continuous])
    biomarkers_continuous = np.array(['WT' if e == '0.0' else \
                           (allele if e == allele else\
                            (e if e == 'na' else 'MT')) for e in biomarkers_continuous])

# Column 'mutation_discrete' is read back by plot_mutation_status.
df['mutation_discrete'] = biomarkers_discrete
df['mutation_continuous'] = biomarkers_continuous

#### For translocations data

In [None]:
# Gene pair defining the translocation of interest.
gene_A = 'BCR'
gene_B = 'ABL1'
# NOTE: this resets the notebook-level `allele` used by the cells below.
allele = ''

translocation_indicator = read_translocations_tumors(gene_A, gene_B, tumor_barcodes)
# NOTE: this overwrites the biomarkers set by the CNA / mutation cells; the
# filtering cell below works on whichever biomarker cell was run last.
biomarkers_continuous = translocation_indicator
biomarkers_discrete = translocation_indicator

# Column 'translocation' is read back by plot_translocation_status.
df['translocation'] = translocation_indicator

### Compute drug response for target

In [None]:
# Predicted drug response of the tumors for the three pipelines.
y_tumors = predictor.predict(X_target)
y_tumors_en = grid_en.predict(X_target)
y_tumors_combat = grid_combat.predict(X_target_corrected)

# All three columns are required: the plotting cells request 'consensus',
# 'ridge' and 'combat' from df, so leaving 'consensus' out raised a KeyError.
df['consensus'] = y_tumors
df['ridge'] = y_tumors_en
df['combat'] = y_tumors_combat

# Samples with a usable biomarker value. With allele = None the continuous
# biomarker is numeric and missing values are NaN; otherwise they are 'na'.
if allele is None:
    to_keep = ~(np.isnan(biomarkers_continuous))
else:
    to_keep = biomarkers_continuous != 'na'
# The discrete biomarker uses 'na' for missing values in both cases.
to_keep_discrete = biomarkers_discrete != 'na'

# Biomarker values and predictions restricted to the usable samples.
x = biomarkers_continuous[to_keep]
y_precise = y_tumors[to_keep]
y_en = y_tumors_en[to_keep]
y_combat = y_tumors_combat[to_keep]

In [None]:
# Visual comparison of the predictions made by the different pipelines.
plt.figure(figsize=(12,6))

# Tumors: ridge (x) versus domain-adapted (y) predictions.
plt.subplot(131)
plt.scatter(y_tumors_en, y_tumors)

# Domain-adapted predictions on the source samples with a measured response.
source_predictions_da = predictor.predict(X_source_response)

# Source samples: ComBat (x) versus domain-adapted (y) predictions.
# grid_combat was fitted on the ComBat-corrected matrix, so it has to be fed
# X_source_response_corrected; the former `pca.transform(...)` referred to a
# `pca` object that is never defined in this notebook.
plt.subplot(132)
plt.scatter(grid_combat.predict(X_source_response_corrected), source_predictions_da)

# Source samples: ridge (x) versus domain-adapted (y) predictions.
plt.subplot(133)
plt.scatter(grid_en.predict(X_source_response), source_predictions_da)
plt.show()

In [None]:
# Mean squared error of each pipeline on the source samples it was fitted on.
ridge_residuals = grid_en.predict(X_source_response) - y_source
print('Ridge: %s'%(np.mean(np.square(ridge_residuals))))

combat_residuals = grid_combat.predict(X_source_response_corrected) - y_source
print('ComBat: %s'%(np.mean(np.square(combat_residuals))))

da_residuals = predictor.predict(X_source_response) - y_source
print('DA: %s'%(np.mean(np.square(da_residuals))))

In [None]:
# Takes available gene status
gene_status = np.unique(biomarkers_discrete[to_keep_discrete])
if allele == '':
    gene_status = ['WT','MT']
elif allele is not None:
    gene_status = ['WT', allele, 'MT']

### Mutations

In [None]:
def plot_mutation_status(df, type_regression, x, y):
    """Plot the predicted drug response per mutation status.

    Draws a box plot overlaid with a swarm plot of ``df[type_regression]``
    grouped by ``df['mutation_discrete']`` (rows equal to 'na' are left out),
    adds statannot annotations for the pairs selected by the notebook-level
    ``allele``, saves the figure under ``figures/`` and shows it.

    Besides its arguments, the function reads these notebook globals:
    ``allele``, ``name``, ``gene_name``, ``tumor_surname``,
    ``cell_line_type``, ``n_pv``, ``n_factors``, ``n_samples``, ``std_unit``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'mutation_discrete' and ``type_regression``.
    type_regression : str
        Name of the prediction column plotted on the y-axis.
    x, y
        Unused; kept so that all plotting helpers share one signature.
    """
    plt.figure(figsize=(7,8))

    # Samples with a known status, computed once for all three plotting calls.
    known_status = df[df.mutation_discrete != 'na']

    #Plot the different values in a boxplot
    bplot = sns.boxplot(data=known_status,
                        x='mutation_discrete',
                        y=type_regression,
                        linewidth=2.,
                        width=.8,
                        whis=[5,95],
                        showfliers=False,
                        boxprops=dict(alpha=.2))
    sns.swarmplot(data=known_status, x='mutation_discrete', y=type_regression)

    # Annotation
    if allele == '':
        bxpair = [('MT', 'WT')]
    elif allele is not None:
        bxpair = [(allele, 'MT'), ('WT', allele)]
    else:
        # No mutation groups are defined when allele is None; previously
        # bxpair was left unbound here and the call below raised.
        bxpair = []
    if bxpair:
        add_stat_annotation(bplot, data=known_status,
                            x='mutation_discrete', y=type_regression,
                            boxPairList=bxpair,
                            textFormat='full', loc='inside', verbose=2, fontsize=16)

    plt.xlabel('Mutation', fontsize=17)
    plt.yticks(fontsize=15)
    plt.ylabel('Log IC50 predicted for tumors', fontsize=17)
    plt.tight_layout()
    plt.savefig('figures/fig_4_%s_%s_%s_%s_%s_%s_%s_%s_%s%s.png'%(
        name,
        type_regression,
        gene_name,
        allele if allele is not None else 'cna',
        tumor_surname,
        # '/' would be read as a directory separator in the file name
        cell_line_type.replace('/',''),
        n_pv,
        n_factors,
        n_samples,
        '_standardized' if std_unit else ''
    ), dpi=300)

    plt.show()

In [None]:
# One mutation figure per prediction pipeline (column of df, filtered predictions).
for _column, _predictions in (('consensus', y_precise),
                              ('ridge', y_en),
                              ('combat', y_combat)):
    plot_mutation_status(df, _column, x, _predictions)

### Copy number

In [None]:
def plot_cna_status(df, type_regression, x, y):
    """Plot the predicted drug response per copy-number level.

    Draws a box plot overlaid with a swarm plot of ``df[type_regression]``
    grouped by ``df['CNN_discrete']`` (rows equal to 'na' are left out; only
    the levels '-1', '0', '1' and '2' are drawn), annotates it with statannot,
    puts the Spearman and Pearson correlations between ``x`` and ``y`` in the
    title, saves the figure under ``figures/`` and shows it.

    Besides its arguments, the function reads these notebook globals:
    ``name``, ``gene_name``, ``tumor_surname``, ``cell_line_type``, ``n_pv``,
    ``n_factors``, ``n_samples``, ``std_unit``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'CNN_discrete' and ``type_regression``.
    type_regression : str
        Name of the prediction column plotted on the y-axis.
    x : array-like
        Continuous biomarker values used for the correlations.
    y : array-like
        Predicted responses matching ``x``.
    """
    # `import scipy` alone does not guarantee that the stats submodule is loaded.
    from scipy import stats

    plt.figure(figsize=(7,8))

    # Samples with a known level, and the levels shown on the x-axis.
    known_status = df[df['CNN_discrete'] != 'na']
    cna_levels = ['-1', '0', '1', '2']

    #Plot the different values in a boxplot
    bplot = sns.boxplot(data=known_status,
                        x='CNN_discrete',
                        y=type_regression,
                        linewidth=2.,
                        width=.8,
                        whis=[5,95],
                        showfliers=False,
                        boxprops=dict(alpha=.2),
                        order=cna_levels)
    sns.swarmplot(data=known_status,
                  x='CNN_discrete',
                  y=type_regression,
                  order=cna_levels,
                  alpha=.6)

    # Compute correlation
    spearman = stats.spearmanr(x, y)
    pearson = stats.pearsonr(x, y)

    # Annotation
    ## WARNING: to change depending on which test you want to do
    bxpair = [('-1','0'), ('-1', '1'), ('-1', '2')]
    add_stat_annotation(bplot, data=known_status,
                        x='CNN_discrete', y=type_regression,
                        boxPairList=bxpair,
                        textFormat='star', loc='inside', verbose=2, fontsize=16)

    plt.title('Spearman correlation: %.3f p-value %.3E \n Pearson correlation: %.3f p-value %.3E'\
              %(spearman[0], spearman[1], pearson[0], pearson[1]))
    # The x-axis shows copy-number levels, not mutations.
    plt.xlabel('Copy number alteration', fontsize=17)
    plt.yticks(fontsize=15)
    plt.ylabel('Log IC50 predicted for tumors', fontsize=17)
    plt.tight_layout()
    plt.savefig('figures/fig_4_%s_%s_cna_%s_%s_%s_%s_%s_%s%s.png'%(
        name,
        type_regression,
        gene_name,
        tumor_surname,
        # '/' would be read as a directory separator (same rule as in
        # plot_mutation_status)
        cell_line_type.replace('/',''),
        n_pv,
        n_factors,
        n_samples,
        '_standardized' if std_unit else ''
    ), dpi=300)

    plt.show()

In [None]:
# One copy-number figure per prediction pipeline (column of df, filtered predictions).
for _column, _predictions in (('consensus', y_precise),
                              ('ridge', y_en),
                              ('combat', y_combat)):
    plot_cna_status(df, _column, x, _predictions)

### Translocation

In [None]:
def plot_translocation_status(df, type_regression, x, y):
    """Plot the predicted drug response per translocation status.

    Draws a box plot overlaid with a swarm plot of ``df[type_regression]``
    grouped by ``df['translocation']``, annotates the (0., 1.) pair with
    statannot, saves the figure under ``figures/`` and shows it.

    Besides its arguments, the function reads these notebook globals:
    ``name``, ``gene_A``, ``gene_B``, ``tumor_surname``, ``cell_line_type``,
    ``n_pv``, ``n_factors``, ``n_samples``, ``std_unit``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'translocation' and ``type_regression``.
    type_regression : str
        Name of the prediction column plotted on the y-axis.
    x, y
        Unused; kept so that all plotting helpers share one signature.
    """
    plt.figure(figsize=(7,8))

    #Plot the different values in a boxplot
    bplot = sns.boxplot(data=df,
                        x='translocation',
                        y=type_regression,
                        linewidth=2.,
                        width=.8,
                        whis=[5,95],
                        showfliers=False,
                        boxprops=dict(alpha=.2))
    sns.swarmplot(data=df,
                  x='translocation',
                  y=type_regression,
                  alpha=.6)

    # Annotation: compare the two indicator values
    bxpair = [(0., 1.)]
    add_stat_annotation(bplot, data=df,
                        x='translocation', y=type_regression,
                        boxPairList=bxpair,
                        textFormat='full', loc='inside', verbose=2)

    plt.xlabel('Translocation', fontsize=17)
    plt.ylabel('Log IC50 predicted for tumors', fontsize=17)
    plt.tight_layout()
    plt.savefig('figures/fig_4_%s_%s_translocation_%s_%s_%s_%s_%s_%s_%s%s.png'%(
        name,
        type_regression,
        gene_A,
        gene_B,
        tumor_surname,
        # '/' would be read as a directory separator (same rule as in
        # plot_mutation_status)
        cell_line_type.replace('/',''),
        n_pv,
        n_factors,
        n_samples,
        '_standardized' if std_unit else ''
    ), dpi=300)

    plt.show()

In [None]:
# One translocation figure per prediction pipeline (column of df, filtered predictions).
for _column, _predictions in (('consensus', y_precise),
                              ('ridge', y_en),
                              ('combat', y_combat)):
    plot_translocation_status(df, _column, x, _predictions)