<a href="https://colab.research.google.com/github/JohannesKarwou/notebooks/blob/main/Bootstrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [11]:
# Remote CSV summaries consumed by the analysis cells below:
# `df` backs the ddG cell and `df_dg` backs the dG cells.
DDG_SUMMARY_URL = "https://raw.githubusercontent.com/JohannesKarwou/notebooks/main/data/summary_ddG.csv"
DG_SUMMARY_URL = "https://raw.githubusercontent.com/JohannesKarwou/notebooks/main/data/summary_dG.csv"

df = pd.read_csv(DDG_SUMMARY_URL)
df_dg = pd.read_csv(DG_SUMMARY_URL)

In [12]:
def bootstrap_function(x_values, y_values, n_bootstrap=1000, alpha=10.0, seed=None):
  """Print MAE and RMSE between two paired samples with bootstrap percentile intervals.

  For each metric the pairs (x_i, y_i) are resampled with replacement
  `n_bootstrap` times, the metric is recomputed on every resample, and the
  `alpha / 2` and `100 - alpha / 2` percentiles of those values are reported
  as the interval. The value printed in front of the interval is the metric
  evaluated once on the full, un-resampled data (not the mean of the
  bootstrap distribution).

  Parameters
  ----------
  x_values, y_values : array-like
      Paired observations of equal, non-zero length.
  n_bootstrap : int, optional
      Number of bootstrap resamples per metric (default 1000).
  alpha : float, optional
      Total tail mass in percent left outside the interval; the default 10.0
      yields a 90% interval.
  seed : int or None, optional
      If given, resampling uses `np.random.default_rng(seed)` and is
      reproducible. If None (default), indices are drawn from NumPy's global
      legacy random state, exactly as before this parameter existed.

  Returns
  -------
  None
      Results are printed, one line per metric.

  Raises
  ------
  ValueError
      If the inputs differ in length or are empty, if `n_bootstrap` < 1, or
      if `alpha` is not strictly between 0 and 100.
  """
  x_arr = np.asarray(x_values)
  y_arr = np.asarray(y_values)
  if len(x_arr) != len(y_arr):
    raise ValueError(
        f'x_values and y_values must have the same length, got {len(x_arr)} and {len(y_arr)}'
    )
  if len(x_arr) == 0:
    raise ValueError('x_values and y_values must not be empty')
  if n_bootstrap < 1:
    raise ValueError('n_bootstrap must be at least 1')
  if not 0.0 < alpha < 100.0:
    raise ValueError('alpha is a percentage and must lie strictly between 0 and 100')

  # Both the `np.random` module and a Generator expose `choice`, so the
  # sampling call below is the same for the seeded and the unseeded case.
  sampler = np.random if seed is None else np.random.default_rng(seed)

  def bootstrap_metric(fct, x_data, y_data):
    """Return (full-sample metric, lower percentile, upper percentile) for `fct`."""
    if not callable(fct):
      raise TypeError('fct must be callable as fct(x, y)')

    n_samples = len(x_data)
    bootstrapped_metric = []
    for _ in range(n_bootstrap):
      # One index vector is applied to both arrays so that pairs stay intact.
      indices = sampler.choice(n_samples, size=n_samples, replace=True)
      bootstrapped_metric.append(fct(x_data[indices], y_data[indices]))

    # np.percentile interpolates between order statistics when the requested
    # percentile does not coincide with one of the bootstrapped values.
    lower = np.percentile(bootstrapped_metric, alpha / 2.0)
    upper = np.percentile(bootstrapped_metric, 100.0 - alpha / 2.0)

    # Point estimate: the metric on the original data.
    estimate = fct(x_data, y_data)
    return estimate, lower, upper

  def calc_rmse(x_data, y_data):
    """Root of the mean squared error between the two samples."""
    return np.sqrt(mean_squared_error(x_data, y_data))

  # bootstrap MAE
  mean, lower, upper = bootstrap_metric(mean_absolute_error, x_arr, y_arr)
  print(f'MAE:  {mean:.2f} [{lower:.2f}, {upper:.2f}]')

  # bootstrap RMSE
  mean, lower, upper = bootstrap_metric(calc_rmse, x_arr, y_arr)
  print(f'RMSE:  {mean:.2f} [{lower:.2f}, {upper:.2f}]')



In [39]:
def calc_scipy(x,y):
  """Print the Pearson and Spearman correlation coefficients of two paired samples.

  Parameters
  ----------
  x, y : array-like
      Paired observations of equal length.

  Returns
  -------
  None
      Both coefficients are printed, rounded to two decimals.
  """
  # Imported explicitly: a bare `import scipy` does not guarantee that the
  # `scipy.stats` submodule has been loaded.
  from scipy import stats

  # The coefficients are computed from the arguments, not from whatever
  # `x_values` / `y_values` happen to exist in the notebook namespace.
  pearson_r, _ = stats.pearsonr(x, y)
  spearman_r, _ = stats.spearmanr(x, y)
  print(f' Pearson correlation {round(pearson_r,2)}')
  print(f' Spearmans {round(spearman_r,2)}')


In [45]:
### Here the calculated dG values are processed! ###
############ FOR TRANSFORMATO RESULTS ##############

print('#### Results for TRANSFORMATO #####')

# (heading, first row, one-past-last row) of df_dg for every report section.
# The first entry spans all systems; the others are the individual systems.
tf_dg_sections = [
  ('for all dG values', 0, 67),
  ('dG values for Galectin', 21, 29),
  ('dG values for CDK2', 39, 52),
  ('dG values for 2RA0', 29, 39),
  ('dG values for TYK2', 52, 67),
  ('dG values for JNK1', 0, 21),
]

for section_idx, (heading, row_start, row_stop) in enumerate(tf_dg_sections):
  # A separator line precedes every section except the first one.
  if section_idx > 0:
    print('#################')
  # x_values / y_values stay notebook-level names, as before: other cells
  # and helpers may read them from the notebook namespace.
  x_values = np.asarray_chkfinite(df_dg["literature"][row_start:row_stop])
  y_values = np.asarray_chkfinite(df_dg['TF'][row_start:row_stop])
  print(heading)
  bootstrap_function(x_values, y_values)
  calc_scipy(x_values, y_values)

#### Results for TRANSFORMATO #####
for all dG values
MAE:  0.85 [0.69, 1.01]
RMSE:  1.17 [0.97, 1.36]
 Pearson correlation 0.73
 Spearmans 0.7
#################
dG values for Galectin
MAE:  0.69 [0.35, 1.04]
RMSE:  0.90 [0.48, 1.25]
 Pearson correlation 0.59
 Spearmans 0.54
#################
dG values for CDK2
MAE:  0.80 [0.45, 1.18]
RMSE:  1.12 [0.72, 1.45]
 Pearson correlation 0.61
 Spearmans 0.59
#################
dG values for 2RA)
MAE:  0.84 [0.58, 1.11]
RMSE:  0.98 [0.72, 1.25]
 Pearson correlation 0.78
 Spearmans 0.65
#################
dG values for TYK2
MAE:  1.37 [0.91, 1.85]
RMSE:  1.74 [1.23, 2.12]
 Pearson correlation 0.42
 Spearmans 0.21
#################
dG values for JNK1
MAE:  0.57 [0.38, 0.79]
RMSE:  0.81 [0.54, 1.05]
 Pearson correlation 0.6
 Spearmans 0.63
#### Results for PMX #####
#################
dG values for Galectin
MAE:  0.43 [0.28, 0.56]
RMSE:  0.50 [0.37, 0.62]
 Pearson correlation 0.9
 Spearmans 0.78
#################
dG values for CDK2
MAE:  0.89 [0.55, 

In [46]:
############ FOR PMX RESULTS ##############

print('#### Results for PMX #####')

# (heading, first row, one-past-last row) of df_dg for each individual system.
pmx_dg_sections = [
  ('dG values for Galectin', 21, 29),
  ('dG values for CDK2', 39, 52),
  ('dG values for TYK2', 52, 67),
  ('dG values for JNK1', 0, 21),
]

for heading, row_start, row_stop in pmx_dg_sections:
  print('#################')
  # x_values / y_values stay notebook-level names, as before: other cells
  # and helpers may read them from the notebook namespace.
  x_values = np.asarray_chkfinite(df_dg["literature"][row_start:row_stop])
  y_values = np.asarray_chkfinite(df_dg['pmx'][row_start:row_stop])
  print(heading)
  bootstrap_function(x_values, y_values)
  calc_scipy(x_values, y_values)

#### Results for PMX #####
#################
dG values for Galectin
MAE:  0.43 [0.27, 0.58]
RMSE:  0.50 [0.36, 0.61]
 Pearson correlation 0.9
 Spearmans 0.78
#################
dG values for CDK2
MAE:  0.89 [0.59, 1.22]
RMSE:  1.14 [0.81, 1.41]
 Pearson correlation 0.41
 Spearmans 0.63
#################
dG values for TYK2
MAE:  1.61 [1.22, 2.04]
RMSE:  1.87 [1.46, 2.24]
 Pearson correlation 0.53
 Spearmans 0.46
#################
dG values for JNK1
MAE:  0.57 [0.38, 0.78]
RMSE:  0.81 [0.53, 1.05]
 Pearson correlation 0.66
 Spearmans 0.77


In [54]:
############ FOR FEP+ RESULTS ##############

print('#### Results for Schroedinger FEP+ #####')

# (heading, first row, one-past-last row) of df_dg for each individual system.
fep_dg_sections = [
  ('dG values for CDK2', 39, 52),
  ('dG values for TYK2', 52, 67),
  ('dG values for JNK1', 0, 21),
]

for heading, row_start, row_stop in fep_dg_sections:
  print('#################')
  # x_values / y_values stay notebook-level names, as before: other cells
  # and helpers may read them from the notebook namespace.
  x_values = np.asarray_chkfinite(df_dg["literature"][row_start:row_stop])
  y_values = np.asarray_chkfinite(df_dg['schroedinger'][row_start:row_stop])
  print(heading)
  bootstrap_function(x_values, y_values)
  calc_scipy(x_values, y_values)

#### Results for Schroedinger FEP+ #####
#################
dG values for CDK2
MAE:  0.82 [0.61, 1.05]
RMSE:  0.95 [0.76, 1.13]
 Pearson correlation 0.52
 Spearmans 0.58
#################
dG values for TYK2
MAE:  0.46 [0.31, 0.62]
RMSE:  0.58 [0.38, 0.76]
 Pearson correlation 0.88
 Spearmans 0.85
#################
dG values for JNK1
MAE:  1.06 [0.90, 1.21]
RMSE:  1.14 [0.99, 1.30]
 Pearson correlation 0.85
 Spearmans 0.9


In [53]:
############ FOR AMBER TI RESULTS ##############

print('#### Results for AMBER TI #####')

# (heading, first row, one-past-last row) of df_dg for each individual system.
amber_dg_sections = [
  ('dG values for CDK2', 39, 52),
  ('dG values for 2RA0', 29, 39),
]

for heading, row_start, row_stop in amber_dg_sections:
  print('#################')
  # x_values / y_values stay notebook-level names, as before: other cells
  # and helpers may read them from the notebook namespace.
  x_values = np.asarray_chkfinite(df_dg["literature"][row_start:row_stop])
  y_values = np.asarray_chkfinite(df_dg['AMBER TI'][row_start:row_stop])
  print(heading)
  bootstrap_function(x_values, y_values)
  calc_scipy(x_values, y_values)



#### Results for AMBER TI #####
#################
dG values for CDK2
MAE:  0.72 [0.53, 0.92]
RMSE:  0.84 [0.63, 1.02]
 Pearson correlation 0.74
 Spearmans 0.79
#################
dG values for 2RA)
MAE:  0.66 [0.34, 1.04]
RMSE:  0.96 [0.42, 1.41]
 Pearson correlation 0.83
 Spearmans 0.81


In [13]:
# (heading, first row, one-past-last row) of df for every report section.
# The first entry spans all systems; the others are the individual systems.
tf_ddg_sections = [
  ('for all ddG values', 0, 75),
  ('ddG values for Galectin', 0, 7),
  ('ddG values for CDK2', 7, 20),
  ('ddG values for 2RA0', 20, 31),
  ('ddG values for TYK2', 31, 46),
  ('ddG values for JNK1', 46, 75),
]

for heading, row_start, row_stop in tf_ddg_sections:
  # x_values / y_values stay notebook-level names, as before: other cells
  # and helpers may read them from the notebook namespace.
  x_values = np.asarray_chkfinite(df["literature"][row_start:row_stop])
  y_values = np.asarray_chkfinite(df['TF'][row_start:row_stop])

  print(heading)
  bootstrap_function(x_values, y_values)
  # Prints the full spearmanr result object (coefficient and p-value).
  print(f' Spearmans {scipy.stats.spearmanr(x_values,y_values)}')


for all ddG values
MAE:  0.87 [0.72, 1.02]
RMSE:  1.18 [0.97, 1.37]
 Spearmans SpearmanrResult(correlation=0.4832560401447462, pvalue=1.1252322032098382e-05)
ddG values for Galectin
MAE:  0.49 [0.31, 0.68]
RMSE:  0.58 [0.39, 0.73]
 Spearmans SpearmanrResult(correlation=0.5714285714285715, pvalue=0.1802019889115274)
ddG values for CDK2
MAE:  0.80 [0.47, 1.17]
RMSE:  1.12 [0.68, 1.44]
 Spearmans SpearmanrResult(correlation=0.5934065934065934, pvalue=0.03252444027009699)
ddG values for 2RA)
MAE:  1.02 [0.70, 1.38]
RMSE:  1.24 [0.75, 1.70]
 Spearmans SpearmanrResult(correlation=0.7090909090909092, pvalue=0.014552051953727704)
ddG values for TYK2
MAE:  1.37 [0.90, 1.83]
RMSE:  1.74 [1.25, 2.11]
 Spearmans SpearmanrResult(correlation=0.21071428571428572, pvalue=0.450957865215285)
ddG values for JNK1
MAE:  0.68 [0.50, 0.86]
RMSE:  0.91 [0.64, 1.17]
 Spearmans SpearmanrResult(correlation=0.32105459874215997, pvalue=0.08947537386606988)
