<a href="https://colab.research.google.com/github/JohannesKarwou/notebooks/blob/main/Bootstrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [22]:
# Both result tables are read directly from the notebook's GitHub repository:
# `df` holds summary_ddG.csv and `df_dg` holds summary_dG.csv.
DDG_SUMMARY_URL = "https://raw.githubusercontent.com/JohannesKarwou/notebooks/main/data/summary_ddG.csv"
DG_SUMMARY_URL = "https://raw.githubusercontent.com/JohannesKarwou/notebooks/main/data/summary_dG.csv"

df, df_dg = (pd.read_csv(url) for url in (DDG_SUMMARY_URL, DG_SUMMARY_URL))

In [23]:
def _mean_absolute_residual(residuals):
    """Return the mean absolute error of ``residuals`` along the last axis."""
    return np.mean(np.abs(residuals), axis=-1)


def _root_mean_squared_residual(residuals):
    """Return the root-mean-square error of ``residuals`` along the last axis."""
    return np.sqrt(np.mean(np.square(residuals), axis=-1))


def _bootstrap_interval(metric, residuals, n_bootstrap, alpha, rng):
    """Return ``(estimate, lower, upper)`` for ``metric`` using a percentile bootstrap."""
    n_points = residuals.shape[0]
    # Each row of `draws` is one resample (with replacement) of the data points.
    draws = rng.integers(0, n_points, size=(n_bootstrap, n_points))
    replicates = metric(residuals[draws])
    # Two-sided interval: alpha/2 percent is cut from each tail of the distribution.
    lower, upper = np.percentile(replicates, [alpha / 2.0, 100.0 - alpha / 2.0])
    # The reported value is the metric on the full, un-resampled data.
    return metric(residuals), lower, upper


def bootstrap_function(x_values, y_values, n_bootstrap=1000, alpha=10.0, seed=0):
    """Print MAE and RMSE between two paired samples with bootstrap confidence intervals.

    Both metrics depend only on the element-wise difference ``x - y``, so
    resampling the differences is the same as resampling the (x, y) pairs
    jointly.

    Parameters
    ----------
    x_values, y_values : array-like
        Paired numeric values of equal length; they are flattened to 1-D.
    n_bootstrap : int, default 1000
        Number of bootstrap resamples drawn per metric.
    alpha : float, default 10.0
        Percentage left outside the interval; ``10.0`` gives a 90% interval
        bounded by the 5th and 95th percentiles.
    seed : int, numpy.random.Generator or None, default 0
        Seed for the resampling, so repeated calls on the same data print the
        same interval. Pass ``None`` for a fresh, non-reproducible stream.

    Returns
    -------
    None
        The results are printed, one line per metric, as
        ``<name>:  <value> [<lower>, <upper>]``.

    Raises
    ------
    ValueError
        If the inputs differ in length, are empty, contain NaN or infinity,
        or if ``n_bootstrap`` or ``alpha`` is out of range.
    """
    x = np.asarray(x_values, dtype=float).ravel()
    y = np.asarray(y_values, dtype=float).ravel()
    if x.size != y.size:
        raise ValueError(
            f"x_values and y_values must have the same length, got {x.size} and {y.size}"
        )
    if x.size == 0:
        raise ValueError("x_values and y_values must not be empty")
    if not (np.isfinite(x).all() and np.isfinite(y).all()):
        raise ValueError("x_values and y_values must not contain NaN or infinity")
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be at least 1")
    if not 0.0 < alpha < 100.0:
        raise ValueError("alpha must lie strictly between 0 and 100")

    residuals = x - y
    rng = np.random.default_rng(seed)
    metrics = (
        ("MAE", _mean_absolute_residual),
        ("RMSE", _root_mean_squared_residual),
    )
    for label, metric in metrics:
        # Each metric gets its own set of resamples from the shared generator.
        estimate, lower, upper = _bootstrap_interval(
            metric, residuals, n_bootstrap, alpha, rng
        )
        print(f'{label}:  {round(estimate, 2):.2f} [{round(lower,2):.2f}, {round(upper,2):.2f}]')



In [24]:
# Positional row ranges of the ddG table `df`: the pooled set first, then one
# contiguous block of rows per target system.
ddg_subsets = [
    ('for all ddG values', slice(0, 75)),
    ('ddG values for Galectin', slice(0, 7)),
    ('ddG values for CDK2', slice(7, 20)),
    ('ddG values for 2RA0', slice(20, 31)),
    ('ddG values for TYK2', slice(31, 46)),
    ('ddG values for JNK1', slice(46, 75)),
]
# NOTE(review): the TYK2 rows of df['pmx'] used to be loaded here as well but
# were never evaluated; add a comparison against 'pmx' if one was intended.

for title, rows in ddg_subsets:
    # asarray_chkfinite raises ValueError if a subset contains NaN or infinity.
    x_values = np.asarray_chkfinite(df["literature"].iloc[rows])
    y_values = np.asarray_chkfinite(df['TF'].iloc[rows])

    print(title)
    bootstrap_function(x_values, y_values)
    print(f' Spearmans {scipy.stats.spearmanr(x_values,y_values)}')


for all ddG values
MAE:  0.87 [0.72, 1.02]
RMSE:  1.18 [0.98, 1.37]
 Spearmans SpearmanrResult(correlation=0.4832560401447462, pvalue=1.1252322032098382e-05)
ddG values for Galectin
MAE:  0.49 [0.32, 0.67]
RMSE:  0.58 [0.37, 0.74]
 Spearmans SpearmanrResult(correlation=0.5714285714285715, pvalue=0.1802019889115274)
ddG values for CDK2
MAE:  0.80 [0.44, 1.15]
RMSE:  1.12 [0.68, 1.47]
 Spearmans SpearmanrResult(correlation=0.5934065934065934, pvalue=0.03252444027009699)
ddG values for 2RA)
MAE:  1.02 [0.71, 1.37]
RMSE:  1.24 [0.75, 1.70]
 Spearmans SpearmanrResult(correlation=0.7090909090909092, pvalue=0.014552051953727704)
ddG values for TYK2
MAE:  1.37 [0.94, 1.82]
RMSE:  1.74 [1.25, 2.15]
 Spearmans SpearmanrResult(correlation=0.21071428571428572, pvalue=0.450957865215285)
ddG values for JNK1
MAE:  0.68 [0.51, 0.87]
RMSE:  0.91 [0.64, 1.15]
 Spearmans SpearmanrResult(correlation=0.32105459874215997, pvalue=0.08947537386606988)


In [30]:
# Pooled statistics over the first 66 rows of the dG table `df_dg`.
x_values = np.asarray_chkfinite(df_dg["literature"].iloc[0:66])
y_values = np.asarray_chkfinite(df_dg['TF'].iloc[0:66])

print('for all dG values')
bootstrap_function(x_values, y_values)
print(f' Spearmans {scipy.stats.spearmanr(x_values,y_values)}')

# NOTE(review): the per-system subsets below are read from the ddG table `df`
# (not `df_dg`) with the ddG row ranges, so they report ddG statistics, as
# their labels say. If per-system dG statistics were intended, switch to
# `df_dg` and supply its row ranges - TODO confirm.
# NOTE(review): the TYK2 rows of df['pmx'] used to be loaded here as well but
# were never evaluated; add a comparison against 'pmx' if one was intended.
ddg_system_rows = [
    ('ddG values for Galectin', slice(0, 7)),
    ('ddG values for CDK2', slice(7, 20)),
    ('ddG values for 2RA0', slice(20, 31)),
    ('ddG values for TYK2', slice(31, 46)),
    ('ddG values for JNK1', slice(46, 75)),
]

for title, rows in ddg_system_rows:
    # asarray_chkfinite raises ValueError if a subset contains NaN or infinity.
    x_values = np.asarray_chkfinite(df["literature"].iloc[rows])
    y_values = np.asarray_chkfinite(df['TF'].iloc[rows])

    print(title)
    bootstrap_function(x_values, y_values)
    print(f' Spearmans {scipy.stats.spearmanr(x_values,y_values)}')


for all ddG values
MAE:  0.82 [0.68, 0.98]
RMSE:  1.12 [0.92, 1.30]
 Spearmans SpearmanrResult(correlation=0.7223039938346053, pvalue=7.626648363919074e-12)
ddG values for Galectin
MAE:  0.49 [0.31, 0.68]
RMSE:  0.58 [0.37, 0.74]
 Spearmans SpearmanrResult(correlation=0.5714285714285715, pvalue=0.1802019889115274)
ddG values for CDK2
MAE:  0.80 [0.46, 1.16]
RMSE:  1.12 [0.75, 1.46]
 Spearmans SpearmanrResult(correlation=0.5934065934065934, pvalue=0.03252444027009699)
ddG values for 2RA)
MAE:  1.02 [0.68, 1.37]
RMSE:  1.24 [0.76, 1.71]
 Spearmans SpearmanrResult(correlation=0.7090909090909092, pvalue=0.014552051953727704)
ddG values for TYK2
MAE:  1.37 [0.94, 1.86]
RMSE:  1.74 [1.28, 2.15]
 Spearmans SpearmanrResult(correlation=0.21071428571428572, pvalue=0.450957865215285)
ddG values for JNK1
MAE:  0.68 [0.50, 0.87]
RMSE:  0.91 [0.63, 1.17]
 Spearmans SpearmanrResult(correlation=0.32105459874215997, pvalue=0.08947537386606988)


In [29]:
# Display the "literature" column for the first 66 rows (by position) of the dG table.
df_dg["literature"].iloc[:66]

0     -8.70
1     -8.49
2     -9.74
3     -8.70
4     -9.14
      ...  
61    -8.98
62   -10.53
63    -9.21
64   -11.70
65   -10.98
Name: literature, Length: 66, dtype: float64