<a href="https://colab.research.google.com/github/JohannesKarwou/notebooks/blob/main/freeSolvSummary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install sketch

In [16]:
import sketch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [10]:
# Load the overview CSV from the notebook repository's raw GitHub URL into a
# DataFrame; every later cell reads its columns from `data`.
data = pd.read_csv("https://raw.githubusercontent.com/JohannesKarwou/notebooks/main/data/finalOverview.csv")

In [11]:
def plotting_grey_area(i):
    """Draw the dashed y = x line with shaded +/-1 and +/-2 bands on ``axs[i]``.

    ``axs`` is not a parameter: it is looked up as a global when the function
    runs, so a collection of axes with that name must already exist.

    For each band half-width (1, then 2) the identity line, the upper edge,
    the lower edge and the filled region between the edges are drawn, in that
    order. The identity line is therefore drawn once per band, and the two
    translucent fills overlap inside the +/-1 region.
    """
    panel = axs[i]
    grid = np.linspace(-30, 20, 5)
    dashed_grey = dict(color='C7', linestyle='dashed')

    for half_width in (1, 2):
        upper_edge = grid + half_width
        lower_edge = grid - half_width
        # identity line, then the two band edges, then the shaded band itself
        panel.plot(grid, grid, alpha=0.9, **dashed_grey)
        panel.plot(grid, upper_edge, alpha=0.3, **dashed_grey)
        panel.plot(grid, lower_edge, alpha=0.3, **dashed_grey)
        panel.fill_between(grid, upper_edge, lower_edge, alpha=0.3, color='Grey')

In [61]:
def bootstrap_function(x_values, y_values, n_bootstrap=1000, alpha=10.0, seed=None):
  """Print RMSE and MAE of paired data with bootstrap percentile intervals.

  Each metric is evaluated once on the full paired sample (the value printed
  first) and ``n_bootstrap`` times on paired resamples drawn with replacement.
  The ``alpha / 2`` and ``100 - alpha / 2`` percentiles of the resampled
  values are printed in brackets as the interval bounds.

  Parameters
  ----------
  x_values, y_values : array-like
      Paired observations; must have the same, non-zero length.
  n_bootstrap : int, default 1000
      Number of bootstrap resamples.
  alpha : float, default 10.0
      Percentage of the bootstrap distribution left outside the interval
      (10.0 gives a 90 % interval).
  seed : int or None, default None
      Seed for a private random state, making the printed interval
      reproducible. ``None`` keeps drawing from NumPy's global random state.

  Returns
  -------
  None
      The results are printed, not returned.

  Raises
  ------
  ValueError
      If the inputs differ in length or are empty, or if ``n_bootstrap`` or
      ``alpha`` is out of range.
  """
  # Plain arrays give positional indexing regardless of the input container.
  x_arr = np.asarray(x_values)
  y_arr = np.asarray(y_values)
  n_obs = len(x_arr)
  if n_obs != len(y_arr):
      raise ValueError("x_values and y_values must have the same length")
  if n_obs == 0:
      raise ValueError("x_values and y_values must not be empty")
  if n_bootstrap < 1:
      raise ValueError("n_bootstrap must be at least 1")
  if not 0 < alpha < 100:
      raise ValueError("alpha must lie strictly between 0 and 100")

  # Without a seed, fall back to the global state so existing callers that
  # seed via np.random.seed keep working as before.
  rng = np.random if seed is None else np.random.RandomState(seed)
  tail = alpha / 2.0

  def bootstrap_metric(fct):
      """Return (full-sample value, lower bound, upper bound) for ``fct``."""
      if not callable(fct):
          raise TypeError("fct must be callable")
      resampled = []
      for _ in range(n_bootstrap):
          # the same indices are used for x and y so the pairs stay intact
          picks = rng.choice(n_obs, size=n_obs, replace=True)
          resampled.append(fct(np.take(x_arr, picks), np.take(y_arr, picks)))
      lower, upper = np.percentile(resampled, [tail, 100.0 - tail])
      # point estimate on the full, un-resampled data
      return fct(x_arr, y_arr), lower, upper

  def calc_rmse(a, b):
      return np.sqrt(mean_squared_error(a, b))

  # bootstrap RMSE
  estimate, lower, upper = bootstrap_metric(calc_rmse)
  print(f'RMSE:  {round(estimate, 2):.2f} [{round(lower,2):.2f}, {round(upper,2):.2f}]')

  # bootstrap MAE
  estimate, lower, upper = bootstrap_metric(mean_absolute_error)
  print(f'MAE:  {round(estimate, 2):.2f} [{round(lower,2):.2f}, {round(upper,2):.2f}]')

In [62]:
def bootstrap_function_spearman(x_values, y_values, n_bootstrap=1000, alpha=10.0, seed=None):
  """Print Spearman's correlation of paired data with a bootstrap percentile interval.

  The coefficient is computed once on the full paired sample (the value
  printed first) and ``n_bootstrap`` times on paired resamples drawn with
  replacement. The ``alpha / 2`` and ``100 - alpha / 2`` percentiles of the
  resampled coefficients are printed in brackets as the interval bounds.

  Parameters
  ----------
  x_values, y_values : array-like
      Paired observations; must have the same, non-zero length.
  n_bootstrap : int, default 1000
      Number of bootstrap resamples.
  alpha : float, default 10.0
      Percentage of the bootstrap distribution left outside the interval
      (10.0 gives a 90 % interval).
  seed : int or None, default None
      Seed for a private random state, making the printed interval
      reproducible. ``None`` keeps drawing from NumPy's global random state.

  Returns
  -------
  None
      The result is printed, not returned.

  Raises
  ------
  ValueError
      If the inputs differ in length or are empty, or if ``n_bootstrap`` or
      ``alpha`` is out of range.
  """
  # Import the submodule explicitly: a bare `import scipy` is not guaranteed
  # to expose `scipy.stats` unless something else has already loaded it.
  from scipy import stats

  # Plain arrays give positional indexing regardless of the input container.
  x_arr = np.asarray(x_values)
  y_arr = np.asarray(y_values)
  n_obs = len(x_arr)
  if n_obs != len(y_arr):
      raise ValueError("x_values and y_values must have the same length")
  if n_obs == 0:
      raise ValueError("x_values and y_values must not be empty")
  if n_bootstrap < 1:
      raise ValueError("n_bootstrap must be at least 1")
  if not 0 < alpha < 100:
      raise ValueError("alpha must lie strictly between 0 and 100")

  # Without a seed, fall back to the global state so existing callers that
  # seed via np.random.seed keep working as before.
  rng = np.random if seed is None else np.random.RandomState(seed)

  resampled = []
  for _ in range(n_bootstrap):
      # the same indices are used for x and y so the pairs stay intact
      picks = rng.choice(n_obs, size=n_obs, replace=True)
      resampled.append(stats.spearmanr(np.take(x_arr, picks), np.take(y_arr, picks))[0])

  tail = alpha / 2.0
  lower, upper = np.percentile(resampled, [tail, 100.0 - tail])
  # point estimate on the full, un-resampled data
  spear = stats.spearmanr(x_arr, y_arr)[0]
  print(f"Spearman's correlation:  {round(spear, 2):.2f} [{round(lower,2):.2f}, {round(upper,2):.2f}]")

def bootstrap_function_pearson(x_values, y_values, n_bootstrap=1000, alpha=10.0, seed=None):
  """Print the Pearson correlation of paired data with a bootstrap percentile interval.

  The coefficient is computed once on the full paired sample (the value
  printed first) and ``n_bootstrap`` times on paired resamples drawn with
  replacement. The ``alpha / 2`` and ``100 - alpha / 2`` percentiles of the
  resampled coefficients are printed in brackets as the interval bounds.

  Parameters
  ----------
  x_values, y_values : array-like
      Paired observations; must have the same, non-zero length.
  n_bootstrap : int, default 1000
      Number of bootstrap resamples.
  alpha : float, default 10.0
      Percentage of the bootstrap distribution left outside the interval
      (10.0 gives a 90 % interval).
  seed : int or None, default None
      Seed for a private random state, making the printed interval
      reproducible. ``None`` keeps drawing from NumPy's global random state.

  Returns
  -------
  None
      The result is printed, not returned.

  Raises
  ------
  ValueError
      If the inputs differ in length or are empty, or if ``n_bootstrap`` or
      ``alpha`` is out of range.
  """
  # Import the submodule explicitly: a bare `import scipy` is not guaranteed
  # to expose `scipy.stats` unless something else has already loaded it.
  from scipy import stats

  # Plain arrays give positional indexing regardless of the input container.
  x_arr = np.asarray(x_values)
  y_arr = np.asarray(y_values)
  n_obs = len(x_arr)
  if n_obs != len(y_arr):
      raise ValueError("x_values and y_values must have the same length")
  if n_obs == 0:
      raise ValueError("x_values and y_values must not be empty")
  if n_bootstrap < 1:
      raise ValueError("n_bootstrap must be at least 1")
  if not 0 < alpha < 100:
      raise ValueError("alpha must lie strictly between 0 and 100")

  # Without a seed, fall back to the global state so existing callers that
  # seed via np.random.seed keep working as before.
  rng = np.random if seed is None else np.random.RandomState(seed)

  resampled = []
  for _ in range(n_bootstrap):
      # the same indices are used for x and y so the pairs stay intact
      picks = rng.choice(n_obs, size=n_obs, replace=True)
      resampled.append(stats.pearsonr(np.take(x_arr, picks), np.take(y_arr, picks))[0])

  tail = alpha / 2.0
  lower, upper = np.percentile(resampled, [tail, 100.0 - tail])
  # point estimate on the full, un-resampled data
  pearson = stats.pearsonr(x_arr, y_arr)[0]
  print(f"Pearson correlation:  {round(pearson, 2):.2f} [{round(lower,2):.2f}, {round(upper,2):.2f}]")

In [66]:
# Drop the rows whose 'dG (TF)' entry is NaN. `data` is rebound here, so every
# later cell works on the filtered frame.
data = data[~np.isnan(data['dG (TF)'])]
# Print RMSE and MAE between the experimental values and 'dG (TF)', each with
# its bootstrap interval. Note the experimental column key starts with a space.
bootstrap_function(data[" experimental value (kcal/mol)"],data["dG (TF)"])

RMSE:  1.88 [1.71, 2.09]
MAE:  1.31 [1.22, 1.40]


In [42]:
# Spearman rank correlation (with p-value) between the experimental values and
# 'dG (TF)' on the full filtered frame; shown as the cell's output.
scipy.stats.spearmanr(data[" experimental value (kcal/mol)"],data["dG (TF)"])

SpearmanrResult(correlation=0.9107855188075532, pvalue=5.510583506275039e-240)

In [63]:
# Print the Spearman correlation together with its bootstrap interval.
bootstrap_function_spearman(data[" experimental value (kcal/mol)"],data["dG (TF)"])

Spearman's correlation:  0.91 [0.89, 0.93]


In [58]:
# Pearson correlation coefficient (element [0] of the result) between the
# experimental values and 'dG (TF)', rounded to two decimals for display.
round(scipy.stats.pearsonr(data[" experimental value (kcal/mol)"],data["dG (TF)"])[0],2)

0.9

In [64]:
# Print the Pearson correlation together with its bootstrap interval.
bootstrap_function_pearson(data[" experimental value (kcal/mol)"],data["dG (TF)"])

Pearson correlation:  0.90 [0.88, 0.92]
