## Fitting drug response curves with sigmoid functions

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from scipy.optimize import curve_fit
from sklearn.metrics import r2_score

from tqdm import tqdm
# import warnings
# warnings.filterwarnings("ignore")

import os, sys

# Silence all warnings for this session, but only when the interpreter was
# started without explicit warning options (sys.warnoptions is empty).
if len(sys.warnoptions) == 0:
    import warnings
    warnings.simplefilter("ignore")
# Mirror the setting in the environment so that processes spawned later
# inherit it as well.
os.environ.update(PYTHONWARNINGS="ignore")

# Relative directories used by the notebook. Only _FOLDER is referenced in
# the cells visible here (input CSVs and saved fitting results).
_FOLDER, _FOLDER_2, _FOLDER_3 = "data/", "figures/", "results/"
from fitting import *
from filtering import *

## Fitting data

In [2]:
# Load the normalised GDSC2 dose-response table (tab-separated). A stray
# "Unnamed: 0" column (presumably a saved index) is dropped when present;
# errors="ignore" makes the drop a no-op otherwise, so no membership test or
# in-place mutation is needed.
drug_curves = pd.read_csv(
    _FOLDER + "normalised_dose_response_data_GDCS2_EC_conc.csv", sep="\t"
).drop(columns="Unnamed: 0", errors="ignore")

# Column names of the 8 concentration points and the 8 normalised responses.
conc_columns = ["fd_num_" + str(i) for i in range(8)]
response_norm = ["norm_cells_" + str(i) for i in range(8)]

# Lookup tables: COSMIC_ID -> cell line name, Drug ID -> drug name.
# With duplicate keys, the last occurrence wins (plain dict semantics).
CCL_names = dict(zip(drug_curves["COSMIC_ID"], drug_curves["CELL_LINE_NAME"]))

# A descriptive temporary is used instead of `df`, which a later cell reuses
# for the model-comparison result; it is deleted once the mapping is built.
drug_features = pd.read_csv(_FOLDER + "Drug_Features.csv")
drug_names = dict(
    zip(drug_features["Drug ID"].values, drug_features["Drug Name"].values)
)
del drug_features

# Attach human-readable names to every curve. Drug IDs missing from
# Drug_Features.csv map to NaN.
drug_curves["drug_name"] = drug_curves["DRUG_ID"].map(drug_names)
drug_curves["CCL_name"] = drug_curves["COSMIC_ID"].map(CCL_names)
drug_curves.shape

(212349, 33)

## Comparison of fitting models

In [3]:
# Names of the candidate curve models to compare. The recorded run output
# shows each name resolving to a function object of the same name, so these
# are expected to be provided by the star-imported project modules.
functions = [
    "fsigmoid",
    "sigmoid_2_param",
    "sigmoid_3_param",
    "sigmoid_4_param",
    "logistic_4_param",
    "ll4_4_param",
    "ll4R_4_param",
    "logLogist_3_param",
]

### What if we don't use any filtering?

In [4]:
%%time
df = CompareFittingFunctions(drug_curves, functions, conc_columns, response_norm, save_file_name = _FOLDER+"fit_no_filt_gdsc2.csv")

  0%|          | 0/212349 [00:00<?, ?it/s]

(212349, 33)

 fsigmoid


100%|██████████| 212349/212349 [07:36<00:00, 464.69it/s]


<function fsigmoid at 0x7f9045ac9598>

 sigmoid_2_param


100%|██████████| 212349/212349 [05:44<00:00, 615.62it/s]


<function sigmoid_2_param at 0x7f9045ac9510>

 sigmoid_3_param


100%|██████████| 212349/212349 [21:59<00:00, 160.93it/s]  


<function sigmoid_3_param at 0x7f9045ac9620>

 sigmoid_4_param


100%|██████████| 212349/212349 [22:46<00:00, 155.44it/s] 


<function sigmoid_4_param at 0x7f9045ac96a8>

 logistic_4_param


100%|██████████| 212349/212349 [31:26<00:00, 112.57it/s]  


<function logistic_4_param at 0x7f9045ac9840>

 ll4_4_param


100%|██████████| 212349/212349 [28:49<00:00, 122.81it/s] 


<function ll4_4_param at 0x7f9045ac9730>

 ll4R_4_param


100%|██████████| 212349/212349 [18:32<00:00, 190.82it/s]


<function ll4R_4_param at 0x7f9045ac97b8>

 logLogist_3_param


100%|██████████| 212349/212349 [25:01<00:00, 141.46it/s]


<function logLogist_3_param at 0x7f9045ac98c8>




Unnamed: 0,best_fitting_count,min,max,r2>0,r2>0.8,r2>0.9,r2>0.99
fsigmoid,242.0,-483.626782,0.999798,175791.0,69820.0,39968.0,39968.0
sigmoid_2_param,553.0,-6.800888,0.999798,168211.0,69741.0,39887.0,39887.0
sigmoid_3_param,12222.0,-1.017457,0.999905,210517.0,87327.0,56530.0,56530.0
sigmoid_4_param,29864.0,-0.101671,0.999933,186159.0,83364.0,62506.0,62506.0
logistic_4_param,34511.0,-0.031318,0.999969,198848.0,98245.0,70961.0,70961.0
ll4_4_param,35413.0,-0.046725,0.999979,198464.0,98074.0,70917.0,70917.0
ll4R_4_param,44580.0,-0.682404,0.999969,204257.0,97933.0,70729.0,70729.0
logLogist_3_param,54964.0,-5.742577,0.999987,185051.0,89565.0,63249.0,63249.0



Examples of bad fitting with sigmoid_4_param (r2<0.61): 108498


Unnamed: 0,COSMIC_ID,DRUG_ID,fsigmoid_r2,sigmoid_2_param_r2,sigmoid_3_param_r2,sigmoid_4_param_r2,logistic_4_param_r2,ll4_4_param_r2,ll4R_4_param_r2,logLogist_3_param_r2,better_fitting
5,924100,1005,0.845425,0.845425,0.551665,7.057422e-09,0.883194,0.886702,0.886702,0.856996,ll4_4_param
6,924100,1005,0.196062,0.196062,0.246722,0.2183123,0.367059,0.367062,0.367062,0.314096,ll4_4_param
7,924100,1005,0.805569,0.805569,0.809687,0.2881323,0.814873,0.812688,0.838101,0.768592,ll4R_4_param
8,924100,1006,0.258827,0.258826,0.006678,0.003444313,0.285134,0.285046,0.285412,0.152983,ll4R_4_param
9,924100,1007,0.717247,0.717247,0.717958,0.0,0.819205,0.819206,0.819205,0.807878,ll4_4_param


CPU times: user 2h 18min 38s, sys: 9min 24s, total: 2h 28min 3s
Wall time: 2h 45min 58s


In [15]:
# Reload the saved no-filter fitting results for both datasets
# (df1 -> GDSC1, df2 -> GDSC2). Only the GDSC2 file is written by the cells
# visible in this notebook; the GDSC1 file must already exist in _FOLDER.
df1, df2 = (
    pd.read_csv(_FOLDER + csv_name)
    for csv_name in ("fit_no_filt_gdsc1.csv", "fit_no_filt_gdsc2.csv")
)

In [17]:
# GDSC1: for each fitting function, count the curves whose R^2 reaches each
# threshold.
print("GDSC1:", df1.shape[0])

# Each "<function>_r2" column of df1 holds that function's R^2 per curve.
r2_columns = [func + "_r2" for func in functions]

# The threshold is parsed from the column label itself ("r2>0.9" -> 0.9).
# NOTE: the labels read ">" but the comparison is inclusive (">="), as in
# the original loop. NaN R^2 values compare False and are not counted.
# Counting is vectorised: one boolean frame per threshold, summed per column,
# which yields integer columns and leaves no loop variables behind.
stat = pd.DataFrame(
    {
        label: (df1[r2_columns] >= float(label.split(">")[-1])).sum().to_numpy()
        for label in ["r2>0.9", "r2>0.99"]
    },
    index=functions,
)
stat

GDSC1: 225384


Unnamed: 0,r2>0.9,r2>0.99
fsigmoid,53091,6638
sigmoid_2_param,53063,6612
sigmoid_3_param,70557,11402
sigmoid_4_param,53848,15166
logistic_4_param,73123,16752
ll4_4_param,73155,17082
ll4R_4_param,67847,16614
logLogist_3_param,72883,16215


In [19]:
# GDSC2: for each fitting function, count the curves whose R^2 reaches each
# threshold.
print("GDSC2:", df2.shape[0])

# Each "<function>_r2" column of df2 holds that function's R^2 per curve.
r2_columns = [func + "_r2" for func in functions]

# The threshold is parsed from the column label itself ("r2>0.9" -> 0.9).
# NOTE: the labels read ">" but the comparison is inclusive (">="), as in
# the original loop. NaN R^2 values compare False and are not counted.
# Counting is vectorised: one boolean frame per threshold, summed per column,
# which yields integer columns and leaves no loop variables behind.
stat = pd.DataFrame(
    {
        label: (df2[r2_columns] >= float(label.split(">")[-1])).sum().to_numpy()
        for label in ["r2>0.9", "r2>0.99"]
    },
    index=functions,
)
stat

GDSC2: 212349


Unnamed: 0,r2>0.9,r2>0.99
fsigmoid,39968,1475
sigmoid_2_param,39887,1465
sigmoid_3_param,56530,3908
sigmoid_4_param,62506,7857
logistic_4_param,70961,8856
ll4_4_param,70917,8867
ll4R_4_param,70729,8811
logLogist_3_param,63249,6942
