In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import gc
_FOLDER = "data/"
_FOLDER_2 = "figures/"
_FOLDER_3 = "results/"

In [2]:
from fitting import *

In [3]:
from numba import njit

@njit
def CatData(arr):
    bins = np.empty(arr.shape[0])
    for idx, x in enumerate(arr):
        if (x >= 0) & (x < 0.5):
            bins[idx] = 1
        elif (x >= 0.5) & (x < 0.7):
            bins[idx] = 2
        elif (x >= 0.7) & (x < 0.9):
            bins[idx] = 3
        elif (x >= 0.9) & (x < 0.95):
            bins[idx] = 4
        elif (x >= 0.95) & (x < 0.99):
            bins[idx] = 5
        elif x >= 0.99:
            bins[idx] = 6
        else:
            bins[idx] = 0

    return bins

def BinningFunction(df, column, new_column, categorical=True):
    df[new_column] = CatData(df[column].to_numpy())
    
    if categorical:
        map_dict = {
            1: "0-0.5",
            2: "0.5-0.7",
            3: "0.7-0.9",
            4: "0.9-0.95",
            5: "0.95-0.99",
            6: ">0.99",
            0: "<0"
        }
        df[new_column]= df[new_column].map(map_dict)
    

In [4]:
os.listdir("results/")

['fit_1234.csv',
 '.DS_Store',
 'filt_1234.csv',
 'fit_auc_02.csv',
 'fit_auc_04.csv',
 'fit_auc_06.csv',
 'filt_auc.csv',
 'filt_auc_06.csv',
 'filt_auc_04.csv',
 'filt_123.csv',
 'filt_auc_02.csv',
 '.ipynb_checkpoints',
 'fit_auc.csv',
 'fit_123.csv']

In [5]:
functions = [
             "fsigmoid", 
             "sigmoid_2_param",
             "sigmoid_3_param",
             "sigmoid_4_param",
             "logistic_4_param",
            "ll4_4_param",
            "ll4R_4_param",
            "logLogist_3_param"]

#r2_columns = [fitting_function+"_r2" for fitting_function in functions]

filtration_types = ['123', '1234', 'auc_02', 'auc_04','auc_06', "auc"]


In [6]:
df_099 = pd.DataFrame(index=filtration_types, columns = functions)
df_09 = pd.DataFrame(index=filtration_types, columns = functions)

In [7]:
%%time


for filtering in filtration_types:
    print("Filtering:", filtering)
    
    df = pd.read_csv(_FOLDER_3+"filt_"+filtering+".csv")
    print("Data left after filtering: %d" % df.shape[0])
    del df
    
    df_1 = pd.read_csv(_FOLDER_3+"fit_"+filtering+".csv")
    print("Left after fitting:", df_1.shape[0])
    
    df_0 = pd.DataFrame()
    df_0["index"] = ['>0.99', '0.95-0.99', '0.9-0.95', '0.7-0.9', '0.5-0.7', '0-0.5', "<0", "sum_>0.9"]

    for func in functions:
        BinningFunction(df_1, func +"_r2", func) 
        df = df_1[func].value_counts().reset_index()
        df.loc[7, "index"] = "sum_>0.9" 
        df.loc[7, func]= df.iloc[:3][func].sum()
        df[func] = df[func].astype("int32")
#         display(df)
        df_0 = pd.merge(df_0, df, on="index", how="left")
    display(df_0)
    df_099.loc[filtering, :]= df_0.loc[0, functions]
    df_09.loc[filtering, :]= df_0.loc[7, functions]
    print("********************************\n")

Filtering: 123
Data left after filtering: 2776
Left after fitting: 2710


Unnamed: 0,index,fsigmoid,sigmoid_2_param,sigmoid_3_param,sigmoid_4_param,logistic_4_param,ll4_4_param,ll4R_4_param,logLogist_3_param
0,>0.99,916.0,916.0,1247.0,1868.0,1774.0,1774.0,1774.0,1753.0
1,0.95-0.99,1122.0,1122.0,1195.0,722.0,846.0,846.0,846.0,840.0
2,0.9-0.95,392.0,392.0,212.0,70.0,70.0,70.0,70.0,89.0
3,0.7-0.9,264.0,264.0,52.0,15.0,17.0,17.0,17.0,24.0
4,0.5-0.7,15.0,15.0,4.0,4.0,3.0,3.0,3.0,4.0
5,0-0.5,1.0,1.0,,31.0,,,,
6,<0,,,,,,,,
7,sum_>0.9,2430.0,2430.0,2654.0,2660.0,2690.0,2690.0,2690.0,2682.0


********************************

Filtering: 1234
Data left after filtering: 2600
Left after fitting: 2552


Unnamed: 0,index,fsigmoid,sigmoid_2_param,sigmoid_3_param,sigmoid_4_param,logistic_4_param,ll4_4_param,ll4R_4_param,logLogist_3_param
0,>0.99,916.0,916.0,1247.0,1866.0,1771.0,1771.0,1771.0,1752.0
1,0.95-0.99,1084.0,1084.0,1137.0,636.0,757.0,757.0,757.0,764.0
2,0.9-0.95,334.0,334.0,150.0,21.0,24.0,24.0,24.0,33.0
3,0.7-0.9,210.0,210.0,18.0,,,,,3.0
4,0.5-0.7,7.0,7.0,,,,,,
5,0-0.5,1.0,1.0,,29.0,,,,
6,<0,,,,,,,,
7,sum_>0.9,2334.0,2334.0,2534.0,2531.0,2552.0,2552.0,2552.0,2549.0


********************************

Filtering: auc_02
Data left after filtering: 12169
Left after fitting: 7574


Unnamed: 0,index,fsigmoid,sigmoid_2_param,sigmoid_3_param,sigmoid_4_param,logistic_4_param,ll4_4_param,ll4R_4_param,logLogist_3_param
0,>0.99,1216.0,1208.0,2052.0,2895.0,2884.0,2884.0,2884.0,1876.0
1,0.95-0.99,4252.0,4226.0,4532.0,3868.0,3985.0,3985.0,3985.0,4085.0
2,0.9-0.95,1409.0,1423.0,824.0,512.0,602.0,602.0,602.0,1152.0
3,0.7-0.9,656.0,669.0,164.0,74.0,102.0,102.0,102.0,423.0
4,0.5-0.7,37.0,41.0,2.0,,1.0,1.0,1.0,30.0
5,0-0.5,4.0,7.0,,225.0,,,,8.0
6,<0,,,,,,,,
7,sum_>0.9,6877.0,6857.0,7408.0,7275.0,7471.0,7471.0,7471.0,7113.0


********************************

Filtering: auc_04
Data left after filtering: 24164
Left after fitting: 22225


Unnamed: 0,index,fsigmoid,sigmoid_2_param,sigmoid_3_param,sigmoid_4_param,logistic_4_param,ll4_4_param,ll4R_4_param,logLogist_3_param
0,>0.99,1729,1692,3109,5209.0,5274.0,5276.0,5255,4032
1,0.95-0.99,9226,9187,11411,10710.0,11576.0,11575.0,11568,11566
2,0.9-0.95,5816,5834,5130,3282.0,3656.0,3655.0,3666,4171
3,0.7-0.9,4857,4890,2493,1457.0,1656.0,1657.0,1659,2199
4,0.5-0.7,493,499,61,46.0,43.0,42.0,42,205
5,0-0.5,102,111,16,1521.0,20.0,20.0,31,51
6,<0,2,12,5,,,,4,1
7,sum_>0.9,19899,19911,19650,19201.0,20506.0,20506.0,20489,19769


********************************

Filtering: auc_06
Data left after filtering: 41613
Left after fitting: 24142


Unnamed: 0,index,fsigmoid,sigmoid_2_param,sigmoid_3_param,sigmoid_4_param,logistic_4_param,ll4_4_param,ll4R_4_param,logLogist_3_param
0,>0.99,1495,1487,2709.0,5184.0,5306.0,5306.0,5306.0,3301
1,0.95-0.99,8148,8121,10904.0,11711.0,12453.0,12453.0,12453.0,11266
2,0.9-0.95,6198,6209,6092.0,3776.0,4116.0,4116.0,4117.0,5086
3,0.7-0.9,7272,7284,4278.0,1986.0,2182.0,2182.0,2180.0,3724
4,0.5-0.7,851,851,148.0,68.0,71.0,71.0,72.0,470
5,0-0.5,167,171,11.0,1417.0,14.0,14.0,14.0,276
6,<0,11,19,,,,,,19
7,sum_>0.9,21618,21614,21274.0,20671.0,21875.0,21875.0,21876.0,20076


********************************

Filtering: auc
Data left after filtering: 122642
Left after fitting: 98703


Unnamed: 0,index,fsigmoid,sigmoid_2_param,sigmoid_3_param,sigmoid_4_param,logistic_4_param,ll4_4_param,ll4R_4_param,logLogist_3_param
0,>0.99,1848,1811,3475,6501.0,6646,6644,6617,5106
1,0.95-0.99,12726,12684,18215,20204.0,22026,22016,21953,21186
2,0.9-0.95,12392,12404,14883,12366.0,13873,13807,13786,13388
3,0.7-0.9,26458,26471,25666,20148.0,24005,23744,23813,21859
4,0.5-0.7,13163,13140,11654,9317.0,12219,11782,12159,10540
5,0-0.5,23080,23779,24257,30167.0,19839,20420,19962,18377
6,<0,9036,8414,553,,95,290,413,8247
7,sum_>0.9,62701,63390,68138,70519.0,65870,66180,65728,61422


********************************

CPU times: user 6.05 s, sys: 821 ms, total: 6.88 s
Wall time: 8.09 s


## Fitting results

In [8]:
#Number of drug curves with fitting R2>0.99
df_099

Unnamed: 0,fsigmoid,sigmoid_2_param,sigmoid_3_param,sigmoid_4_param,logistic_4_param,ll4_4_param,ll4R_4_param,logLogist_3_param
123,916,916,1247,1868,1774,1774,1774,1753
1234,916,916,1247,1866,1771,1771,1771,1752
auc_02,1216,1208,2052,2895,2884,2884,2884,1876
auc_04,1729,1692,3109,5209,5274,5276,5255,4032
auc_06,1495,1487,2709,5184,5306,5306,5306,3301
auc,1848,1811,3475,6501,6646,6644,6617,5106


In [9]:
#Number of drug curves with fitting R2>0.9
df_09

Unnamed: 0,fsigmoid,sigmoid_2_param,sigmoid_3_param,sigmoid_4_param,logistic_4_param,ll4_4_param,ll4R_4_param,logLogist_3_param
123,2430,2430,2654,2660,2690,2690,2690,2682
1234,2334,2334,2534,2531,2552,2552,2552,2549
auc_02,6877,6857,7408,7275,7471,7471,7471,7113
auc_04,19899,19911,19650,19201,20506,20506,20489,19769
auc_06,21618,21614,21274,20671,21875,21875,21876,20076
auc,62701,63390,68138,70519,65870,66180,65728,61422


In [10]:
max_099 = int(max(df_099.max(axis=0)))
print("Fitting>0.99:", max_099, "samples")

for col in functions:
    if max_099 in df_099[col].values:
        print("Filtering:", df_099.index[np.argwhere(df_099[col].values == max_099)][0][0])
        print("Fitting function:", col)
print("")

max_09 = int(max(df_09.max(axis=0)))
print("Fitting>0.90:", max_09, "samples")

for col in functions:
    if max_09 in df_09[col].values:
        print("Filtering:", df_09.index[np.argwhere(df_09[col].values == max_09)][0][0])
        print("Fitting function:", col)

Fitting>0.99: 6646 samples
Filtering: auc
Fitting function: logistic_4_param

Fitting>0.90: 70519 samples
Filtering: auc
Fitting function: sigmoid_4_param
