## Exploring data quality

In [1]:
import pandas as pd
import numpy as np
import os
import gc
import matplotlib.pyplot as plt

# Directory prefix for the input CSV files read in the data-loading cell.
_FOLDER = "data/"
# NOTE(review): _FOLDER_2 and _FOLDER_3 are not referenced by the visible cells
# (the figure cells hard-code "figures/") - confirm whether they are still needed.
_FOLDER_2 = "figures/"
_FOLDER_3 = "results/"
# NOTE(review): this flag is not consulted by the visible figure-saving cells.
SAVE_FIGURES = False

from functions.filtering import *
from functions.fitting import *
from functions.plotting import *

In [2]:
# Thresholds handed to the group-selection helpers in the cells below.
Y_LIMIT_1 = 0.8 # for group 1a (select_group_1a); also passed as y_limit to select_group_1b
R2_LIMIT = 0.9 # for S-shape fitting; passed as r2_limit to select_group_1b and select_group_2b
Y_LOWER_LIMIT  = 0.5 #for group 2a (select_group_2a); also passed as y_lower_limit to select_group_2b

### Original data

In [3]:
# Read the normalised dose-response table.
drug_curves = pd.read_csv(f"{_FOLDER}normalised_dose_response_data.csv")

# Remove the "Unnamed: 0" column when the CSV carries one.
if "Unnamed: 0" in drug_curves:
    drug_curves = drug_curves.drop(columns="Unnamed: 0")

# Slope-related columns are removed before the analysis.
col_to_drop = [f"per_slope_change_{i}" for i in range(8)]
col_to_drop += [f"slope_{i}" for i in range(9)]
drug_curves = drug_curves.drop(columns=col_to_drop)

# Names of the ten concentration columns and the ten normalised-response columns.
conc_columns = [f"fd_num_{i}" for i in range(10)]
response_norm = [f"norm_cells_{i}" for i in range(10)]

# Lookup tables: COSMIC_ID -> cell line name, and Drug ID -> Drug Name.
CCL_names = dict(zip(drug_curves["COSMIC_ID"], drug_curves["CELL_LINE_NAME"]))
drug_features = pd.read_csv(f"{_FOLDER}Drug_Features.csv")
drug_names = dict(zip(drug_features["Drug ID"].values, drug_features["Drug Name"].values))
del drug_features

# Attach the readable names to every row.
drug_curves = drug_curves.assign(
    drug_name=drug_curves["DRUG_ID"].map(drug_names),
    CCL_name=drug_curves["COSMIC_ID"].map(CCL_names),
)
drug_curves.shape

(225384, 28)

## Group 1: Responses above 1

In [4]:
# Available functions in the filtering script: content_filtering() returns a
# mapping whose keys are printed one per line.
print("Available functions in filtering module: \n")
for func in content_filtering().keys():
    print(func)

Available functions in filtering module: 

find_high_responses
cut_off_outliers
find_ascending_data
filtering_sigmoid_curves
auc_fitration
filter_good_response
select_group_limits
select_group_1
select_group_1a
select_group_1b
select_group_2
select_group_2a
select_group_2b


In [5]:
# group 1 - all responses above 1
# The selection itself is done by select_group_1 on the normalised response columns.
gr_1 = select_group_1(drug_curves, response_norm)
gr_1.shape

(162059, 28)

In [6]:
# Group 1a: rows selected from group 1 by select_group_1a with threshold Y_LIMIT_1.
gr_1a = select_group_1a(gr_1, response_norm, Y_LIMIT_1)
gr_1a.shape

(74115, 28)

In [None]:
%%time
fit_functions = ["sigmoid_4_param","logistic_4_param"]

gr_1b = select_group_1b(gr_1.loc[list(set(gr_1.index)-set(gr_1a.index))], 
                        fit_functions, conc_columns, response_norm, 
                        y_limit=Y_LIMIT_1, r2_limit=R2_LIMIT,
                       )
gr_1b.shape 

  0%|          | 14/87944 [00:00<11:19, 129.32it/s]


 sigmoid_4_param


100%|██████████| 87944/87944 [09:33<00:00, 153.35it/s] 


<function sigmoid_4_param at 0x7fbffe2201e0>


  0%|          | 0/76794 [00:00<?, ?it/s]

Reduced number of samples: 11150

 logistic_4_param


 91%|█████████ | 70009/76794 [07:18<00:52, 129.34it/s]

In [None]:
# Inspect the columns of the frame returned by select_group_1b.
gr_1b.columns

In [None]:
# Group 1c: group-1 rows that belong to neither group 1a nor group 1b.
# The index mask preserves gr_1's row order (a set difference would not).
gr_1c = gr_1.loc[~(gr_1.index.isin(gr_1a.index) | gr_1.index.isin(gr_1b.index))]
gr_1c.shape

In [None]:
# Sanity check: groups 1a, 1b and 1c together must account for every row of group 1.
assert gr_1.shape[0]==gr_1a.shape[0]+gr_1b.shape[0]+gr_1c.shape[0]

In [None]:
# Group 2: rows chosen by select_group_2 from the full table.
gr_2 = select_group_2(drug_curves, response_norm)
gr_2.shape

In [None]:
# Group 2a is selected from group 2 (mirroring how group 1a is selected from gr_1),
# so that gr_2a is guaranteed to be a subset of gr_2 - the later set difference and
# the partition assert rely on that.
gr_2a = select_group_2a(gr_2, response_norm, Y_LOWER_LIMIT)
gr_2a.shape

In [None]:
%%time
fit_functions = ["sigmoid_4_param","logistic_4_param"]

gr_2b = select_group_2b(gr_2.loc[list(set(gr_2.index)-set(gr_2a.index))],
                        fit_functions, conc_columns, response_norm,
                        y_lower_limit =Y_LOWER_LIMIT, r2_limit= R2_LIMIT
                       )
gr_2b.shape 

In [None]:
# Group 2c: group-2 rows that belong to neither group 2a nor group 2b.
# The index mask preserves gr_2's row order (a set difference would not).
gr_2c = gr_2.loc[~(gr_2.index.isin(gr_2a.index) | gr_2.index.isin(gr_2b.index))]
gr_2c.shape

In [None]:
# Sanity check: groups 2a, 2b and 2c together must account for every row of group 2.
assert gr_2.shape[0]==gr_2a.shape[0]+gr_2b.shape[0]+gr_2c.shape[0]

### Save figures for the paper

In [None]:
# group_1a
# Plot one example curve and save it under figures/.
# NOTE(review): this cell plots from gr_1, whereas the other figure cells pass their own
# sub-group frame (gr_1b, gr_1c, ...) - confirm whether gr_1a was intended here.
group = "1a"
drug_id = 205
ccl_name = "ES6"
one_fig_no_fitting(gr_1, drug_id=drug_id, ccl_name=ccl_name, size=4, dpi=500,
                x_columns = conc_columns, y_columns = response_norm, 
                upper_limit=1, lower_limit=None, 
                save_fig_name=f"figures/gr_{group}_{drug_id}_{ccl_name}.png"
               )

In [None]:
# group_1b
# Plot one example curve and save it under figures/.
# Calls one_fig_no_fitting (the helper the group 1a cell already uses with the same
# keywords) so this cell no longer depends on OneFigNoFitting, which is only defined
# in a later cell and is therefore missing on a fresh Restart & Run All.
group = "1b"
drug_id = 245
ccl_name = "HDQ-P1"
one_fig_no_fitting(gr_1b, drug_id=drug_id, ccl_name=ccl_name, size=4, dpi=500,
                   x_columns = conc_columns, y_columns = response_norm,
                   upper_limit=1, lower_limit=None,
                   save_fig_name=f"figures/gr_{group}_{drug_id}_{ccl_name}.png"
                  )

In [None]:
# group_1c
# Plot one example curve and save it under figures/.
# Calls one_fig_no_fitting (the helper the group 1a cell already uses with the same
# keywords) so this cell no longer depends on OneFigNoFitting, which is only defined
# in a later cell and is therefore missing on a fresh Restart & Run All.
group = "1c"
drug_id = 56
ccl_name = "RKO"
one_fig_no_fitting(gr_1c, drug_id=drug_id, ccl_name=ccl_name, size=4, dpi=500,
                   x_columns = conc_columns, y_columns = response_norm,
                   upper_limit=1, lower_limit=None,
                   save_fig_name=f"figures/gr_{group}_{drug_id}_{ccl_name}.png"
                  )

In [None]:
# group_2a
# Plot one example curve and save it under figures/.
# Calls one_fig_no_fitting (the helper the group 1a cell already uses with the same
# keywords) so this cell no longer depends on OneFigNoFitting, which is only defined
# in a later cell and is therefore missing on a fresh Restart & Run All.
group = "2a"
drug_id = 223
ccl_name = "HDQ-P1"
one_fig_no_fitting(gr_2a, drug_id=drug_id, ccl_name=ccl_name, size=4, dpi=500,
                   x_columns = conc_columns, y_columns = response_norm,
                   upper_limit=1, lower_limit=None,
                   save_fig_name=f"figures/gr_{group}_{drug_id}_{ccl_name}.png"
                  )

In [None]:
# group_2b
# Plot one example curve and save it under figures/.
# Calls one_fig_no_fitting (the helper the group 1a cell already uses with the same
# keywords) so this cell no longer depends on OneFigNoFitting, which is only defined
# in a later cell and is therefore missing on a fresh Restart & Run All.
group = "2b"
drug_id = 252
ccl_name = "SK-MEL-30"
one_fig_no_fitting(gr_2b, drug_id=drug_id, ccl_name=ccl_name, size=4, dpi=500,
                   x_columns = conc_columns, y_columns = response_norm,
                   upper_limit=1, lower_limit=None,
                   save_fig_name=f"figures/gr_{group}_{drug_id}_{ccl_name}.png"
                  )

In [None]:
# Peek at a few group-2c rows; the fixed random_state makes the displayed rows
# reproducible across runs.
gr_2c.sample(5, random_state=0)

In [None]:
# group_2c
# Plot one example curve and save it under figures/.
# Calls one_fig_no_fitting (the helper the group 1a cell already uses with the same
# keywords) so this cell no longer depends on OneFigNoFitting, which is only defined
# in a later cell and is therefore missing on a fresh Restart & Run All.
group = "2c"
drug_id = 1242
ccl_name = "CAL-29"
one_fig_no_fitting(gr_2c, drug_id=drug_id, ccl_name=ccl_name, size=4, dpi=500,
                   x_columns = conc_columns, y_columns = response_norm,
                   upper_limit=1, lower_limit=None,
                   save_fig_name=f"figures/gr_{group}_{drug_id}_{ccl_name}.png"
                  )

In [None]:
def OneFigNoFitting(df, drug_id, ccl_name, 
                    x_columns, y_columns, size=8,dpi=300,
                    upper_limit=None, lower_limit=None, save_fig_name=None):
    """Scatter-plot the raw dose-response points of one drug / cell-line pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with "DRUG_ID", "CELL_LINE_NAME" and "drug_name" columns plus
        the columns named in ``x_columns`` and ``y_columns``.
    drug_id, ccl_name
        Values matched against "DRUG_ID" and "CELL_LINE_NAME" to pick the row.
    x_columns, y_columns : list of str
        Columns holding the x and y values; both lists must have equal length.
    size : number
        Side length of the square figure (passed as ``figsize``).
    dpi : int
        Resolution, used only when the figure is saved.
    upper_limit, lower_limit : number or None
        When not None, a dashed horizontal line is drawn at that y value
        (red for the upper limit, black for the lower one).
    save_fig_name : str or None
        When truthy, the figure is written to this path before it is shown.

    Raises
    ------
    ValueError
        If no row of ``df`` matches the requested drug / cell-line pair.

    Notes
    -----
    If several rows match, only the first one is plotted.
    """
    matches = df[(df["DRUG_ID"] == drug_id) & (df["CELL_LINE_NAME"] == ccl_name)]
    if matches.empty:
        # Fail with a readable message instead of an IndexError from `.values[0]`.
        raise ValueError(f"No data for drug {drug_id} / cell line {ccl_name}")

    drug_name = matches["drug_name"].iloc[0]
    print(f"Drug: {drug_name} ({drug_id}) / CCL: {ccl_name}")

    # Take x and y from the same (first) matching row so their lengths always agree,
    # even when the pair occurs more than once in `df`.
    x = matches[x_columns].to_numpy()[0]
    y = matches[y_columns].to_numpy()[0]

    # Leave 0.1 of headroom above the highest point, but never show less than [0, 1.1].
    peak = y.max()
    y_top = peak + 0.1 if peak > 1 else 1.1

    fig, ax = plt.subplots(figsize=(size, size))
    ax.set_ylim([0, y_top])
    ax.scatter(x, y)

    # `is not None` so that a limit of 0 is still drawn.
    if upper_limit is not None:
        ax.axhline(upper_limit, color='red', ls='--')
    if lower_limit is not None:
        ax.axhline(lower_limit, color='black', ls='--')

    ax.tick_params(labelsize=14)
    ax.set_xlabel("Scaled dosage", fontsize=14)
    ax.set_ylabel("Normalised Response", fontsize=14)

    if save_fig_name:
        fig.savefig(save_fig_name, bbox_inches='tight', dpi=dpi)
    plt.show()
        

## Ambiguous data: Are some points wrong?

In [None]:
# Hand-picked pairs to inspect; presumably (drug name, cell line name) - verify against ShowSpecific.
# NOTE(review): ShowSpecific is not defined in the visible cells; it is expected to come
# from one of the star imports at the top of the notebook.
specific_samples = [("YK-4-279", "HSC-2"),
                    ("YK-4-279", "PCI-38"),
                   ("QL-VIII-58", "SKM-1"),
                    ("(5Z)-7-Oxozeaenol", "HT-1080")
                   ]
ShowSpecific(drug_curves, specific_samples, conc_columns, response_norm)

# Part 3: Explore  curves with normalised responses above 1.0

In [None]:
%%time
# Number of responses > 1
drug_curves["high_responses"] = drug_curves[response_norm].apply(lambda row: sum(row>1), axis=1)
drug_curves[["high_responses"]+ response_norm]

In [None]:
# Rows with more than one normalised response above 1 (high_responses > 1).
bad_data = drug_curves[drug_curves["high_responses"]>1]
print("Original data:", drug_curves.shape)
print("Ambiguos data:", bad_data.shape)

### Note: Half of the data can be regarded as ambiguous!!!

In [None]:
# How many flagged rows have 2, 3, ... responses above 1.
bad_data["high_responses"].value_counts()

## Bad data by CCL

## Check whether bad data are actually bad

In [None]:
# Plot the first nine flagged rows (3 plots per row and per column requested).
# NOTE(review): ShowResponseCurves is not defined in the visible cells; it is expected
# to come from one of the star imports at the top of the notebook.
df = bad_data
ShowResponseCurves(df, plots_in_row=3, plots_in_column=3, \
                   x_columns=conc_columns, y_columns=response_norm, indexes=df.index[:9],
                  drug_dict = drug_names, CCL_dict = CCL_names, upper_limit=1)

## Samples with 2 bad responses

In [None]:
# Rows with exactly N responses above 1; nine of them (positions 20-28) are plotted.
N = 2
df = bad_data[bad_data["high_responses"]==N]
print("Number of samples with %d bad responses: %d" % (N, df.shape[0]))

ShowResponseCurves(df, plots_in_row=3, plots_in_column=3, \
                   x_columns=conc_columns, y_columns=response_norm, indexes=df.index[20:29],
                  drug_dict = drug_names, CCL_dict = CCL_names)

### Among samples with only 2 norm_responses >1 some data are not so bad

In [None]:
# Hand-picked pairs from the flagged rows, drawn with reference lines at 1 and 0.2.
# Presumably (drug name, cell line name) pairs - verify against ShowSpecific.
specific_samples = [("Etoposide", "HDQ-P1"),
                    ("SNX-2112", "NMC-G1"),
                    ("Shikonin", "JHH-2"),
                    ("SNX-2112", "BT-474")
                   ]
ShowSpecific(bad_data, specific_samples, conc_columns, response_norm, upper_limit=1, lower_limit=0.2)

In [None]:
# Rows with exactly N responses above 1 whose last two responses
# (norm_cells_8 and norm_cells_9) are both below 0.2; the first nine are plotted.
N = 2
df = bad_data[(bad_data["high_responses"]==N) & (bad_data["norm_cells_9"]<0.2) & (bad_data["norm_cells_8"]<0.2)]
print("Number of samples with %d bad responses, but with 2 last responses below 0.2: %d" % (N, df.shape[0]))

ShowResponseCurves(df, plots_in_row=3, plots_in_column=3, \
                   x_columns=conc_columns, y_columns=response_norm, indexes=df.index[:9],
                  drug_dict = drug_names, CCL_dict = CCL_names, upper_limit=1)

### Conclusion - we can't simply delete the samples that have 2 responses > 1.0
### The question: What is the accuracy of measuring responses? <br>Can we treat samples with responses up to 1.01 as valid ones?

## Samples with 3 bad responses

In [None]:
# Rows with exactly N responses above 1; nine of them (positions 30-38) are plotted.
N = 3
df = bad_data[bad_data["high_responses"]==N]
print("Number of samples with %d bad responses: %d" % (N, df.shape[0]))

ShowResponseCurves(df, plots_in_row=3, plots_in_column=3, \
                   x_columns=conc_columns, y_columns=response_norm, indexes=df.index[30:39],
                  drug_dict = drug_names, CCL_dict = CCL_names)

In [None]:
# Hand-picked pairs from the flagged rows, drawn with reference lines at 1 and 0.2.
# Presumably (drug name, cell line name) pairs - verify against ShowSpecific.
specific_samples = [("THZ-2-49", "HDQ-P1"),
                    ("Vinorelbine", "NMC-G1"),
                    ("Dacinostat", "JHH-2"),
                    ("Dacinostat", "LU-65")
                   ]
ShowSpecific(bad_data, specific_samples, conc_columns, response_norm, upper_limit=1, lower_limit=0.2)

In [None]:
# Rows with exactly N responses above 1 whose last two responses
# (norm_cells_8 and norm_cells_9) are both below 0.2; the first nine are plotted.
N = 3
df = bad_data[(bad_data["high_responses"]==N) & (bad_data["norm_cells_9"]<0.2) & (bad_data["norm_cells_8"]<0.2)]
print("Number of samples with %d bad responses, but with 2 last responses below 0.2: %d" % (N, df.shape[0]))

ShowResponseCurves(df, plots_in_row=3, plots_in_column=3, \
                   x_columns=conc_columns, y_columns=response_norm, indexes=df.index[:9],
                  drug_dict = drug_names, CCL_dict = CCL_names, upper_limit=1)

## Samples with 4 bad responses 

In [None]:
# Rows with exactly N responses above 1; nine of them (positions 20-28) are plotted.
N = 4
df = bad_data[bad_data["high_responses"]==N]
print("Number of samples with %d bad responses: %d" % (N, df.shape[0]))

ShowResponseCurves(df, plots_in_row=3, plots_in_column=3, \
                   x_columns=conc_columns, y_columns=response_norm, indexes=df.index[20:29],
                  drug_dict = drug_names, CCL_dict = CCL_names)

In [None]:
# Hand-picked pairs from the flagged rows, drawn with reference lines at 1 and 0.2.
# Presumably (drug name, cell line name) pairs - verify against ShowSpecific.
specific_samples = [("THZ-2-49", "HDQ-P1"),
                    ("Cabozantinib", "NMC-G1"),
                    ("Dacinostat", "JHH-2"),
                    ("JNK-9L", "BT-474")
                   ]
ShowSpecific(bad_data, specific_samples, conc_columns, response_norm, upper_limit=1, lower_limit=0.2)

In [None]:
# Rows with exactly N responses above 1 whose last two responses
# (norm_cells_8 and norm_cells_9) are both below 0.2; the first nine are plotted.
N = 4
df = bad_data[(bad_data["high_responses"]==N) & (bad_data["norm_cells_9"]<0.2) & (bad_data["norm_cells_8"]<0.2)]
print("Number of samples with %d bad responses, but with 2 last responses below 0.2: %d" % (N, df.shape[0]))

ShowResponseCurves(df, plots_in_row=3, plots_in_column=3, \
                   x_columns=conc_columns, y_columns=response_norm, indexes=df.index[:9],
                  drug_dict = drug_names, CCL_dict = CCL_names, upper_limit=1)

## Samples with 5+ bad responses

In [None]:
# Rows with N or more responses above 1 (the filter is >=, so the message says
# "at least"); nine of them (positions 20-28) are plotted.
N = 5
df = bad_data[bad_data["high_responses"]>=N]
print("Number of samples with at least %d bad responses: %d" % (N, df.shape[0]))

ShowResponseCurves(df, plots_in_row=3, plots_in_column=3, \
                   x_columns=conc_columns, y_columns=response_norm, indexes=df.index[20:29],
                  drug_dict = drug_names, CCL_dict = CCL_names)

In [None]:
# Rows with exactly N responses above 1 whose last two responses
# (norm_cells_8 and norm_cells_9) are both below 0.2; the first nine are plotted.
N = 5
df = bad_data[(bad_data["high_responses"]==N) & (bad_data["norm_cells_9"]<0.2) & (bad_data["norm_cells_8"]<0.2)]
print("Number of samples with %d bad responses, but with 2 last responses below 0.2: %d" % (N, df.shape[0]))

ShowResponseCurves(df, plots_in_row=3, plots_in_column=3, \
                   x_columns=conc_columns, y_columns=response_norm, indexes=df.index[:9],
                  drug_dict = drug_names, CCL_dict = CCL_names, upper_limit=1)

In [None]:
# Rows with exactly N responses above 1 whose last two responses
# (norm_cells_8 and norm_cells_9) are both below 0.2; the first nine are plotted.
N = 6
df = bad_data[(bad_data["high_responses"]==N) & (bad_data["norm_cells_9"]<0.2) & (bad_data["norm_cells_8"]<0.2)]
print("Number of samples with %d bad responses, but with 2 last responses below 0.2: %d" % (N, df.shape[0]))

ShowResponseCurves(df, plots_in_row=3, plots_in_column=3, \
                   x_columns=conc_columns, y_columns=response_norm, indexes=df.index[:9],
                  drug_dict = drug_names, CCL_dict = CCL_names, upper_limit=1)

In [None]:
# Rows with exactly N responses above 1 whose last two responses
# (norm_cells_8 and norm_cells_9) are both below 0.2; the first nine are plotted.
N = 7
df = bad_data[(bad_data["high_responses"]==N) & (bad_data["norm_cells_9"]<0.2) & (bad_data["norm_cells_8"]<0.2)]
print("Number of samples with %d bad responses, but with 2 last responses below 0.2: %d" % (N, df.shape[0]))

ShowResponseCurves(df, plots_in_row=3, plots_in_column=3, \
                   x_columns=conc_columns, y_columns=response_norm, indexes=df.index[:9],
                  drug_dict = drug_names, CCL_dict = CCL_names, upper_limit=1)

In [None]:
# Rows with exactly N responses above 1 whose last two responses
# (norm_cells_8 and norm_cells_9) are both below 0.2; the first nine are plotted.
N = 8
df = bad_data[(bad_data["high_responses"]==N) & (bad_data["norm_cells_9"]<0.2) & (bad_data["norm_cells_8"]<0.2)]
print("Number of samples with %d bad responses, but with 2 last responses below 0.2: %d" % (N, df.shape[0]))

ShowResponseCurves(df, plots_in_row=3, plots_in_column=3, \
                   x_columns=conc_columns, y_columns=response_norm, indexes=df.index[:9],
                  drug_dict = drug_names, CCL_dict = CCL_names, upper_limit=1)

In [None]:
# Rows with exactly N responses above 1 whose last two responses
# (norm_cells_8 and norm_cells_9) are both below 0.2; the first nine are plotted.
# NOTE(review): with ten response columns, nine values above 1 and the last two below 0.2
# cannot hold together, so this selection is presumably always empty - confirm.
N = 9
df = bad_data[(bad_data["high_responses"]==N) & (bad_data["norm_cells_9"]<0.2) & (bad_data["norm_cells_8"]<0.2)]
print("Number of samples with %d bad responses, but with 2 last responses below 0.2: %d" % (N, df.shape[0]))

ShowResponseCurves(df, plots_in_row=3, plots_in_column=3, \
                   x_columns=conc_columns, y_columns=response_norm, indexes=df.index[:9],
                  drug_dict = drug_names, CCL_dict = CCL_names, upper_limit=1)

### Conclusion from Part 3: Curves with up to 7 suspiciously high normalised responses look pretty reasonable

## Part 4: Explore curves with normalised response above 1 but low final response

In [None]:
%%time
drug_curves["low_response_02"] = drug_curves[response_norm].apply(lambda row: sum(row<=0.2), axis=1)
drug_curves["low_response_04"] = drug_curves[response_norm].apply(lambda row: sum(row<=0.4), axis=1)

In [None]:
# Flagged rows (more than one response above 1) that also have at least one low response.
# The counts are compared with 0 explicitly: `bool_series & int_series` is a bitwise AND
# with the count, which keeps only rows whose count is odd (e.g. a count of 2 is dropped).
not_bad_02 = drug_curves[(drug_curves["high_responses"]>1) & (drug_curves["low_response_02"]>0)]
not_bad_04 = drug_curves[(drug_curves["high_responses"]>1) & (drug_curves["low_response_04"]>0)]
print("Number of all suspicious samples:", bad_data.shape[0])
print("\nNumber of potentionally good samples among all bad data:")
print("With responses below 0.2:", not_bad_02.shape[0])
print("With responses below 0.4:", not_bad_04.shape[0])

In [None]:
# Distribution of the number of responses <= 0.2 among the not_bad_02 rows.
not_bad_02["low_response_02"].value_counts()

In [None]:
# Distribution of the number of responses <= 0.4 among the not_bad_04 rows.
not_bad_04["low_response_04"].value_counts()

In [None]:
# not_bad_02 rows with exactly seven responses at or below 0.2; the first nine are plotted.
df = not_bad_02[not_bad_02["low_response_02"]==7]

ShowResponseCurves(df, plots_in_row=3, plots_in_column=3, \
                   x_columns=conc_columns, y_columns=response_norm, indexes=df.index[:9],
                  drug_dict = drug_names, CCL_dict = CCL_names, upper_limit=1)

# Part 5: Repeat with an additional constraint
## Among all "middle" datapoints, a subsequent point should not be higher than its predecessor by more than some limit

In [None]:
# The filtering module listing printed near the top of the notebook exposes
# cut_off_outliers; the CamelCase CutOffOutliers is not in that listing, so the old
# name is undefined on a fresh kernel.
# NOTE(review): keyword names are kept from the old call - confirm they match the
# signature of cut_off_outliers.
not_bad_02_2 = cut_off_outliers(drug_curves, middle_points_limit=-0.2, response_columns = response_norm)

# NOTE(review): "Before" reports not_bad_04 although the filter is applied to
# drug_curves - confirm the intended baseline.
print("Before filtration: %d, After filtration: %d" % (not_bad_04.shape[0], not_bad_02_2.shape[0]))
not_bad_02_2["low_response_02"].value_counts()

In [None]:
# Filtered rows with exactly one response at or below 0.2; the first nine are plotted.
df = not_bad_02_2[not_bad_02_2["low_response_02"]==1]
ShowResponseCurves(df, plots_in_row=3, plots_in_column=3, \
                   x_columns=conc_columns, y_columns=response_norm, indexes=df.index[:9],
                  drug_dict = drug_names, CCL_dict = CCL_names, upper_limit=1)