In [3]:
import pandas as pd
import numpy as np

In [4]:
import time
from sklearn.metrics import r2_score
from scipy.optimize import curve_fit
from scipy.optimize import leastsq
import scipy.optimize as opt
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import gc

from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import MinMaxScaler
import os
from data_preprocessing import FilteringCurves, ShowResponseCurves
from fitting_curves import FittingColumn, ShowResponseCurvesWithFitting, compute_r2_score

# from IPython.display import display
#_FOLDER = "results/"
_FOLDER = "/home/acq18mk/master/results/"

### Training and predictions for each drug separately

In [5]:
def PredictionsTunedKernelsByDrug(merged_df, drug_ids, number_coefficients, kernel, train_ratio=0.8,
                                  column_not_to_use=[], alpha=1, gamma=None, degree=3, coef0=1,
                                  features_to_scale=[], scaling=False, columns_to_use=[]):
    """Train and test a KernelRidge model per drug and per fitted coefficient.

    For every drug in ``drug_ids`` the rows of ``merged_df`` with that ``DRUG_ID``
    are shuffled with a fixed seed (123) and split into train/test parts. One
    ``KernelRidge`` model is fitted for each target ``param_1 .. param_<n>`` and
    its test-set predictions are stored in ``pred_param_<k>``.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Must contain ``DRUG_ID`` and the target columns ``param_<k>``.
        It is not modified; predictions are written to a copy.
    drug_ids : iterable
        Drug identifiers to model.
    number_coefficients : int
        Number of targets; targets are ``param_1`` .. ``param_<number_coefficients>``.
    kernel : str or callable
        Passed to ``KernelRidge(kernel=...)``.
    train_ratio : float
        Fraction of each drug's rows used for training.
    column_not_to_use : list
        Extra columns to exclude from the features.
    alpha, gamma, degree, coef0 : scalar or dict
        ``KernelRidge`` hyperparameters. A dict is looked up by the 1-based
        coefficient number, allowing one value per coefficient.
    features_to_scale : list
        Feature columns that are min-max scaled when ``scaling`` is True
        (the scaler is fitted on the training rows only).
    scaling : bool
        Enables scaling of ``features_to_scale``.
    columns_to_use : list
        Columns of the result frame to return.

    Returns
    -------
    pandas.DataFrame
        ``columns_to_use`` of a copy of ``merged_df`` extended with the
        ``pred_param_<k>`` columns (NaN for rows that were not in a test split).
    """
    # Targets, normalised responses and concentrations must never be features.
    excluded = set(
        ["param_" + str(i) for i in range(10)]
        + ["param" + str(i) for i in range(10)]
        + ["norm_cells_" + str(i) for i in range(10)]
        + ["fd_num_" + str(i) for i in range(10)]
        + list(column_not_to_use)
    )
    # An ordered list (not a set) keeps the feature order reproducible and is
    # accepted as an indexer by every pandas version.
    X_columns = [col for col in merged_df.columns if col not in excluded]
    print("Number of X_columns:", len(X_columns))

    scale_columns = list(features_to_scale) if scaling else []
    hyperparameters = {"alpha": alpha, "gamma": gamma, "degree": degree, "coef0": coef0}

    # Predictions go to a copy so that they reach the returned frame without
    # adding columns to the caller's data.
    results = merged_df.copy()

    for drug_id in drug_ids:
        merged_df_i = merged_df[merged_df["DRUG_ID"] == drug_id]

        # Local generator seeded with 123: same split as np.random.seed(123)
        # without resetting the global random state.
        indexes = np.random.RandomState(123).permutation(merged_df_i.index)
        train_size = int(merged_df_i.shape[0] * train_ratio)
        indexes_train = indexes[:train_size]
        indexes_test = indexes[train_size:]
        if len(indexes_train) == 0 or len(indexes_test) == 0:
            # Nothing to fit on, or nothing to predict, for this drug.
            continue

        train = merged_df_i.loc[indexes_train, X_columns].copy()
        test = merged_df_i.loc[indexes_test, X_columns].copy()
        if scale_columns:
            scaler = MinMaxScaler()
            scaler.fit(train[scale_columns])
            train[scale_columns] = scaler.transform(train[scale_columns])
            test[scale_columns] = scaler.transform(test[scale_columns])
        X_train = train.values
        X_test = test.values

        for i in range(number_coefficients):
            coefficient = i + 1
            y_train = merged_df_i.loc[indexes_train, "param_" + str(coefficient)].values

            # A dict hyperparameter holds one value per coefficient number.
            params = {
                name: (value[coefficient] if isinstance(value, dict) else value)
                for name, value in hyperparameters.items()
            }
            kr_model = KernelRidge(kernel=kernel, **params)
            kr_model.fit(X_train, y_train)
            results.loc[indexes_test, "pred_param_" + str(coefficient)] = kr_model.predict(X_test)

    return results[columns_to_use]

In [6]:
def PredictionsTunedLassoByDrug(merged_df, drug_ids, number_coefficients, train_ratio=0.8,
                                column_not_to_use=[], alpha=1, features_to_scale=[],
                                scaling=False, file_name="", columns_to_use=[]):
    """Train and test a Lasso model per drug and per fitted coefficient.

    For every drug in ``drug_ids`` the rows of ``merged_df`` with that ``DRUG_ID``
    are shuffled with a fixed seed (123) and split into train/test parts. One
    ``Lasso`` model is fitted for each target ``param_1 .. param_<n>`` and its
    test-set predictions are stored in ``pred_param_<k>``.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Must contain ``DRUG_ID`` and the target columns ``param_<k>``.
        It is not modified; predictions are written to a copy.
    drug_ids : iterable
        Drug identifiers to model.
    number_coefficients : int
        Number of targets; targets are ``param_1`` .. ``param_<number_coefficients>``.
    train_ratio : float
        Fraction of each drug's rows used for training.
    column_not_to_use : list
        Extra columns to exclude from the features.
    alpha : scalar or dict
        Lasso regularisation strength; a dict is looked up by the 1-based
        coefficient number.
    features_to_scale : list
        Feature columns that are min-max scaled when ``scaling`` is True
        (the scaler is fitted on the training rows only).
    scaling : bool
        Enables scaling of ``features_to_scale``.
    file_name : str
        Unused; kept for backward compatibility.
    columns_to_use : list
        Columns of the result frame to return.

    Returns
    -------
    pandas.DataFrame
        ``columns_to_use`` of a copy of ``merged_df`` extended with the
        ``pred_param_<k>`` columns (NaN for rows that were not in a test split).
        Model coefficients are not returned by this function.
    """
    # Targets, normalised responses and concentrations must never be features.
    excluded = set(
        ["param_" + str(i) for i in range(10)]
        + ["param" + str(i) for i in range(10)]
        + ["norm_cells_" + str(i) for i in range(10)]
        + ["fd_num_" + str(i) for i in range(10)]
        + list(column_not_to_use)
    )
    # An ordered list (not a set) keeps the feature order reproducible and is
    # accepted as an indexer by every pandas version.
    X_columns = [col for col in merged_df.columns if col not in excluded]
    print(len(X_columns))

    scale_columns = list(features_to_scale) if scaling else []

    # Predictions go to a copy so that they reach the returned frame without
    # adding columns to the caller's data.
    results = merged_df.copy()

    for drug_id in drug_ids:
        merged_df_i = merged_df[merged_df["DRUG_ID"] == drug_id]

        # Local generator seeded with 123: same split as np.random.seed(123)
        # without resetting the global random state.
        indexes = np.random.RandomState(123).permutation(merged_df_i.index)
        train_size = int(merged_df_i.shape[0] * train_ratio)
        indexes_train = indexes[:train_size]
        indexes_test = indexes[train_size:]
        if len(indexes_train) == 0 or len(indexes_test) == 0:
            # Nothing to fit on, or nothing to predict, for this drug.
            continue

        train = merged_df_i.loc[indexes_train, X_columns].copy()
        test = merged_df_i.loc[indexes_test, X_columns].copy()
        if scale_columns:
            scaler = MinMaxScaler()
            scaler.fit(train[scale_columns])
            train[scale_columns] = scaler.transform(train[scale_columns])
            test[scale_columns] = scaler.transform(test[scale_columns])
        X_train = train.values
        X_test = test.values

        for i in range(number_coefficients):
            coefficient = i + 1
            y_train = merged_df_i.loc[indexes_train, "param_" + str(coefficient)].values

            # A dict holds one alpha per coefficient number.
            alpha_value = alpha[coefficient] if isinstance(alpha, dict) else alpha

            lin_reg = Lasso(alpha=alpha_value)
            lin_reg.fit(X_train, y_train)
            results.loc[indexes_test, "pred_param_" + str(coefficient)] = lin_reg.predict(X_test)

    return results[columns_to_use]

In [7]:
def PredictionsTunedRidgeByDrug(merged_df, drug_ids, number_coefficients, column_not_to_use=[], alpha=1, solver="auto",
                                metrics="mse", features_to_scale=[], scaling=False, columns_to_use=[],
                                train_ratio=0.8):
    """Train and test a Ridge model per drug and per fitted coefficient.

    For every drug in ``drug_ids`` the rows of ``merged_df`` with that ``DRUG_ID``
    are shuffled with a fixed seed (123) and split into train/test parts. One
    ``Ridge`` model is fitted for each target ``param_1 .. param_<n>``; its
    coefficients are collected and its test-set predictions are stored in
    ``pred_param_<k>``.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Must contain ``DRUG_ID`` and the target columns ``param_<k>``.
        It is not modified; predictions are written to a copy.
    drug_ids : iterable
        Drug identifiers to model.
    number_coefficients : int
        Number of targets; targets are ``param_1`` .. ``param_<number_coefficients>``.
    column_not_to_use : list
        Extra columns to exclude from the features.
    alpha, solver : scalar/str or dict
        ``Ridge`` hyperparameters; a dict is looked up by the 1-based
        coefficient number.
    metrics : str
        Unused; kept for backward compatibility.
    features_to_scale : list
        Feature columns that are min-max scaled when ``scaling`` is True
        (the scaler is fitted on the training rows only).
    scaling : bool
        Enables scaling of ``features_to_scale``.
    columns_to_use : list
        Columns of the result frame to return.
    train_ratio : float
        Fraction of each drug's rows used for training (default 0.8, the value
        that used to be hard-coded).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``feature_importance`` — indexed by feature name, one column
        ``coef_<k>_<drug_id>`` per fitted model — and ``columns_to_use`` of a
        copy of ``merged_df`` extended with the ``pred_param_<k>`` columns
        (NaN for rows that were not in a test split).
    """
    # Targets, normalised responses and concentrations must never be features.
    excluded = set(
        ["param_" + str(i) for i in range(10)]
        + ["param" + str(i) for i in range(10)]
        + ["norm_cells_" + str(i) for i in range(10)]
        + ["fd_num_" + str(i) for i in range(10)]
        + list(column_not_to_use)
    )
    # An ordered list (not a set) keeps the feature order reproducible and is
    # accepted as an indexer by every pandas version.
    X_columns = [col for col in merged_df.columns if col not in excluded]

    scale_columns = list(features_to_scale) if scaling else []

    # Coefficients are gathered first and turned into one frame at the end,
    # which avoids growing a DataFrame column by column.
    coefficient_columns = {}
    # Predictions go to a copy so that they reach the returned frame without
    # adding columns to the caller's data.
    results = merged_df.copy()

    for drug_id in drug_ids:
        merged_df_i = merged_df[merged_df["DRUG_ID"] == drug_id]

        # Local generator seeded with 123: same split as np.random.seed(123)
        # without resetting the global random state.
        indexes = np.random.RandomState(123).permutation(merged_df_i.index)
        train_size = int(merged_df_i.shape[0] * train_ratio)
        indexes_train = indexes[:train_size]
        indexes_test = indexes[train_size:]
        if len(indexes_train) == 0 or len(indexes_test) == 0:
            # Nothing to fit on, or nothing to predict, for this drug.
            continue

        train = merged_df_i.loc[indexes_train, X_columns].copy()
        test = merged_df_i.loc[indexes_test, X_columns].copy()
        if scale_columns:
            scaler = MinMaxScaler()
            scaler.fit(train[scale_columns])
            train[scale_columns] = scaler.transform(train[scale_columns])
            test[scale_columns] = scaler.transform(test[scale_columns])
        X_train = train.values
        X_test = test.values

        for i in range(number_coefficients):
            coefficient = i + 1
            y_train = merged_df_i.loc[indexes_train, "param_" + str(coefficient)].values

            # A dict hyperparameter holds one value per coefficient number.
            alpha_value = alpha[coefficient] if isinstance(alpha, dict) else alpha
            solver_value = solver[coefficient] if isinstance(solver, dict) else solver

            lin_reg = Ridge(alpha=alpha_value, solver=solver_value)
            lin_reg.fit(X_train, y_train)
            coefficient_columns["coef_" + str(coefficient) + "_" + str(drug_id)] = lin_reg.coef_
            results.loc[indexes_test, "pred_param_" + str(coefficient)] = lin_reg.predict(X_test)

    feature_importance = pd.DataFrame(coefficient_columns, index=X_columns)
    return feature_importance, results[columns_to_use]

### Training and predictions for pre-split train and test data

In [24]:
def PredictionsKernelsSplittedDataSets(df_train, df_test, number_coefficients, kernel,
                                       column_not_to_use=[], alpha=1, gamma=None, degree=3, coef0=1,
                                       features_to_scale=[], scaling=False, columns_to_use=[]):
    """Fit KernelRidge models on ``df_train`` and predict on ``df_test``.

    One ``KernelRidge`` model is fitted for each target ``param_1 .. param_<n>``
    and its predictions are stored in ``df_test`` as ``pred_coef_<k>``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Already split data. Features are the columns of ``df_train`` that are
        not targets, responses, concentrations or listed in ``column_not_to_use``;
        ``df_test`` must contain the same feature columns.
        NOTE: ``df_test`` is modified in place (``pred_coef_<k>`` columns are added).
    number_coefficients : int
        Number of targets; targets are ``param_1`` .. ``param_<number_coefficients>``.
    kernel : str or callable
        Passed to ``KernelRidge(kernel=...)``.
    column_not_to_use : list
        Extra columns to exclude from the features.
    alpha, gamma, degree, coef0 : scalar or dict
        ``KernelRidge`` hyperparameters. A dict is looked up by the 1-based
        coefficient number, allowing one value per coefficient.
    features_to_scale : list
        Columns that are min-max scaled when ``scaling`` is True (the scaler is
        fitted on ``df_train`` only; scaling is applied to copies).
    scaling : bool
        Enables scaling of ``features_to_scale``.
    columns_to_use : list
        Columns of ``df_test`` to return.

    Returns
    -------
    pandas.DataFrame
        ``df_test[columns_to_use]`` after the prediction columns were added.
    """
    # Targets, normalised responses and concentrations must never be features.
    excluded = set(
        ["param_" + str(i) for i in range(10)]
        + ["param" + str(i) for i in range(10)]
        + ["norm_cells_" + str(i) for i in range(10)]
        + ["fd_num_" + str(i) for i in range(10)]
        + list(column_not_to_use)
    )
    # An ordered list (not a set) keeps the feature order reproducible and is
    # accepted as an indexer by every pandas version.
    X_columns = [col for col in df_train.columns if col not in excluded]
    print("Number of X_columns:", len(X_columns))

    scale_columns = list(features_to_scale) if scaling else []
    if scale_columns:
        # Scale copies so the caller's feature values stay untouched.
        train = df_train.copy()
        test = df_test.copy()
        scaler = MinMaxScaler()
        scaler.fit(train[scale_columns])
        train[scale_columns] = scaler.transform(train[scale_columns])
        test[scale_columns] = scaler.transform(test[scale_columns])
        X_train = train[X_columns].values
        X_test = test[X_columns].values
    else:
        X_train = df_train[X_columns].values
        X_test = df_test[X_columns].values

    hyperparameters = {"alpha": alpha, "gamma": gamma, "degree": degree, "coef0": coef0}

    for i in range(number_coefficients):
        coefficient = i + 1
        y_train = df_train["param_" + str(coefficient)].values

        # A dict hyperparameter holds one value per coefficient number.
        params = {
            name: (value[coefficient] if isinstance(value, dict) else value)
            for name, value in hyperparameters.items()
        }
        kr_model = KernelRidge(kernel=kernel, **params)
        kr_model.fit(X_train, y_train)
        df_test["pred_coef_" + str(coefficient)] = kr_model.predict(X_test)

    return df_test[columns_to_use]

In [55]:
def PredictionsLassoSplittedDataSets(df_train, df_test, number_coefficients,
                                     column_not_to_use=[], alpha=1, features_to_scale=[],
                                     scaling=False, file_name="", columns_to_use=[]):
    """Fit Lasso models on ``df_train`` and predict on ``df_test``.

    One ``Lasso`` model is fitted for each target ``param_1 .. param_<n>``; its
    coefficients are collected and its predictions are stored in ``df_test`` as
    ``pred_coef_<k>``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Already split data. Features are the columns of ``df_train`` that are
        not targets, responses, concentrations or listed in ``column_not_to_use``;
        ``df_test`` must contain the same feature columns.
        NOTE: ``df_test`` is modified in place (``pred_coef_<k>`` columns are added).
    number_coefficients : int
        Number of targets; targets are ``param_1`` .. ``param_<number_coefficients>``.
    column_not_to_use : list
        Extra columns to exclude from the features.
    alpha : scalar or dict
        Lasso regularisation strength; a dict is looked up by the 1-based
        coefficient number.
    features_to_scale : list
        Columns that are min-max scaled when ``scaling`` is True (the scaler is
        fitted on ``df_train`` only; scaling is applied to copies).
    scaling : bool
        Enables scaling of ``features_to_scale``.
    file_name : str
        Unused; kept for backward compatibility.
    columns_to_use : list
        Columns of ``df_test`` to return.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``feature_importance`` — indexed by feature name, one ``coef_<k>``
        column per target — and ``df_test[columns_to_use]``.
    """
    # Targets, normalised responses and concentrations must never be features.
    excluded = set(
        ["param_" + str(i) for i in range(10)]
        + ["param" + str(i) for i in range(10)]
        + ["norm_cells_" + str(i) for i in range(10)]
        + ["fd_num_" + str(i) for i in range(10)]
        + list(column_not_to_use)
    )
    # An ordered list (not a set) keeps the feature order reproducible and is
    # accepted as an indexer by every pandas version.
    X_columns = [col for col in df_train.columns if col not in excluded]
    print("Number of X_columns:", len(X_columns))

    scale_columns = list(features_to_scale) if scaling else []
    if scale_columns:
        # Scale copies so the caller's feature values stay untouched.
        train = df_train.copy()
        test = df_test.copy()
        scaler = MinMaxScaler()
        scaler.fit(train[scale_columns])
        train[scale_columns] = scaler.transform(train[scale_columns])
        test[scale_columns] = scaler.transform(test[scale_columns])
        X_train = train[X_columns].values
        X_test = test[X_columns].values
    else:
        X_train = df_train[X_columns].values
        X_test = df_test[X_columns].values

    coefficient_columns = {}
    for i in range(number_coefficients):
        coefficient = i + 1
        y_train = df_train["param_" + str(coefficient)].values

        # A dict holds one alpha per coefficient number.
        alpha_value = alpha[coefficient] if isinstance(alpha, dict) else alpha

        lin_reg = Lasso(alpha=alpha_value)
        lin_reg.fit(X_train, y_train)
        coefficient_columns["coef_" + str(coefficient)] = lin_reg.coef_
        df_test["pred_coef_" + str(coefficient)] = lin_reg.predict(X_test)

    feature_importance = pd.DataFrame(coefficient_columns, index=X_columns)
    return feature_importance, df_test[columns_to_use]

In [56]:
def PredictionsRidgeSplittedDataSets(df_train, df_test, number_coefficients,
                                     column_not_to_use=[], alpha=1, solver="auto",
                                     features_to_scale=[], scaling=False, columns_to_use=[]):
    """Fit Ridge models on ``df_train`` and predict on ``df_test``.

    One ``Ridge`` model is fitted for each target ``param_1 .. param_<n>``; its
    coefficients are collected and its predictions are stored in ``df_test`` as
    ``pred_coef_<k>``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Already split data. Features are the columns of ``df_train`` that are
        not targets, responses, concentrations or listed in ``column_not_to_use``;
        ``df_test`` must contain the same feature columns.
        NOTE: ``df_test`` is modified in place (``pred_coef_<k>`` columns are added).
    number_coefficients : int
        Number of targets; targets are ``param_1`` .. ``param_<number_coefficients>``.
    column_not_to_use : list
        Extra columns to exclude from the features.
    alpha, solver : scalar/str or dict
        ``Ridge`` hyperparameters; a dict is looked up by the 1-based
        coefficient number.
    features_to_scale : list
        Columns that are min-max scaled when ``scaling`` is True (the scaler is
        fitted on ``df_train`` only; scaling is applied to copies).
    scaling : bool
        Enables scaling of ``features_to_scale``.
    columns_to_use : list
        Columns of ``df_test`` to return.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``feature_importance`` — indexed by feature name, one ``coef_<k>``
        column per target — and ``df_test[columns_to_use]``.
    """
    # Targets, normalised responses and concentrations must never be features.
    excluded = set(
        ["param_" + str(i) for i in range(10)]
        + ["param" + str(i) for i in range(10)]
        + ["norm_cells_" + str(i) for i in range(10)]
        + ["fd_num_" + str(i) for i in range(10)]
        + list(column_not_to_use)
    )
    # An ordered list (not a set) keeps the feature order reproducible and is
    # accepted as an indexer by every pandas version.
    X_columns = [col for col in df_train.columns if col not in excluded]
    print("Number of X_columns:", len(X_columns))

    scale_columns = list(features_to_scale) if scaling else []
    if scale_columns:
        # Scale copies so the caller's feature values stay untouched.
        train = df_train.copy()
        test = df_test.copy()
        scaler = MinMaxScaler()
        scaler.fit(train[scale_columns])
        train[scale_columns] = scaler.transform(train[scale_columns])
        test[scale_columns] = scaler.transform(test[scale_columns])
        X_train = train[X_columns].values
        X_test = test[X_columns].values
    else:
        X_train = df_train[X_columns].values
        X_test = df_test[X_columns].values

    coefficient_columns = {}
    for i in range(number_coefficients):
        coefficient = i + 1
        y_train = df_train["param_" + str(coefficient)].values

        # A dict hyperparameter holds one value per coefficient number.
        alpha_value = alpha[coefficient] if isinstance(alpha, dict) else alpha
        solver_value = solver[coefficient] if isinstance(solver, dict) else solver

        lin_reg = Ridge(alpha=alpha_value, solver=solver_value)
        lin_reg.fit(X_train, y_train)
        coefficient_columns["coef_" + str(coefficient)] = lin_reg.coef_
        df_test["pred_coef_" + str(coefficient)] = lin_reg.predict(X_test)

    feature_importance = pd.DataFrame(coefficient_columns, index=X_columns)
    return feature_importance, df_test[columns_to_use]

### Testing function

In [57]:
# Show which result files are available in _FOLDER (the listing is the cell output below).
os.listdir(_FOLDER)

['filtered_drug_profiles_123.csv',
 'filtered_drug_profiles_12.csv',
 'filtered_drug_profiles_13.csv',
 'filtered_drug_profiles_23.csv',
 '.ipynb_checkpoints',
 'kernel_learning_1_2.csv',
 'merged_drug_profiles_sigmoid4_123.csv',
 'drug_features_pubchem_id.csv',
 'drug_features_with_pubchem_properties.csv',
 'merged_fitted_sigmoid4_123_with_drugs_description.csv',
 'merged_fitted_sigmoid4_123_with_drugs_properties.csv',
 'merged_fitted_sigmoid4_123_with_drugs_description_split_target.csv',
 'merged_fitted_sigmoid4_123_with_drugs_properties_split_target.csv',
 'kernel_learning_1.csv',
 'kernel_learning_2.csv',
 'kernel_learning_3.csv',
 'test02_merged_fitted_sigmoid4_123_with_drugs_properties.csv',
 'test02_merged_fitted_sigmoid4_123_with_drugs_description.csv',
 'train08_merged_fitted_sigmoid4_123_with_drugs_description.csv',
 'train08_merged_fitted_sigmoid4_123_with_drugs_properties.csv',
 'kernel_learning_2_2.csv',
 'kernel_learning_3_2.csv',
 'kernel_learning_4.csv',
 'Lasso_1.csv',

In [58]:
# Load the train and test tables (fitted curve parameters merged with drug descriptions).
# NOTE(review): the "train08"/"test02" prefixes presumably denote an 80/20 split — confirm
# against the notebook that produced these files.
train = pd.read_csv(_FOLDER+"train08_merged_fitted_sigmoid4_123_with_drugs_description.csv")
test = pd.read_csv(_FOLDER+"test02_merged_fitted_sigmoid4_123_with_drugs_description.csv")

In [59]:
# Identifier and free-text columns that must not be used as model features.
column_not_to_use = ["Unnamed: 0", "Unnamed: 0.1", "COSMIC_ID", "DRUG_ID", "Drug_Name", "Synonyms", "Target", "deriv_found", "PubChem_ID",
                     "elements", "inchi_key", "canonical_smiles", "inchi_string", "third_target", "first_target", "molecular_formula", "second_target", "Target_Pathway"]
# Column families that are excluded from the features as well: fitted
# coefficients (both spellings), normalised responses and concentrations.
param1 = ["param_" + str(i) for i in range(10)]
param2 = ["param" + str(i) for i in range(10)]
norm_response = ["norm_cells_" + str(i) for i in range(10)]
con_columns = ["fd_num_" + str(i) for i in range(10)]

not_X_columns = param1 + param2 + norm_response + con_columns + column_not_to_use
# Ordered list instead of a set: the column order is reproducible between runs
# and, unlike a set, a list is accepted as a DataFrame indexer by pandas >= 2.0.
X_columns = [col for col in train.columns if col not in not_X_columns]

In [60]:
# Fit one Lasso model per coefficient on `train` and predict on `test`.
# Side effect: PredictionsLassoSplittedDataSets adds the pred_coef_<k> columns to `test`.
# NOTE(review): with alpha=500 the recorded outputs below show no non-zero coef_4 entries
# and identical predictions for every row — the penalty looks too strong; confirm.
number_coefficients=4 
columns_to_use = ["DRUG_ID", "Drug_Name", "COSMIC_ID"] + ["pred_coef_"+str(i+1) for i in range(number_coefficients)]
feature_importance, results = PredictionsLassoSplittedDataSets(train, test, number_coefficients=number_coefficients,
                                     column_not_to_use=column_not_to_use, alpha=500, 
                                    columns_to_use= columns_to_use)

Number of X_columns: 1073


In [61]:
# Features with a non-zero Lasso coefficient for the 4th parameter (the recorded output has no rows).
feature_importance[feature_importance["coef_4"]!=0]

Unnamed: 0,coef_1,coef_2,coef_3,coef_4


In [43]:
# Predicted coefficients for the test rows.
# NOTE(review): execution count In [43] is lower than the cell above it — cells were run out of order.
results

Unnamed: 0,DRUG_ID,Drug_Name,COSMIC_ID,pred_coef_1,pred_coef_2,pred_coef_3,pred_coef_4
0,11,Paclitaxel,907170,0.587655,0.995964,-18.991084,0.080602
1,11,Paclitaxel,908159,0.587655,0.995964,-18.991084,0.080602
2,11,Paclitaxel,909720,0.587655,0.995964,-18.991084,0.080602
3,11,Paclitaxel,905958,0.587655,0.995964,-18.991084,0.080602
4,41,S-Trityl-L-cysteine,724872,0.587655,0.995964,-18.991084,0.080602
...,...,...,...,...,...,...,...
481,1149,TW 37,688121,0.587655,0.995964,-18.991084,0.080602
482,1149,TW 37,910548,0.587655,0.995964,-18.991084,0.080602
483,1149,TW 37,753531,0.587655,0.995964,-18.991084,0.080602
484,1149,TW 37,1240179,0.587655,0.995964,-18.991084,0.080602


In [8]:
# NOTE(review): `df` is not defined anywhere in the visible notebook (only `train`/`test` are
# loaded), and this cell's execution count (In [8]) predates the cells above — it relies on
# leftover kernel state and will fail on Restart & Run All. Confirm which frame was meant.
y = df["param_1"]
X=df[X_columns]

In [9]:
# KernelRidge with default hyperparameters (the recorded repr below shows kernel='linear', alpha=1).
model = KernelRidge()

In [11]:
# Fit on all rows of X; no train/test split is made in this scratch check.
model.fit(X,y)

KernelRidge(alpha=1, coef0=1, degree=3, gamma=None, kernel='linear',
            kernel_params=None)

In [13]:
# Length of the fitted dual coefficients (1817 in the recorded output).
# NOTE(review): presumably one entry per training row rather than per feature — compare with X.shape[0].
len(model.dual_coef_)

1817

In [14]:
# Number of feature columns (1074 in the recorded output, i.e. different from len(model.dual_coef_)).
len(X_columns)

1074

In [15]:
# Kernel used by the default-constructed model.
model.kernel

'linear'