In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_selection import SelectKBest, f_regression
from lifelines.utils import concordance_index
from sklearn.model_selection import KFold, GridSearchCV
from sklearn import svm
import helper_data_load

In [2]:
def my_scorer(y_true, y_predicted):
    """Score predictions with the concordance index.

    Thin wrapper around ``concordance_index`` with the
    ``(y_true, y_predicted)`` argument order, so that it can be wrapped by
    ``make_scorer``.

    Parameters
    ----------
    y_true : observed target values.
    y_predicted : predictions for the same samples.

    Returns
    -------
    Whatever ``concordance_index(y_true, y_predicted)`` returns.
    """
    return concordance_index(y_true, y_predicted)

# Wrap the concordance index as a scikit-learn scorer. greater_is_better=True
# marks higher values as better; the scorer is passed as `scoring=` to the
# GridSearchCV instances in this notebook.
from sklearn.metrics import make_scorer
my_func = make_scorer(my_scorer, greater_is_better=True)

def fit_and_predict(model, X_train, y_train, feat_to_use, X_test, random_state, feat_sel_method):
    """Grid-search ``C`` on the selected features and predict the test set.

    Parameters
    ----------
    model : estimator handed to ``GridSearchCV``; the grid is over ``C``.
    X_train, X_test : DataFrames; only the ``feat_to_use`` columns are used.
    y_train : training targets.
    feat_to_use : column labels selecting the features to train on.
    random_state : accepted for interface compatibility; not used here.
    feat_sel_method : only used in the log message.

    Returns
    -------
    (model_str, y_predict) : ``str(model)`` of the estimator passed in and
        the grid search's predictions for ``X_test``.

    Side effects
    ------------
    Writes ``cv_results_`` to
    ``../../Data/CV_test/result_full_v1_f_reg_lasso<len(feat_to_use)>.csv``.
    The file name depends only on the number of features, so repeated calls
    with the same feature count (e.g. one per outer fold) overwrite it.
    """
    model_str = str(model)
    print("in fit_and_predict with {} (type: {}) and feats sel {}: ".format(model_str, type(model), feat_sel_method))
    X_train = X_train[feat_to_use]
    X_test = X_test[feat_to_use]
    print("Use features: ", feat_to_use)

    tag = '_v1_f_reg_lasso'
    newpath = "../../Data/CV_test/"
    # Create the output directory before the (expensive) grid search so a
    # missing folder cannot make the run fail only at the final to_csv call.
    os.makedirs(newpath, exist_ok=True)

    random_grid = {'C': [0.1, 0.5, 1, 5, 10]}
    search = GridSearchCV(estimator=model, param_grid=random_grid, scoring=my_func, refit=True, cv=5, n_jobs=-1)
    search.fit(X_train, y_train)
    y_predict = search.predict(X_test)

    results = pd.DataFrame(search.cv_results_)
    results.to_csv(newpath + 'result_full' + tag + str(len(feat_to_use)) + '.csv')

    return model_str, y_predict

In [3]:
def get_kbest(X_train, y_train, score_func, nr_feat, X_test):
    """Return the column labels of the best-scoring training features.

    A ``SelectKBest`` selector is fitted on the training data with
    ``k = nr_feat + 1``, i.e. one feature more than ``nr_feat`` is selected.

    Parameters
    ----------
    X_train : DataFrame of training features (its ``columns`` are indexed).
    y_train : training targets.
    score_func : scoring function forwarded to ``SelectKBest``.
    nr_feat : requested feature count; ``nr_feat + 1`` features are returned.
    X_test : accepted for interface compatibility; not used.

    Returns
    -------
    The entries of ``X_train.columns`` flagged by the selector's support mask.
    """
    selector = SelectKBest(score_func=score_func, k=nr_feat + 1)
    selector.fit(X_train, y_train)
    return X_train.columns[selector.get_support()]

def select_feats(feat_sel_method, X_train, y_train, X_test, model, nr_feat, nr_solutions, random_state, predictor):
    """Select features with the requested method, then fit and predict.

    Parameters
    ----------
    feat_sel_method : name of the selection method; only ``"f_reg"``
        (``f_regression`` scores via ``get_kbest``) is implemented.
    X_train, y_train, X_test : fold data forwarded to ``get_kbest`` and
        ``fit_and_predict``.
    model : estimator forwarded to ``fit_and_predict``.
    nr_feat : feature count forwarded to ``get_kbest``.
    nr_solutions, predictor : accepted for interface compatibility; not used.
    random_state : forwarded to ``fit_and_predict``.

    Returns
    -------
    (model_str, y_predict) as returned by ``fit_and_predict``.

    Raises
    ------
    ValueError
        If ``feat_sel_method`` is not supported. Previously this fell through
        to the ``return`` with unbound locals and raised ``UnboundLocalError``.
    """
    # Validate first so unsupported methods fail with an explicit message
    # before any selection or fitting work is started.
    if feat_sel_method != "f_reg":
        raise ValueError(
            "Unsupported feature selection method {!r}; supported methods: 'f_reg'".format(feat_sel_method)
        )

    feat_to_use = get_kbest(X_train, y_train, f_regression, nr_feat, X_test)
    return fit_and_predict(model, X_train, y_train, feat_to_use, X_test, random_state, feat_sel_method)

In [8]:
def kFold_training(k_fold_nr, random_state, model, data_norm, nr_feat, ensemble, feat_sel_method, nr_solutions, save_pred, predictor, savePath, local, best_nr_feat=None):
    """Run shuffled K-fold cross-validation of feature selection plus fitting.

    For every fold, ``select_feats`` is called on the training part and its
    predictions for the held-out part are collected.

    Parameters
    ----------
    k_fold_nr : number of folds (``KFold(n_splits=...)``).
    random_state : seed for the fold shuffle; also forwarded to ``select_feats``.
    model : estimator forwarded to ``select_feats``.
    data_norm : DataFrame holding the features and the ``predictor`` column.
    nr_feat, feat_sel_method, nr_solutions : forwarded to ``select_feats``.
    ensemble : if truthy, return the pooled predictions instead of scores.
    save_pred : if truthy (and not ``ensemble``), save the pooled predictions
        through ``helper_data_load.save_csv``.
    predictor : name of the target column in ``data_norm``.
    savePath, local : forwarded to ``helper_data_load.save_csv``.
    best_nr_feat : only used in the saved file name.

    Returns
    -------
    ``(y_pred_cv, y_test_cv)`` (flat lists over all folds) when ``ensemble``
    is truthy, otherwise ``(c_idx, conc_mean)``: the concordance index of the
    pooled predictions and the mean of the per-fold concordance indices.
    """
    cv = KFold(n_splits=k_fold_nr, random_state=random_state, shuffle=True)
    # Loop-invariant boolean mask selecting every column except the target.
    feature_mask = data_norm.columns != predictor
    y_test_cv = []
    y_pred_cv = []
    for train_index, test_index in cv.split(data_norm):
        # KFold.split yields positional indices, so rows must be taken with
        # iloc; label-based loc is only equivalent on a default RangeIndex.
        fold_train = data_norm.iloc[train_index]
        fold_test = data_norm.iloc[test_index]
        X_train, X_test = fold_train.loc[:, feature_mask], fold_test.loc[:, feature_mask]
        y_train, y_test = fold_train.loc[:, predictor], fold_test.loc[:, predictor]
        # select the features on this fold's training data, then fit and predict
        model_str, y_predict = select_feats(feat_sel_method, X_train, y_train, X_test, model, nr_feat, nr_solutions, random_state, predictor)
        y_test_cv.append(y_test)
        y_pred_cv.append(y_predict)

    # Mean of the per-fold concordance indices.
    conc_mean = np.mean([
        concordance_index(fold_true, fold_pred)
        for fold_true, fold_pred in zip(y_test_cv, y_pred_cv)
    ])

    # Pool the folds into flat lists of targets and predictions.
    y_test_cv = [value for fold in y_test_cv for value in fold]
    y_pred_cv = [value for fold in y_pred_cv for value in fold]
    if ensemble:
        return y_pred_cv, y_test_cv

    if save_pred:
        df_preds = pd.DataFrame({"y_test_cv": y_test_cv, "y_pred_cv": y_pred_cv})
        # model_str is the value returned for the last fold.
        file_name = predictor + "_cv_preds_" + str(model_str) + "_" + feat_sel_method + str(nr_solutions) + "_nr_feat_" + str(best_nr_feat) + ".csv"
        helper_data_load.save_csv(df_preds, file_name, local, savePath, folder="/results/predictions/")

    c_idx = concordance_index(y_test_cv, y_pred_cv)
    return c_idx, conc_mean

In [9]:
# Number of cross-validation folds that not_ensemble_methods passes to kFold_training.
k_fold = 5
def not_ensemble_methods(method, df_result, data_norm, random_state, feat_sel_method, nr_sol, k_max, predictor, savePath, local):
    """Cross-validate ``method`` for each feature count and record the scores.

    For ``nr_feat`` in ``3..k_max`` (``k_max`` is forced to 10 for "mrmr"),
    ``kFold_training`` is run and the mean per-fold concordance index is
    stored. Afterwards a grid search over ``C`` is fitted on the full data
    and its ``cv_results_`` are written to
    ``../../Data/CV_test/result_full_v1_f_reg_lasso_bestk_<best_nr_feat>.csv``.

    Parameters
    ----------
    method : estimator forwarded to ``kFold_training``.
    df_result : DataFrame whose first two columns receive the feature count
        and the mean per-fold concordance index; new rows are appended.
    data_norm : DataFrame with the features and the ``predictor`` column.
    random_state, feat_sel_method, nr_sol, savePath, local : forwarded to
        ``kFold_training``.
    k_max : largest ``nr_feat`` to evaluate (inclusive); must be >= 3.
    predictor : name of the target column in ``data_norm``.

    Returns
    -------
    ``df_result`` with one row per evaluated feature count, cast to float
    and with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``df_result`` has fewer than two columns or ``k_max < 3`` (nothing
        would be evaluated, so no best feature count exists).
    """
    ensemble = False
    save_pred = False
    if feat_sel_method == "mrmr":
        k_max = 10

    result_cols = df_result.columns
    if len(result_cols) < 2:
        raise ValueError("df_result needs at least two columns (feature count, score)")
    if k_max < 3:
        raise ValueError("k_max must be at least 3, got {}".format(k_max))

    # Collect one record per feature count and build the frame once instead
    # of concatenating inside the loop.
    rows = []
    for nr_feat in range(3, k_max + 1):
        print("Train for k = ", nr_feat)
        c_idx, c_idx_mean = kFold_training(k_fold, random_state, method, data_norm, nr_feat, ensemble, feat_sel_method, nr_sol, save_pred, predictor, savePath, local)
        rows.append({result_cols[0]: int(nr_feat), result_cols[1]: c_idx_mean})

    new_rows = pd.DataFrame(rows, columns=result_cols)
    if df_result.empty:
        df_result = new_rows
    else:
        df_result = pd.concat([df_result, new_rows], ignore_index=True)
    df_result = df_result.astype(float)

    ### check here for best nr of features for that model
    idx_best_feat = df_result[result_cols[1]].idxmax()
    # Scalar lookup; calling int() on a one-element Series is deprecated.
    best_nr_feat = int(df_result.loc[idx_best_feat, result_cols[0]])
    print("Number of best feat: ", best_nr_feat)

    # gridSearch
    # NOTE(review): this search is fitted on ALL non-target columns with a
    # fresh LinearSVR; neither `method` nor a best_nr_feat feature subset is
    # used, although the output file is tagged with best_nr_feat. Confirm
    # whether feature selection was meant to be applied here.
    random_grid = {'C': [0.1, 0.5, 1, 5, 10]}
    X_train = data_norm.loc[:, data_norm.columns != predictor]
    y_train = data_norm.loc[:, predictor]
    svr_model = svm.LinearSVR(random_state=42)
    model = GridSearchCV(estimator=svr_model, param_grid=random_grid, scoring=my_func, refit=True, cv=5, n_jobs=-1)
    model.fit(X_train, y_train)

    results = pd.DataFrame(model.cv_results_)
    tag = '_v1_f_reg_lasso_bestk_'
    newpath = "../../Data/CV_test/"
    # Make sure the target folder exists so the write cannot fail on a fresh checkout.
    os.makedirs(newpath, exist_ok=True)
    results.to_csv(newpath + 'result_full' + tag + str(best_nr_feat) + '.csv')

    return df_result

In [10]:
# Load the target table and the training features. The "Unnamed: 0" column
# (presumably an index saved with the CSV) is dropped from both, then the
# "os_days" target is attached to the feature table.
os_target = pd.read_csv("../../Data/chum_target_phnv.csv").drop(columns=["Unnamed: 0"])
os_training = (
    pd.read_csv("../../Data/chum_training_phnv.csv")
    .drop(columns=["Unnamed: 0"])
    .assign(os_days=os_target["os_days"])
)

In [11]:
# Run configuration for the LinearSVR experiment.
random_state = 42
feat_sel_method = "f_reg"
# Forwarded to kFold_training, where it only ends up in the prediction file name.
nr_sol = "-1"
# get_kbest selects nr_feat + 1 features, so the largest run (nr_feat = k_max)
# uses every column of os_training except the target.
k_max = os_training.shape[1] -2
predictor = "os_days"
# savePath and local are forwarded to helper_data_load.save_csv when predictions are saved.
savePath = "../../Data/CV_test"
local = True

# Empty result frame: first column receives the feature count, second the
# mean per-fold concordance index.
df_result_SVR= pd.DataFrame(columns=["nr_feat","c_idx_svr"])
# Note: despite the name, rf_model is a LinearSVR.
rf_model = svm.LinearSVR(random_state=42)
df_result_SVR = not_ensemble_methods(rf_model, df_result_SVR, os_training, random_state, feat_sel_method, nr_sol, k_max, predictor, savePath, local)
print("Trained SVR")
# Also echoes the message through the shell; os.system returns the command's
# exit status, which the notebook shows as the cell result.
os.system("echo Trained SVR")

Train for k =  3
in fit_and_predict with LinearSVR(random_state=42) (type: <class 'sklearn.svm._classes.LinearSVR'>) and feats sel f_reg: 
Use features:  Index(['wavelet.LLH_glcm_Contrast', 'wavelet.LLH_glszm_GrayLevelVariance',
       'wavelet.LLL_glcm_JointEnergy',
       'wavelet.LLL_gldm_SmallDependenceLowGrayLevelEmphasis'],
      dtype='object')
in fit_and_predict with LinearSVR(random_state=42) (type: <class 'sklearn.svm._classes.LinearSVR'>) and feats sel f_reg: 
Use features:  Index(['original_shape_MinorAxisLength', 'wavelet.LLH_glcm_Contrast',
       'wavelet.HHL_gldm_DependenceNonUniformity',
       'wavelet.LLL_firstorder_10Percentile'],
      dtype='object')
in fit_and_predict with LinearSVR(random_state=42) (type: <class 'sklearn.svm._classes.LinearSVR'>) and feats sel f_reg: 
Use features:  Index(['wavelet.LLH_glcm_Contrast', 'wavelet.LLH_glszm_GrayLevelVariance',
       'wavelet.LLL_firstorder_Uniformity',
       'wavelet.LLL_gldm_SmallDependenceLowGrayLevelEmphasis'],

0

In [12]:
# Mean per-fold concordance index for each nr_feat (get_kbest actually selects nr_feat + 1 features).
df_result_SVR

Unnamed: 0,nr_feat,c_idx_svr
0,3.0,0.514625
1,4.0,0.505583
2,5.0,0.517348
3,6.0,0.514272
4,7.0,0.543127
5,8.0,0.53434
6,9.0,0.539086
7,10.0,0.537736
8,11.0,0.554156
9,12.0,0.543441


# Read in all inner loop results

In [13]:
# Collect the inner grid-search results written by fit_and_predict (one CSV
# per feature count) and tag every row with the feature count it belongs to.
# Each file is read exactly once; the previous version loaded the k=4 file
# both before the loop and at i=4, duplicating its rows.
inner_dir = "../../Data/CV_test/"
inner_frames = []
for i in range(4, 35):
    new_file = pd.read_csv(inner_dir + "result_full_v1_f_reg_lasso" + str(i) + ".csv")
    new_file["nr_k"] = i
    inner_frames.append(new_file)

# Concatenate once instead of growing the frame inside the loop.
all_results_inner = pd.concat(inner_frames)
# Make sure the output folder exists before writing.
os.makedirs(inner_dir + "results", exist_ok=True)
all_results_inner.to_csv(inner_dir + "results/all_inner.csv")


In [15]:
# Ascending sort by mean inner-CV score: the best-scoring configurations are shown last.
all_results_inner.sort_values(by="mean_test_score")

Unnamed: 0.1,Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,nr_k
0,0,0.022602,0.021371,0.018703,0.023727,0.1,{'C': 0.1},0.305040,0.502646,0.457672,0.363395,0.518519,0.429454,0.082375,5,7
0,0,0.038771,0.043820,0.077610,0.086140,0.1,{'C': 0.1},0.294430,0.550265,0.476190,0.360743,0.484127,0.433151,0.092361,5,5
0,0,0.013736,0.005150,0.044852,0.033910,0.1,{'C': 0.1},0.286472,0.513228,0.457672,0.366048,0.550265,0.434737,0.096679,5,12
0,0,0.019405,0.011553,0.009879,0.004019,0.1,{'C': 0.1},0.291777,0.507937,0.465608,0.366048,0.555556,0.437385,0.095947,5,11
0,0,0.018536,0.010900,0.020197,0.030425,0.1,{'C': 0.1},0.320955,0.544974,0.473545,0.355438,0.500000,0.438982,0.086095,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,4,0.021751,0.015630,0.031936,0.024343,10.0,{'C': 10},0.716180,0.568783,0.560847,0.671088,0.621693,0.627718,0.059478,1,25
4,4,0.047014,0.043474,0.018064,0.012911,10.0,{'C': 10},0.663130,0.597884,0.568783,0.684350,0.637566,0.630343,0.042160,1,33
4,4,0.023099,0.010725,0.023063,0.008030,10.0,{'C': 10},0.676393,0.589947,0.571429,0.679045,0.640212,0.631405,0.044016,1,32
4,4,0.026493,0.021921,0.028399,0.021647,10.0,{'C': 10},0.684350,0.582011,0.584656,0.673740,0.653439,0.635639,0.043856,1,30
