<a href="https://colab.research.google.com/github/JEMurcia/ACA/blob/master/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Modules

In [2]:
!pip install pySurvival

Collecting pySurvival
[?25l  Downloading https://files.pythonhosted.org/packages/13/dd/d7bf69b6e1e0d1cd243b39577867c15d092404d5bc7afef3ae135b50717f/pysurvival-0.1.2.tar.gz (4.7MB)
[K     |████████████████████████████████| 4.8MB 4.7MB/s 
Collecting progressbar (from pySurvival)
  Downloading https://files.pythonhosted.org/packages/a3/a6/b8e451f6cff1c99b4747a2f7235aa904d2d49e8e1464e0b798272aa84358/progressbar-2.5.tar.gz
Building wheels for collected packages: pySurvival, progressbar
  Building wheel for pySurvival (setup.py) ... [?25l[?25hdone
  Created wheel for pySurvival: filename=pysurvival-0.1.2-cp36-cp36m-linux_x86_64.whl size=3774762 sha256=e478aa20015167c756f953db2e8dae8fe25ffbe4f95f7f3a5d2b3d51258fc03c
  Stored in directory: /root/.cache/pip/wheels/6c/23/e8/6feb0c4432219666bdd5d33828d7d9f429c4726f34c6fa8061
  Building wheel for progressbar (setup.py) ... [?25l[?25hdone
  Created wheel for progressbar: filename=progressbar-2.5-cp36-none-any.whl size=12073 sha256=089bd77647e

In [0]:
# Importing modules
import os
from pathlib import Path
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pysurvival.datasets import Dataset
from pysurvival.models import survival_forest, non_parametric, semi_parametric, svm
from pysurvival.utils import metrics, display
from pysurvival.utils.display import correlation_matrix
from sklearn.model_selection import train_test_split
from pysurvival.models import *
from pysurvival.utils import *

# Survival Class

In [0]:
class surv_models():
    """Train, persist and evaluate pySurvival models on the renal-survival CSV.

    Constructing an instance loads and cleans the data (``build_data``) and
    splits it into train/test partitions (``sample_data``).  Models are fitted
    on demand with the ``train_*`` methods, or assigned to the ``cox`` /
    ``rsf`` / ``svm`` attributes after ``load_m``, and are scored on the
    held-out partition by the ``cv_*`` methods and ``compare_all``.
    """

    # CSV read by ``build_data`` when the caller does not supply a path.
    DEFAULT_CSV = 'survival_renal_17oct_unders.csv'

    def __init__(self, csv_path=None, test_size=0.3, random_state=None):
        """Load the data, split it, and initialise the model slots to ``None``.

        Parameters
        ----------
        csv_path : str or path-like, optional
            CSV file to read; defaults to ``DEFAULT_CSV`` in the working directory.
        test_size : float, optional
            Fraction of rows held out for evaluation (default 0.3).
        random_state : int, optional
            Seed forwarded to ``train_test_split``; pass a value to make the
            split reproducible.  ``None`` keeps the unseeded behaviour.
        """
        # ======================================== DATA ACQUISITION ========================================
        self.dataset, self.features, self.time_column, self.event_column = self.build_data(csv_path)
        # ========================================== DATA SAMPLING =========================================
        self.sample_data(test_size=test_size, random_state=random_state)
        self.cox = None
        self.rsf = None
        self.svm = None

    def build_data(self, csv_path=None):
        """Read the CSV, one-hot encode the categorical columns and clean the rows.

        Parameters
        ----------
        csv_path : str or path-like, optional
            CSV file to read; defaults to ``DEFAULT_CSV``.

        Returns
        -------
        tuple
            ``(dataset, features, time_column, event_column)`` where ``dataset``
            is the cleaned DataFrame, ``features`` the list of predictor column
            names, and the last two the names of the time and event columns.
        """
        if csv_path is None:
            csv_path = self.DEFAULT_CSV

        # Reading Data
        raw_dataset = pd.read_csv(csv_path)
        print(raw_dataset.head())

        # Names of the time and event columns
        time_column = 'dias'
        event_column = 'outcome'

        # Creating one-hot vectors
        category_columns = ['outcome', 'muerte', 'trr', 'genero', 'hipertension', 'diabetes']
        dataset = pd.get_dummies(raw_dataset, columns=category_columns, drop_first=True)
        # Positional rename back to the plain column names.
        # NOTE(review): this relies on every categorical column having exactly
        # two levels and on the remaining columns appearing in this order in
        # the CSV; otherwise pandas raises a length mismatch or the labels end
        # up on the wrong columns -- confirm against the data file.
        dataset.columns = ['bi_id', 'dias', 'edad', 'creatinina', 'peso', 'imc', 'causa', 'outcome', 'muerte', 'trr', 'genero',
                           'hipertension', 'diabetes']
        print(dataset.head())

        # Creating the features
        features = ['genero', 'edad', 'creatinina', 'peso', 'imc', 'hipertension', 'diabetes']

        # ======================================== PREPROCESSING ========================================

        # Null values are counted over the feature columns only, but rows with
        # a null in ANY column are dropped.
        N_null = int(dataset[features].isnull().sum().sum())
        print("The dataset contains {} null values".format(N_null))
        dataset = dataset.dropna()

        # Removing duplicates if there exist
        N_dupli = int(dataset.duplicated(keep='first').sum())
        dataset = dataset.drop_duplicates(keep='first').reset_index(drop=True)
        print("The dataset contains {} duplicates".format(N_dupli))

        return dataset, features, time_column, event_column

    def sample_data(self, test_size=0.3, random_state=None):
        """Split ``self.dataset`` into train/test partitions and store X, T and E.

        Parameters
        ----------
        test_size : float, optional
            Fraction of rows assigned to the test partition (default 0.3).
        random_state : int, optional
            Seed for the shuffle; ``None`` leaves the split unseeded.
        """
        # Building training and testing sets from row positions
        row_positions = list(range(self.dataset.shape[0]))
        index_train, index_test = train_test_split(row_positions, test_size=test_size,
                                                   random_state=random_state)
        # The indices are positions, so select with iloc rather than by label.
        data_train = self.dataset.iloc[index_train].reset_index(drop=True)
        data_test = self.dataset.iloc[index_test].reset_index(drop=True)

        # Creating the X, T and E inputs
        self.X_train, self.X_test = data_train[self.features], data_test[self.features]  # FEATURES DATA
        self.T_train, self.T_test = data_train[self.time_column], data_test[self.time_column]  # SURVIVAL TIME
        self.E_train, self.E_test = data_train[self.event_column], data_test[self.event_column]  # EVENT OCCURRENCE

    # ======================================== MODELS ========================================
    def train_cox(self):
        """Fit a Cox proportional-hazards model on the training partition.

        The fitted model is stored in ``self.cox`` and returned.
        """
        cox = semi_parametric.CoxPHModel()
        cox.fit(self.X_train, self.T_train, self.E_train, lr=0.5, l2_reg=1e-2, init_method='zeros')
        self.cox = cox
        return cox

    def train_rsf(self):
        """Fit a 200-tree random survival forest on the training partition.

        The fitted model is stored in ``self.rsf`` and returned.
        """
        rsf = survival_forest.RandomSurvivalForestModel(num_trees=200)
        rsf.fit(self.X_train, self.T_train, self.E_train, max_features='sqrt', max_depth=5, min_node_size=30)
        self.rsf = rsf
        return rsf

    def train_svm(self):
        """Fit a kernel survival SVM on the training partition.

        The fitted model is stored in ``self.svm`` and returned.
        """
        svm_model = svm.KernelSVMModel()
        svm_model.fit(self.X_train, self.T_train, self.E_train)
        self.svm = svm_model
        return svm_model

    # ======================================= SAVING/LOADING MODELS =========================================
    # MODEL: pySurvival model
    # MODELNAME : a str specifying the name of the file without format e.g Cox_pySurvival
    def save_m(self, model, modelname):
        """Save ``model`` with pySurvival's ``save_model``.

        When the code runs from a script the target is
        ``<script dir>/../Data/Survival/<modelname>``.  In a notebook kernel
        ``__file__`` is not defined, so the model is written to the current
        working directory instead, which is where ``load_m`` looks for it.
        """
        try:
            base_path = Path(__file__).parent.resolve()
        except NameError:
            # Notebook / interactive session: there is no module file.
            survival_path = (Path.cwd() / modelname).resolve()
        else:
            survival_path = (base_path / "../Data/Survival" / modelname).resolve()
        save_model(model, str(survival_path))

    def load_m(self, modelname):
        """Load ``<modelname>.zip`` with pySurvival's ``load_model`` and return it.

        ``modelname`` may be given with or without the ``.zip`` suffix; it is
        resolved relative to the current working directory unless absolute.
        """
        file_name = str(modelname)
        if not file_name.endswith(".zip"):
            file_name += ".zip"
        return load_model(file_name)

    # ======================================== HOLD-OUT EVALUATION ========================================
    # C-INDEX METRIC: 1 - good ; 0 poor
    # BRIER SCORE : average discrepancies between the status and the estimated probabilities at a given time
    def _evaluate(self, model, trainer_name):
        """Return ``(c_index, integrated_brier_score)`` of ``model`` on the test partition.

        Raises
        ------
        RuntimeError
            If ``model`` is ``None``, i.e. it was neither trained nor loaded.
        """
        if model is None:
            raise RuntimeError(
                "Model is not available: call {}() or assign a loaded model first.".format(trainer_name))
        c_index = metrics.concordance_index(model, self.X_test, self.T_test, self.E_test)
        ibs = metrics.integrated_brier_score(model, self.X_test, self.T_test, self.E_test,
                                             t_max=max(self.T_test))
        return c_index, ibs

    def cv_cox(self):
        """Score the Cox model on the test partition; returns ``(c_index, ibs)``."""
        return self._evaluate(self.cox, 'train_cox')

    def cv_rsf(self):
        """Score the random survival forest on the test partition; returns ``(c_index, ibs)``."""
        return self._evaluate(self.rsf, 'train_rsf')

    def cv_svm(self):
        """Score the survival SVM on the test partition; returns ``(c_index, ibs)``.

        NOTE(review): confirm that the SVM model exposes the survival-function
        predictions the integrated Brier score needs.
        """
        return self._evaluate(self.svm, 'train_svm')

    def compare_all(self):
        """Print C-index and integrated Brier score for all three models and
        plot predicted versus actual counts for each of them."""
        cox_cindex, cox_ibs = self.cv_cox()
        rsf_cindex, rsf_ibs = self.cv_rsf()
        svm_cindex, svm_ibs = self.cv_svm()
        # C-INDEX COMPARISON
        print('C-INDEX \n\t - COX: {:.3f} \n\t - RSF: {:.3f} \n\t - SVM: {:.3f}'.format(cox_cindex, rsf_cindex, svm_cindex))
        # BRIER SCORE
        print('BRIER SCORE (INTEGRATED) \n\t - COX: {:.3f} \n\t - RSF: {:.3f} \n\t - SVM: {:.3f}'.format(cox_ibs, rsf_ibs, svm_ibs))
        # RISK - OVERALL PREDICTIONS (one figure per model, same order as above)
        for model in (self.cox, self.rsf, self.svm):
            display.compare_to_actual(model, self.X_test, self.T_test, self.E_test, figure_size=(16, 6),
                                      metrics=['rmse', 'mean', 'median'])

In [11]:
survivals = surv_models()
# Store the loaded forest under `rsf`: that is the attribute cv_rsf() and
# compare_all() read; an ad-hoc `csf` attribute would never be used.
survivals.rsf = survivals.load_m('RSF_renal_17oct')

         bi_id  dias  outcome  muerte  ...        imc hipertension  diabetes  causa
0  CC-14942551  2326    False   False  ...  28.178696         True      True      0
1  CC-17068017  2423    False   False  ...  26.229166         True     False      0
2  CC-17194675  2389    False   False  ...  27.142273         True     False      0
3  CC-20009316  2416    False   False  ...  22.922500         True     False      0
4  CC-20209077  2458    False   False  ...  25.299091         True      True      0

[5 rows x 13 columns]
         bi_id  dias  edad  creatinina  ...  trr  genero  hipertension  diabetes
0  CC-14942551  2326    72    1.944375  ...    0       1             1         1
1  CC-17068017  2423    77    1.188000  ...    0       1             1         0
2  CC-17194675  2389    71    1.488889  ...    0       1             1         0
3  CC-20009316  2416    85    0.928333  ...    0       0             1         0
4  CC-20209077  2458    81    1.248667  ...    0       0            

ArrowIOError: ignored