In [None]:
import sys
import os

# Make the parent of the notebook's working directory importable so that the
# `src.*` imports that follow can be resolved. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate sys.path entry.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.preprocessing import *
from src.models.models import *


In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from scipy.stats import ks_2samp
from itertools import product


from sklearn.tree import plot_tree
from sksurv.ensemble import RandomSurvivalForest
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored , concordance_index_ipcw
from sklearn.impute import SimpleImputer, KNNImputer
from sksurv.util import Surv
from sklearn.preprocessing import StandardScaler,StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV, train_test_split
from sksurv.nonparametric import kaplan_meier_estimator

from sksurv.ensemble import ComponentwiseGradientBoostingSurvivalAnalysis, GradientBoostingSurvivalAnalysis

from sklearn.ensemble import RandomForestRegressor

from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
import lightgbm as lgb

from prince import MCA

import optuna

import joblib

%matplotlib inline 
plt.style.use('dark_background')


In [6]:
# Load the raw CSV tables from the shared data directory. The training split
# has a clinical, a molecular and a target table; the test split has only the
# clinical and molecular tables.
DATA_DIR = '../Data'

clinical_train_df, molecular_train_df, target_df = (
    pd.read_csv(f'{DATA_DIR}/{table}_train.csv')
    for table in ('clinical', 'molecular', 'target')
)

clinical_test_df, molecular_test_df = (
    pd.read_csv(f'{DATA_DIR}/{table}_test.csv')
    for table in ('clinical', 'molecular')
)


### Train

In [13]:
# Drop every target row that has a missing value and renumber the rows from 0.
target_df = target_df.dropna().reset_index(drop=True)

# Encode the event indicator as 0/1 integers (via an intermediate boolean cast).
event_observed = target_df['OS_STATUS'].astype(bool)
target_df['OS_STATUS'] = event_observed.map({True:1,False:0})

In [18]:
# Build the survival target from the event-indicator and time columns of the
# cleaned target table; one entry per row of target_df.
y = Surv.from_dataframe(
    event='OS_STATUS',
    time='OS_YEARS',
    data=target_df,
)
y.shape

(3173,)

In [19]:
# Keep only the clinical rows whose patient ID still has a survival label.
clinical_train_df = clinical_train_df.loc[clinical_train_df['ID'].isin(target_df['ID'])]

# `y` is built row by row from target_df, while the feature matrix is derived
# from clinical_train_df. Nothing else ties the two together, so they must list
# exactly the same patients in exactly the same order; otherwise features would
# be silently paired with another patient's outcome. Fail loudly instead.
# NOTE(review): this checks the preprocessing *input*; it presumes
# finalize_preprocessing keeps the clinical row order -- confirm.
assert clinical_train_df['ID'].tolist() == target_df['ID'].tolist(), (
    "clinical_train_df and target_df are not row-aligned on 'ID'; "
    "reorder one of them before building X_train and y."
)


In [20]:
# Turn the labelled clinical table and the molecular table into the training
# feature matrix using the project's preprocessing entry point.
X_train = finalize_preprocessing(
    clinical_train_df,
    molecular_train_df,
)




In [21]:
# Display the preprocessed training features (pandas abbreviates the middle
# rows and columns of a large frame).
X_train

Unnamed: 0,BM_BLAST,WBC,ANC,MONOCYTES,HB,PLT,GENDER,NUM_ABNORMALITIES,CHROMOSOME_DIFF,Nb mut,...,mutant_aa_delP,mutant_aa_delV,mutant_aa_stop,mutant_aa_None,position,frameshift,stop_gain,stop_distance,deletion,insertion
0,0.153846,0.016861,0.001824,0.015837,0.285714,0.080745,1.0,0.047619,0.000000,0.529412,...,0.0,0.0,0.333333,0.0,0.663303,0.333333,0.333333,0.286845,0.0,0.0
1,0.010989,0.046693,0.021894,0.002262,0.603175,0.027605,0.0,0.000000,0.000000,0.176471,...,0.0,0.0,0.250000,0.0,0.229409,0.222222,0.250000,0.300692,0.0,0.0
2,0.164835,0.022698,0.019157,0.002262,0.809524,0.054520,1.0,0.047619,0.000000,0.176471,...,0.0,0.0,0.166667,0.0,0.156016,0.000000,0.166667,0.000000,0.0,0.0
3,0.010989,0.023995,0.017333,0.002262,0.388889,0.051760,1.0,0.047619,0.000000,0.647059,...,0.0,0.0,0.083333,1.0,0.494265,0.111111,0.083333,0.014837,0.0,0.0
4,0.065934,0.828794,0.088488,0.020362,0.563492,0.133195,0.0,0.047619,0.000000,0.058824,...,0.0,0.0,0.083333,0.0,0.054679,0.111111,0.083333,0.011869,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3168,0.010989,0.014916,0.009305,0.004525,0.492063,0.052450,1.0,0.142857,0.000227,0.235294,...,0.0,0.0,0.000000,0.0,0.213454,0.000000,0.000000,0.000000,0.0,0.0
3169,0.016484,0.051232,0.024266,0.010181,0.579365,0.026225,0.0,0.190476,0.000455,0.117647,...,0.0,0.0,0.000000,0.0,0.082018,0.000000,0.000000,0.000000,0.0,0.0
3170,0.000000,0.010376,0.005017,0.006561,0.428571,0.057971,1.0,0.095238,0.000000,0.352941,...,0.0,0.0,0.250000,0.0,0.240880,0.111111,0.250000,0.000000,0.0,0.0
3171,0.054945,0.007588,0.003375,0.002489,0.587302,0.069013,0.0,0.142857,0.000000,0.235294,...,0.0,0.0,0.000000,0.0,0.373092,0.000000,0.000000,0.000000,0.0,0.0


In [None]:
# Fit the 'rsf' model on the training features/targets and persist it under
# the artifacts directory.
# NOTE(review): the output recorded for this cell reports a save location
# under '../../artifacts', which matches neither the artifacts_dir passed here
# nor the path the model is later loaded from -- the output looks stale;
# re-run to confirm which file is actually being loaded.
train_and_save_model(
    'rsf',
    X_train,
    y,
    artifacts_dir='../artifacts',
)

Training RSF model...
RSF parameters: {'n_estimators': 266, 'min_samples_split': 37, 'min_samples_leaf': 23, 'max_features': 0.983414041321671, 'max_depth': 13}
RSF training complete.
Model saved at: ../../artifacts\rsf_model.pkl


### Test

In [None]:
# Build the test feature matrix with the same preprocessing entry point that
# is used for the training tables.
# NOTE(review): whether finalize_preprocessing re-fits its scaling/encoding on
# the tables it is given cannot be seen from this notebook -- confirm the test
# features are transformed consistently with the training features.
X_test = finalize_preprocessing(
    clinical_test_df,
    molecular_test_df,
)



In [4]:
# Display the preprocessed test features (pandas abbreviates the middle rows
# and columns of a large frame).
X_test

Unnamed: 0,BM_BLAST,WBC,ANC,MONOCYTES,HB,PLT,GENDER,NUM_ABNORMALITIES,CHROMOSOME_DIFF,Nb mut,...,mutant_aa_delP,mutant_aa_delV,mutant_aa_stop,mutant_aa_None,position,frameshift,stop_gain,stop_distance,deletion,insertion
0,0.747253,0.021077,0.005350,0.008869,0.285714,0.031746,1.0,0.047619,0.000227,0.235294,...,0.0,0.0,0.083333,0.333333,0.106253,0.111111,0.0,0.0,0.0,0.0
1,0.384615,0.019326,0.011314,0.006480,0.476190,0.020704,1.0,0.142857,0.000000,0.176471,...,0.0,0.0,0.000000,0.000000,0.084950,0.000000,0.0,0.0,0.0,0.0
2,0.065934,0.079118,0.079183,0.001792,0.658730,0.015873,0.0,0.047619,0.000227,0.176471,...,0.0,0.0,0.083333,0.000000,0.218370,0.111111,0.0,0.0,0.0,0.0
3,0.670330,0.034695,0.018733,0.008701,0.317460,0.028986,0.5,0.000000,0.000000,0.176471,...,0.0,0.0,0.083333,0.333333,0.071669,0.111111,0.0,0.0,0.0,0.0
4,0.021978,0.006550,0.006733,0.011855,0.365079,0.017253,1.0,0.190476,0.000682,0.176471,...,0.0,0.0,0.000000,0.000000,0.128849,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1188,0.065934,0.017510,0.011823,0.006516,0.349206,0.049689,0.5,0.000000,0.000000,0.117647,...,0.0,0.0,0.000000,0.333333,0.051315,0.000000,0.0,0.0,0.0,0.0
1189,0.065934,0.017510,0.011823,0.003665,0.349206,0.049689,0.5,0.000000,0.000000,0.176471,...,0.0,0.0,0.166667,0.000000,0.135058,0.222222,0.0,0.0,0.0,0.0
1190,0.065934,0.017510,0.011823,0.005339,0.349206,0.049689,0.5,0.000000,0.000000,0.058824,...,0.0,0.0,0.000000,0.000000,0.019750,0.000000,0.0,0.0,0.0,0.0
1191,0.065934,0.017510,0.011823,0.008285,0.349206,0.049689,0.5,0.000000,0.000000,0.176471,...,0.0,0.0,0.083333,0.000000,0.083226,0.111111,0.0,0.0,0.0,0.0


In [23]:
# Reload the persisted 'rsf' model from the artifacts directory and score
# every row of the test feature matrix.
rsf_model = load_model(
    'rsf',
    artifacts_dir='../artifacts',
)
predictions = rsf_model.predict(X_test)

Loaded RSF model from: ../artifacts\rsf_model.pkl


In [None]:
# Wrap the raw model output in a pandas Series (not a DataFrame) named
# 'risk_score' and indexed by the test patients' IDs, as a starting point for
# the submission.
# NOTE(review): assumes the rows of X_test are in the same order as
# clinical_test_df -- confirm against finalize_preprocessing.
predictions = pd.Series(predictions, index=clinical_test_df['ID'], name='risk_score')
predictions

ID
KYW1       1134.599045
KYW2       1147.483520
KYW3        627.934677
KYW4       1052.687232
KYW5       1415.912540
              ...     
KYW1189     654.171694
KYW1190     584.920491
KYW1191    1015.684523
KYW1192     562.263932
KYW1193    1048.131261
Name: risk_score, Length: 1193, dtype: float64