In [1]:
import pandas as pd
import csv
import os
import numpy as np
from pycaret.survival_analysis.oop import SurvivalAnalysisExperiment
from pycaret.regression.oop import RegressionExperiment
import lifelines

In [2]:
# Build a table of the data files found in the "csv" sub-directory of the
# current working directory.
current_dir = os.getcwd()
csv_dir = os.path.join(current_dir, 'csv')
# os.listdir() returns entries in arbitrary order and also returns
# sub-directories (e.g. ".ipynb_checkpoints"), so keep regular files only and
# sort them; positional lookups such as file_paths['absolute_path'][0] are
# then reproducible across machines and kernel restarts.
files = sorted(
    entry for entry in os.listdir(csv_dir)
    if os.path.isfile(os.path.join(csv_dir, entry))
)
# One row per file: its name and its absolute path.
file_paths = pd.DataFrame({
    'file_name': files,
    'absolute_path': [os.path.join(csv_dir, file) for file in files]
})

In [3]:
# Read the first file listed in file_paths; the CSV's first column becomes the index.
first_csv_path = file_paths.loc[0, 'absolute_path']
df = pd.read_csv(first_csv_path, index_col=0)

In [4]:

# Every column after the first 16 is recorded in `features_to_drop` and
# removed from df; the trimmed frame is then displayed.
features_to_drop = df.columns[16:]
df = df.drop(columns=features_to_drop)
df


Unnamed: 0_level_0,ID,age,eventdeath,survival,timerecurrence,chemo,hormonal,amputation,histtype,diam,posnodes,grade,angioinv,lymphinfil,barcode,esr1
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
s122,18,43,0,14.817248,14.817248,0,0,1,1,25,0,2,3,1,6274,-0.413955
s123,19,48,0,14.261465,14.261465,0,0,0,1,20,0,3,3,1,6275,0.195251
s124,20,38,0,6.644764,6.644764,0,0,0,1,15,0,2,1,1,6276,0.596177
s125,21,50,0,7.748118,7.748118,0,1,0,1,15,1,2,3,1,6277,0.501286
s126,22,38,0,6.436687,6.318960,0,0,1,1,15,0,2,2,1,6278,-0.066771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
s71,308,48,1,2.608300,1.982204,1,0,1,1,30,0,3,1,3,4321,-1.235442
s72,309,39,1,5.504100,3.028063,0,0,1,1,30,0,2,1,1,4322,0.062265
s73,310,50,1,2.619200,2.149213,0,0,1,1,27,0,3,1,1,4323,-0.635322
s75,311,52,1,2.290500,2.209446,0,1,1,1,28,0,3,1,1,4325,-1.055801


In [5]:
# Show the column labels that were removed from df (everything past the first 16 columns).
features_to_drop

Index(['G3PDH_570', 'Contig45645_RC', 'Contig44916_RC', 'D25272', 'J00129',
       'Contig29982_RC', 'Contig56678_RC', 'Contig53047_RC', 'Contig19551',
       'Contig47230_RC',
       ...
       'Contig36312_RC', 'Contig38980_RC', 'NM_000853', 'NM_000854',
       'NM_000860', 'Contig29014_RC', 'Contig46616_RC', 'NM_000888',
       'NM_000898', 'AF067420'],
      dtype='object', length=1553)

In [6]:

# Create the survival-analysis experiment and configure it on df.
# NOTE(review): as written this call fails — setup() raises
# "TypeError: ... got an unexpected keyword argument 'duration'", so the
# installed SurvivalAnalysisExperiment.setup() does not accept `duration`.
# Check its signature (e.g. help(exp1.setup)) for how the duration and event
# columns are meant to be passed — TODO confirm and fix.
# NOTE(review): `survival` stays among the features although it equals
# `timerecurrence` in several of the rows displayed above — check for leakage.
exp1 = SurvivalAnalysisExperiment()
exp1.setup(data=df,
            duration="timerecurrence",  # presumably the time-to-event column — verify
            target="eventdeath",  # presumably the event indicator (0/1 in the rows shown) — verify
            ignore_features=['ID', 'Patient'],  # NOTE(review): 'Patient' looks like df's index name, not a column — confirm it is accepted
            log_data=True,
            fold=20
           )

TypeError: SurvivalAnalysisExperiment.setup() got an unexpected keyword argument 'duration'

In [None]:
# List the estimators available in this experiment. Leaving the DataFrame as
# the cell's last expression (instead of print()) lets Jupyter render it as a
# single table rather than a wrapped, truncated text dump.
exp1.models()

                                   Name  \
ID                                        
cox                      Cox Regression   
aa                       Aalen Additive   
cs                           CRC Spline   
ctv                    Cox Time Varying   
ggr        Generalized Gamma Regression   
llaft                  Log Logistic AFT   
lnaft                    Log Normal AFT   
pwer   Piecewise Exponential Regression   
waft                        Weibull AFT   

                                               Reference  Turbo  
ID                                                               
cox    lifelines.utils.sklearn_adapter.SkLearnCoxPHFi...   True  
aa     lifelines.utils.sklearn_adapter.SkLearnAalenAd...   True  
cs     lifelines.utils.sklearn_adapter.SkLearnCRCSpli...   True  
ctv    lifelines.utils.sklearn_adapter.SkLearnCoxTime...   True  
ggr    lifelines.utils.sklearn_adapter.SkLearnGeneral...   True  
llaft  lifelines.utils.sklearn_adapter.SkLearnLogLogi...   True 

In [None]:
# Create the model registered under ID 'cox' (listed as "Cox Regression" by exp1.models()).
model = exp1.create_model('cox')

Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Run tune_model() on the current model and rebind `model` to whatever it returns.
# NOTE(review): because the input name is overwritten, re-running this cell
# feeds the previous result back into tuning. Per the log below, tune_model()
# hands back the original model when the tuned one is not better.
model = exp1.tune_model(model)

Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 20 folds for each of 4 candidates, totalling 80 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [None]:
# Run predict_model() with the current model on df and display the returned frame.
# NOTE(review): df is the same frame that was passed to setup(), so these look like
# in-sample predictions rather than a held-out evaluation — confirm this is intended.
prediction = exp1.predict_model(model, data=df)
prediction

Unnamed: 0,age,eventdeath,survival,chemo,hormonal,amputation,histtype,diam,posnodes,grade,angioinv,lymphinfil,barcode,timerecurrence,Label
0,43.0,0.0,14.817248,0.0,0.0,1.0,1.0,25.0,0.0,2.0,3.0,1.0,6274.0,14.817248,17.874729
1,48.0,0.0,14.261465,0.0,0.0,0.0,1.0,20.0,0.0,3.0,3.0,1.0,6275.0,14.261465,17.619638
2,38.0,0.0,6.644764,0.0,0.0,0.0,1.0,15.0,0.0,2.0,1.0,1.0,6276.0,6.644764,7.255338
3,50.0,0.0,7.748118,0.0,1.0,0.0,1.0,15.0,1.0,2.0,3.0,1.0,6277.0,7.748118,9.771463
4,38.0,0.0,6.436687,0.0,0.0,1.0,1.0,15.0,0.0,2.0,2.0,1.0,6278.0,6.318960,6.275861
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,48.0,1.0,2.608300,1.0,0.0,1.0,1.0,30.0,0.0,3.0,1.0,3.0,4321.0,1.982204,1.749354
268,39.0,1.0,5.504100,0.0,0.0,1.0,1.0,30.0,0.0,2.0,1.0,1.0,4322.0,3.028063,5.011133
269,50.0,1.0,2.619200,0.0,0.0,1.0,1.0,27.0,0.0,3.0,1.0,1.0,4323.0,2.149213,1.221810
270,52.0,1.0,2.290500,0.0,1.0,1.0,1.0,28.0,0.0,3.0,1.0,1.0,4325.0,2.209446,1.594492


In [None]:
# Request a plot for the model.
# NOTE(review): plot='test' is rejected with "ValueError: Plot Not Available";
# choose a plot name listed in the plot_model docstring (help(exp1.plot_model)) — TODO fix.
exp1.plot_model(model, plot='test')

ValueError: Plot Not Available. Please see docstring for list of available Plots.