In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score

In [3]:
df = pd.read_csv('Clenned_dataset.csv')
df.head()

Unnamed: 0,N_Days,Status,Drug,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,age_y
0,2221,0,1,0,0,1,0,0,0.5,149.0,4.04,227.0,598.0,52.7,57.0,256.0,9.9,1,50
1,1230,0,1,1,1,0,1,0,0.5,219.0,3.93,22.0,663.0,45.0,75.0,220.0,10.8,2,54
2,4184,0,1,0,0,0,0,0,0.5,320.0,3.54,51.0,1243.0,122.45,80.0,225.0,10.0,2,32
3,2090,2,1,0,0,0,0,0,0.7,255.0,3.74,23.0,1024.0,77.5,58.0,151.0,10.2,2,45
4,2105,2,1,0,0,1,0,0,1.9,486.0,3.54,74.0,1052.0,108.5,109.0,151.0,11.5,1,59


In [4]:
x = df.drop('Stage', axis=1)
y = df['Stage']

In [5]:
df.head()

Unnamed: 0,N_Days,Status,Drug,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,age_y
0,2221,0,1,0,0,1,0,0,0.5,149.0,4.04,227.0,598.0,52.7,57.0,256.0,9.9,1,50
1,1230,0,1,1,1,0,1,0,0.5,219.0,3.93,22.0,663.0,45.0,75.0,220.0,10.8,2,54
2,4184,0,1,0,0,0,0,0,0.5,320.0,3.54,51.0,1243.0,122.45,80.0,225.0,10.0,2,32
3,2090,2,1,0,0,0,0,0,0.7,255.0,3.74,23.0,1024.0,77.5,58.0,151.0,10.2,2,45
4,2105,2,1,0,0,1,0,0,1.9,486.0,3.54,74.0,1052.0,108.5,109.0,151.0,11.5,1,59


In [6]:
s_col = ['N_Days','Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'age_y']
scl = StandardScaler()
x_scl = x.copy()
x_scl[s_col] = scl.fit_transform(x_scl[s_col])
x_scl.head()

Unnamed: 0,N_Days,Status,Drug,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,age_y
0,0.305846,0,1,0,0,1,0,0,0.5,-1.153187,1.454532,1.732843,-0.776983,-1.475838,-1.265931,0.005177,-0.922747,-0.028628
1,-0.601938,0,1,1,1,0,1,0,0.5,-0.791737,1.165424,-1.068339,-0.740849,-1.637106,-0.924927,-0.367397,0.072368,0.360391
2,2.10401,0,1,0,0,0,0,0,0.5,-0.270217,0.140406,-0.672074,-0.418421,-0.015003,-0.830204,-0.31565,-0.812179,-1.779215
3,0.185847,2,1,0,0,0,0,0,0.7,-0.605849,0.666056,-1.054675,-0.540165,-0.95643,-1.246986,-1.081497,-0.591042,-0.514902
4,0.199587,2,1,0,0,1,0,0,1.9,0.586935,0.140406,-0.357795,-0.524599,-0.30717,-0.280808,-1.081497,0.846347,0.846666


In [7]:
x_train, x_test, y_train, y_test = train_test_split(x_scl, y, train_size=0.25, random_state=42)

In [8]:
models = {
    'LogisticRegression' : LogisticRegression(), 
    'KNeighbors' : KNeighborsClassifier(), 
    'Decision Tree' : DecisionTreeClassifier(),
    'SVC' : SVC(probability=True),
    'AdaBoost' : AdaBoostClassifier(),
    'GradientBoosting' : GradientBoostingClassifier()
}

In [9]:
def validation(true, predicted, p_prob=None):
    auc = accuracy_score(true, predicted)
    f1 = f1_score(true, predicted, average='weighted')
    recall = recall_score(true, predicted, average='weighted')
    roc = roc_auc_score(true, p_prob, multi_class='ovr', average='weighted')
    return auc, f1, recall, roc
    

In [10]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train)
    
    y_tr_p = model.predict(x_train)
    y_te_p = model.predict(x_test)
    
    y_tr_prob = model.predict_proba(x_train)
    y_te_prob = model.predict_proba(x_test)
    
    model_tr_acc, model_tr_f1, model_tr_recall, model_tr_roc = validation(y_train, y_tr_p, y_tr_prob)
    model_te_acc, model_te_f1, model_te_recall, model_te_roc = validation(y_test, y_te_p, y_te_prob)
    
    
    print('--------------------------------------------------------')
    print('Model Selection for training set for ->', list(models.keys())[i])
    print('Accuracy : {:.4f}'.format(model_tr_acc))
    print('F1 : {:.4f}'.format(model_tr_f1))
    print('Recall : {:.4f}'.format(model_tr_recall))
    print('ROC : {:.4f}'.format(model_tr_roc))
    
    
    print('--------------------------------------------------------')
    print('Model Selection for test set for ->', list(models.keys())[i])
    print('Accuracy : {:.4f}'.format(model_te_acc))
    print('F1 : {:.4f}'.format(model_te_f1))
    print('Recall : {:.4f}'.format(model_te_recall))
    print('ROC : {:.4f}'.format(model_te_roc))
    
    

--------------------------------------------------------
Model Selection for training set for -> LogisticRegression
Accuracy : 0.6032
F1 : 0.6001
Recall : 0.6032
ROC : 0.7754
--------------------------------------------------------
Model Selection for test set for -> LogisticRegression
Accuracy : 0.5907
F1 : 0.5870
Recall : 0.5907
ROC : 0.7645
--------------------------------------------------------
Model Selection for training set for -> KNeighbors
Accuracy : 0.8957
F1 : 0.8958
Recall : 0.8957
ROC : 0.9844
--------------------------------------------------------
Model Selection for test set for -> KNeighbors
Accuracy : 0.8472
F1 : 0.8474
Recall : 0.8472
ROC : 0.9440
--------------------------------------------------------
Model Selection for training set for -> Decision Tree
Accuracy : 0.9954
F1 : 0.9954
Recall : 0.9954
ROC : 1.0000
--------------------------------------------------------
Model Selection for test set for -> Decision Tree
Accuracy : 0.8803
F1 : 0.8802
Recall : 0.8803
R

In [11]:
knn_param = {
    'n_neighbors': list(range(1, 31)),               
    'weights': ['uniform', 'distance'],             
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  
    'p': [1, 2],                                       
    'leaf_size': list(range(10, 61, 10)),             
    'metric': ['minkowski'] 
}

ada_param = {
    'n_estimators': [50, 100, 150, 200, 300, 400, 500],          
    'learning_rate': [0.01,0.2 ,0.05, 0.1, 0.5, 1.0], 
    'algorithm': ['SAMME', 'SAMME.R'], 
    'random_state' :  [12, None, 25, 42]
}

gbc_param = {
    'n_estimators': [100, 200, 300, 400, 500, 600],                 
    'learning_rate': [0.01,0.2, 0.05, 0.1, 0.2, 1.0, 1.05],         
    'max_depth': [3, 4, 5, 6,7, 8, 9],                        
    'min_samples_split': [2, 5, 10, 12],                
    'min_samples_leaf': [1, 3, 5, 7, 9, 11],                    
    'subsample': [0.6, 0.8, 1.0],                     
    'max_features': ['sqrt', 'log2', None]            
}

In [12]:
random_cv = [
    ("KNN", KNeighborsClassifier(), knn_param),
    ('GradientBoost', GradientBoostingClassifier(), gbc_param )
]

In [13]:
models_param={}
for name, model, param in random_cv:
    random = RandomizedSearchCV(estimator=model, param_distributions=param, n_iter=150, cv=4, verbose=2, n_jobs=-1)
    random.fit(x_train, y_train)
    models_param[name] = random.best_params_
    
    
for model_name in models_param:
    print(f'------------------------Best Param for {model_name}-----------------------------------------')
    print(models_param[model_name])

Fitting 4 folds for each of 150 candidates, totalling 600 fits
Fitting 4 folds for each of 150 candidates, totalling 600 fits
------------------------Best Param for KNN-----------------------------------------
{'weights': 'distance', 'p': 1, 'n_neighbors': 5, 'metric': 'minkowski', 'leaf_size': 30, 'algorithm': 'auto'}
------------------------Best Param for GradientBoost-----------------------------------------
{'subsample': 1.0, 'n_estimators': 600, 'min_samples_split': 12, 'min_samples_leaf': 5, 'max_features': None, 'max_depth': 6, 'learning_rate': 0.1}


In [15]:
models_ = {
    'Gbc' : GradientBoostingClassifier(subsample=1.0 , n_estimators= 600, min_samples_split= 12, min_samples_leaf=5, max_features= None, max_depth=7, learning_rate=0.1)
}

for i in range(len(list(models_))):
    mdl = list(models_.values())[i]
    mdl.fit(x_train, y_train)
    
    y_tr_p = mdl.predict(x_train)
    y_te_p = mdl.predict(x_test)
    
    y_tr_prob = mdl.predict_proba(x_train)
    y_te_prob = mdl.predict_proba(x_test)
    
    model_tr_acc, model_tr_f1, model_tr_recall, model_tr_roc = validation(y_train, y_tr_p, y_tr_prob)
    model_te_acc, model_te_f1, model_te_recall, model_te_roc = validation(y_test, y_te_p, y_te_prob)
    
    
    print('--------------------------------------------------------')
    print('Model Selection for training set for ->', list(models_.keys())[i])
    print('Accuracy : {:.4f}'.format(model_tr_acc))
    print('F1 : {:.4f}'.format(model_tr_f1))
    print('Recall : {:.4f}'.format(model_tr_recall))
    print('ROC : {:.4f}'.format(model_tr_roc))
    
    
    print('--------------------------------------------------------')
    print('Model Selection for test set for ->', list(models_.keys())[i])
    print('Accuracy : {:.4f}'.format(model_te_acc))
    print('F1 : {:.4f}'.format(model_te_f1))
    print('Recall : {:.4f}'.format(model_te_recall))
    print('ROC : {:.4f}'.format(model_te_roc))
    

--------------------------------------------------------
Model Selection for training set for -> Gbc
Accuracy : 0.9954
F1 : 0.9954
Recall : 0.9954
ROC : 1.0000
--------------------------------------------------------
Model Selection for test set for -> Gbc
Accuracy : 0.9493
F1 : 0.9493
Recall : 0.9493
ROC : 0.9926


In [16]:
new_patient_data = {
    'N_Days': 1500,            # Number of days followed up
    'Status': 0,               # 0 = Alive, 1 = Censored, 2 = Dead
    'Drug': 1,                 # 1 = D-penicillamine
    'Sex': 0,                  # 0 = Male, 1 = Female
    'Ascites': 0,              # 0 = No, 1 = Yes
    'Hepatomegaly': 1,         # 0 = No, 1 = Yes
    'Spiders': 0,              # 0 = No, 1 = Yes
    'Edema': 0,                # 0 = No, 1 = Yes
    'Bilirubin': 0.8,          # in mg/dL
    'Cholesterol': 250.0,      # in mg/dL
    'Albumin': 3.5,            # in g/dL
    'Copper': 85.0,            # in µg/dL
    'Alk_Phos': 1000.0,        # in U/L
    'SGOT': 80.0,              # in U/mL
    'Tryglicerides': 120.0,    # in mg/dL
    'Platelets': 180.0,        # in 1000s/mm3
    'Prothrombin': 10.5,       # in seconds
    'age_y': 45                # Age in years
}

pt_df = pd.DataFrame([new_patient_data])

s_col = ['N_Days','Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'age_y']
pt_df[s_col] = scl.transform(pt_df[s_col])

proba = models_['Gbc'].predict_proba(pt_df)

predict = models_['Gbc'].predict(pt_df)[0]

for cls_idx, prb in enumerate(proba[0]):
    print(f'Stage {cls_idx} : {prb*100:.2f}% confidence')
    
print(f"\n🩺 Final Predicted Stage: Stage {predict} ")



Stage 0 : 0.00% confidence
Stage 1 : 99.59% confidence
Stage 2 : 0.40% confidence

🩺 Final Predicted Stage: Stage 2 


In [17]:
final_model = models_['Gbc']
joblib.dump(final_model, "model_s.pkl")
joblib.dump(scl, "scl.pkl")
print("saved!")


saved!
