##WORKING CATBOOST WITH PROPER LIKELIHOOD

In [127]:
!pip install catboost shap joblib



In [128]:
!pip install imbalanced-learn



In [129]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score
from scipy.special import expit
import joblib
from imblearn.over_sampling import SMOTE

In [130]:
  FEATURES = [
    'Glucose','Cholesterol','Hemoglobin','Platelets','White Blood Cells',
    'Red Blood Cells','Hematocrit','Mean Corpuscular Volume',
    'Mean Corpuscular Hemoglobin','Mean Corpuscular Hemoglobin Concentration',
    'Insulin','BMI','Systolic Blood Pressure','Diastolic Blood Pressure',
    'Triglycerides','HbA1c','LDL Cholesterol','HDL Cholesterol',
    'ALT','AST','Heart Rate','Creatinine','Troponin','C-reactive Protein'
]

In [131]:
CLASS_NAMES = ['Anemia','Diabetes','Healthy','Thalasse','Thromboc']
NUM_CLASSES = len(CLASS_NAMES)

In [132]:
TRAIN_PATH = '/content/drive/MyDrive/Datasets/blood_samples_dataset_test.csv'

In [133]:
df1 = pd.read_csv(TRAIN_PATH)

In [134]:
df1.drop_duplicates(inplace=True)

In [135]:
df1.shape

(486, 25)

In [136]:
SECOND_PATH = "/content/drive/MyDrive/Datasets/Blood_samples_dataset_balanced_2(f).csv"

In [137]:
df2 = pd.read_csv(SECOND_PATH)

In [138]:
df2.drop_duplicates(inplace=True)

In [139]:
df2.shape

(65, 25)

In [140]:
df1 = df1[df1['Disease'].isin(CLASS_NAMES)]

In [141]:
df = pd.concat([df1, df2], ignore_index=True)

In [142]:
df.duplicated().sum()

np.int64(0)

In [143]:
df.shape

(512, 25)

In [144]:
le = LabelEncoder()
le.fit(CLASS_NAMES)

In [145]:
df['Disease'] = df['Disease'].astype(str)
disease_labels = df['Disease'].unique()
label_map = {i: disease_labels[i] for i in range(len(disease_labels))}
df['target'] = df['Disease'].map({v:k for k,v in label_map.items()})

In [146]:
feature_cols = df.columns.drop(['Disease', 'target'])
X = df[feature_cols]
y = df['target']

In [147]:
categorical_features = []

In [148]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [149]:
classes = np.unique(y)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
class_weights = dict(zip(classes, weights))

In [150]:
model = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.05,
    l2_leaf_reg=5,
    loss_function="MultiClass",
    eval_metric="Accuracy",
    class_weights=class_weights,
    random_strength=1.5,
    colsample_bylevel=0.8,
    verbose=50
)

In [152]:
model.fit(
    X_train, y_train,
    cat_features=categorical_features,
    eval_set=(X_test, y_test),
    early_stopping_rounds=80
)

0:	learn: 0.4876118	test: 0.1765525	best: 0.1765525 (0)	total: 81.4ms	remaining: 40.6s
50:	learn: 0.8974682	test: 0.6868420	best: 0.7120565 (41)	total: 3.52s	remaining: 31s
100:	learn: 0.9617527	test: 0.6599051	best: 0.7573790 (76)	total: 6.02s	remaining: 23.8s
150:	learn: 0.9870622	test: 0.7874523	best: 0.7874523 (141)	total: 8.77s	remaining: 20.3s
200:	learn: 0.9934503	test: 0.8164969	best: 0.8164969 (189)	total: 11.4s	remaining: 17s
250:	learn: 0.9967656	test: 0.8324555	best: 0.8324555 (201)	total: 15.7s	remaining: 15.5s
Stopped by overfitting detector  (80 iterations wait)

bestTest = 0.832455542
bestIteration = 201

Shrink model to first 202 iterations.


<catboost.core.CatBoostClassifier at 0x7e8ddc2c8a40>

In [153]:
pred_train_class = model.predict(X_train).flatten().astype(int)

In [154]:
pred_class = model.predict(X_test).flatten().astype(int)

In [155]:
print("Training Accuracy:", accuracy_score(y_train, pred_train_class))
print(classification_report(y_train, pred_train_class, target_names=[label_map[i] for i in range(len(label_map))]))

Training Accuracy: 0.9853300733496333
              precision    recall  f1-score   support

    Thalasse       0.98      1.00      0.99        49
    Diabetes       1.00      0.98      0.99       248
      Anemia       0.96      0.99      0.98        80
    Thromboc       0.94      1.00      0.97        15
     Healthy       0.94      1.00      0.97        17

    accuracy                           0.99       409
   macro avg       0.97      0.99      0.98       409
weighted avg       0.99      0.99      0.99       409



In [156]:
print("Accuracy:", accuracy_score(y_test, pred_class))
print(classification_report(y_test, pred_class, target_names=[label_map[i] for i in range(len(label_map))]))

Accuracy: 0.941747572815534
              precision    recall  f1-score   support

    Thalasse       0.93      1.00      0.96        13
    Diabetes       0.97      0.97      0.97        62
      Anemia       0.95      0.95      0.95        20
    Thromboc       1.00      0.50      0.67         4
     Healthy       0.60      0.75      0.67         4

    accuracy                           0.94       103
   macro avg       0.89      0.83      0.84       103
weighted avg       0.95      0.94      0.94       103



In [157]:
raw_logits = model.predict(X_test, prediction_type='RawFormulaVal')
independent_probs = expit(raw_logits)

In [158]:
# Example: first patient
i = 0
likelihood_output = {
    label_map[c]: float(independent_probs[i][c])
    for c in range(len(label_map))
}

In [159]:
print(likelihood_output)

{'Thalasse': 0.40320422610408435, 'Diabetes': 0.9028043852844754, 'Anemia': 0.3275776294846421, 'Thromboc': 0.3798787863783129, 'Healthy': 0.34809482514190954}


In [160]:
joblib.dump(model, "mediguard_catboost_scaled.pkl")
print("Model saved as mediguard_catboost_scaled.pkl")

Model saved as mediguard_catboost_scaled.pkl


In [161]:
import numpy as np
import pandas as pd

# Sample input for one patient (all values scaled between 0 and 1)
sample_input = pd.DataFrame([[
    0.55,  # Glucose
    0.40,  # Cholesterol
    0.60,  # Hemoglobin
    0.70,  # Platelets
    0.50,  # White Blood Cells
    0.48,  # Red Blood Cells
    0.52,  # Hematocrit
    0.45,  # Mean Corpuscular Volume
    0.50,  # Mean Corpuscular Hemoglobin
    0.47,  # Mean Corpuscular Hemoglobin Concentration
    0.35,  # Insulin
    0.55,  # BMI
    0.60,  # Systolic Blood Pressure
    0.58,  # Diastolic Blood Pressure
    0.62,  # Triglycerides
    0.50,  # HbA1c
    0.45,  # LDL Cholesterol
    0.50,  # HDL Cholesterol
    0.30,  # ALT
    0.35,  # AST
    0.55,  # Heart Rate
    0.48,  # Creatinine
    0.25,  # Troponin
    0.40   # C-reactive Protein
]], columns=[
    'Glucose','Cholesterol','Hemoglobin','Platelets','White Blood Cells',
    'Red Blood Cells','Hematocrit','Mean Corpuscular Volume',
    'Mean Corpuscular Hemoglobin','Mean Corpuscular Hemoglobin Concentration',
    'Insulin','BMI','Systolic Blood Pressure','Diastolic Blood Pressure',
    'Triglycerides','HbA1c','LDL Cholesterol','HDL Cholesterol',
    'ALT','AST','Heart Rate','Creatinine','Troponin','C-reactive Protein'
])

print(sample_input)

   Glucose  Cholesterol  Hemoglobin  Platelets  White Blood Cells  \
0     0.55          0.4         0.6        0.7                0.5   

   Red Blood Cells  Hematocrit  Mean Corpuscular Volume  \
0             0.48        0.52                     0.45   

   Mean Corpuscular Hemoglobin  Mean Corpuscular Hemoglobin Concentration  \
0                          0.5                                       0.47   

   ...  Triglycerides  HbA1c  LDL Cholesterol  HDL Cholesterol  ALT   AST  \
0  ...           0.62    0.5             0.45              0.5  0.3  0.35   

   Heart Rate  Creatinine  Troponin  C-reactive Protein  
0        0.55        0.48      0.25                 0.4  

[1 rows x 24 columns]


In [162]:
from scipy.special import expit

# Predict raw logits
raw_logits = model.predict(sample_input, prediction_type='RawFormulaVal')

# Convert to independent likelihoods
likelihoods = {label_map[c]: float(expit(raw_logits[0][c])) for c in range(len(label_map))}

print("Predicted Independent Likelihoods:")
print(likelihoods)

Predicted Independent Likelihoods:
{'Thalasse': 0.6461462010626736, 'Diabetes': 0.17806593549150018, 'Anemia': 0.4369730774186615, 'Thromboc': 0.4054270584077437, 'Healthy': 0.8268868722983372}
