In [5]:

import pandas as pd
import numpy as np
import pathlib
import gosdt
from sklearn.ensemble import GradientBoostingClassifier


In [6]:
def balance_check(df):
    count_0 = len(df[df.iloc[:,-1]==0])
    count_1 = len(df[df.iloc[:,-1]==1])
    pct_pos = count_1/(count_0+count_1)

    print("The number of positive samples is: ", count_1,
          "\nThe number of negative samples is: ", count_0,
          "\nThe percentage of positive samples is: ", pct_pos)
    return pct_pos

In [7]:
df = pd.read_csv("diabetes_updated_combined_df.csv")
balance_check(df)
y = df['Outcome']
X = df.drop(columns=['Outcome'])
print(df)

The number of positive samples is:  268 
The number of negative samples is:  500 
The percentage of positive samples is:  0.3489583333333333
     Pregnancies_0  Pregnancies_1  Pregnancies_2  Pregnancies_3  \
0                0              0              0              0   
1                0              1              0              0   
2                0              0              0              0   
3                0              1              0              0   
4                1              0              0              0   
..             ...            ...            ...            ...   
763              0              0              0              0   
764              0              0              1              0   
765              0              0              0              0   
766              0              1              0              0   
767              0              1              0              0   

     Pregnancies_4  Pregnancies_5  Pregnancies_6  Preg

In [8]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
import gosdt

# Step 1: Split the dataset into training, validation, and test sets
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1765, random_state=42, stratify=y_temp)

# Reduce the number of folds to 5
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

regularization = [0.01]
hp_grid = pd.DataFrame(columns=['regularization', 'average_validation_accuracy'])

# Lists to store metrics for final summary
all_train_acc = []
all_train_precision = []
all_train_recall = []
all_val_acc = []
all_val_precision = []
all_val_recall = []

for reg in regularization:
    cnt = 1
    val_accuracy = []
    best_model = None
    best_val_acc = 0

    for train_index, val_index in kf.split(X_train, y_train):
        print(f'Regularization: {reg}; Fold {cnt}; Train set: {len(train_index)}, Validation set: {len(val_index)}')

        kfX_train = X_train.iloc[train_index, :]
        kfX_val = X_train.iloc[val_index, :]
        kfy_train = y_train.iloc[train_index]
        kfy_val = y_train.iloc[val_index]

        config = {
            "regularization": reg,
            "depth_budget": 6,  # Increase this value to allow more depth and more leaves
            "min_samples_leaf": 2  # Decrease this value to allow more leaves
        }
        
        model = gosdt.GOSDT(config)
        model.fit(kfX_train, kfy_train)

        # Calculate training and validation accuracy, precision, recall
        train_acc = model.score(kfX_train, kfy_train)
        val_acc = model.score(kfX_val, kfy_val)

        # Predict for precision and recall
        train_preds = model.predict(kfX_train)
        val_preds = model.predict(kfX_val)

        train_precision = precision_score(kfy_train, train_preds, average='macro')
        train_recall = recall_score(kfy_train, train_preds, average='macro')

        val_precision = precision_score(kfy_val, val_preds, average='macro')
        val_recall = recall_score(kfy_val, val_preds, average='macro')

        # Append the metrics to the lists
        all_train_acc.append(train_acc)
        all_train_precision.append(train_precision)
        all_train_recall.append(train_recall)

        all_val_acc.append(val_acc)
        all_val_precision.append(val_precision)
        all_val_recall.append(val_recall)

        # Store validation accuracy and check for the best model
        val_accuracy.append(val_acc)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model = model  # Keep a reference to the best model

        cnt += 1
    
    # Calculate mean validation accuracy for current regularization value
    mean_val_acc = sum(val_accuracy) / len(val_accuracy)
    hp_grid = hp_grid.append({'regularization': reg, 'average_validation_accuracy': mean_val_acc}, ignore_index=True)

# Final evaluation on the test set
if best_model is not None:
    test_preds = best_model.predict(X_test)  # Make predictions on the test set
    test_acc = accuracy_score(y_test, test_preds)  # Calculate accuracy
    test_precision = precision_score(y_test, test_preds, average='macro')  # Macro precision
    test_recall = recall_score(y_test, test_preds, average='macro')  # Macro recall
    cm = confusion_matrix(y_test, test_preds)  # Confusion matrix

    # Store the final test metrics
    final_metrics = {
        'Test Precision': test_precision,
        'Test Accuracy': test_acc,
        'Test Recall': test_recall,
        'Confusion Matrix': cm
    }

    # Print Final Results
    print("\n--- Final Results ---")
    print(f'Final Test Accuracy with best regularization {reg}: {test_acc:.4f}')
    print(f'Test Precision (macro): {test_precision:.4f}')
    print(f'Test Recall (macro): {test_recall:.4f}')
    print(f'Confusion Matrix:\n{cm}')

    # Summary of metrics at the end (train and validation)
    print("\n--- Summary of Train and Validation Metrics ---")
    print(f"Training Precision (macro): {sum(all_train_precision) / len(all_train_precision):.4f}")
    print(f"Training Accuracy: {sum(all_train_acc) / len(all_train_acc):.4f}")
    print(f"Training Recall (macro): {sum(all_train_recall) / len(all_train_recall):.4f}")
    
    print(f"Validation Precision (macro): {sum(all_val_precision) / len(all_val_precision):.4f}")
    print(f"Validation Accuracy: {sum(all_val_acc) / len(all_val_acc):.4f}")
    print(f"Validation Recall (macro): {sum(all_val_recall) / len(all_val_recall):.4f}")

    # Optionally, print out the best model's details
    print(f"# of leaves: {best_model.leaves()}")
    print(f"# of nodes: {best_model.nodes()}")
    print(f"Confusion Matrix: \n{best_model.confusion}")
    print(f"Tree structure: \n{best_model.tree}")


Regularization: 0.01; Fold 1; Train set: 428, Validation set: 108
gosdt reported successful execution
training completed. 0.000/0.000/21.398 (user, system, wall), mem=0 MB
bounds: [0.322056..0.322056] (0.000000) loss=0.292056, iterations=341746
Regularization: 0.01; Fold 2; Train set: 429, Validation set: 107
gosdt reported successful execution
training completed. 0.000/0.000/16.006 (user, system, wall), mem=0 MB
bounds: [0.316713..0.316713] (0.000000) loss=0.286713, iterations=306173
Regularization: 0.01; Fold 3; Train set: 429, Validation set: 107
gosdt reported successful execution
training completed. 0.000/0.000/21.121 (user, system, wall), mem=0 MB
bounds: [0.310396..0.310396] (0.000000) loss=0.270396, iterations=333670
Regularization: 0.01; Fold 4; Train set: 429, Validation set: 107
gosdt reported successful execution
training completed. 0.000/0.000/21.160 (user, system, wall), mem=0 MB
bounds: [0.308741..0.308741] (0.000000) loss=0.258741, iterations=348094
Regularization: 0.01

  hp_grid = hp_grid.append({'regularization': reg, 'average_validation_accuracy': mean_val_acc}, ignore_index=True)


Regularization: 0.2; Fold 1; Train set: 214, Validation set: 54
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.545794..0.545794] (0.000000) loss=0.345794, iterations=0
Regularization: 0.2; Fold 1; Training Accuracy: 0.6542, Validation Accuracy: 0.6481, Number of Leaves: 1, Number of Nodes: 1

Regularization: 0.2; Fold 2; Train set: 214, Validation set: 54
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.545794..0.545794] (0.000000) loss=0.345794, iterations=0
Regularization: 0.2; Fold 2; Training Accuracy: 0.6542, Validation Accuracy: 0.6481, Number of Leaves: 1, Number of Nodes: 1

Regularization: 0.2; Fold 3; Train set: 214, Validation set: 54
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.545794..0.545794] (0.000000) loss=0.345794, iterations=0
Regularization: 0.2; Fold 3; Training

Regularization: 0.2; Fold 1; Train set: 428, Validation set: 108
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.548131..0.548131] (0.000000) loss=0.348131, iterations=0
Regularization: 0.2; Fold 1; Training Accuracy: 0.6519, Validation Accuracy: 0.6481, Number of Leaves: 1, Number of Nodes: 1

Regularization: 0.2; Fold 2; Train set: 429, Validation set: 107
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.547319..0.547319] (0.000000) loss=0.347319, iterations=0
Regularization: 0.2; Fold 2; Training Accuracy: 0.6527, Validation Accuracy: 0.6449, Number of Leaves: 1, Number of Nodes: 1

Regularization: 0.2; Fold 3; Train set: 429, Validation set: 107
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.549650..0.549650] (0.000000) loss=0.349650, iterations=0
Regularization: 0.2; Fold 3; Train

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  hp_grid = hp_grid.append({'regularization': reg, 'average_validation_accuracy': mean_val_acc}, ignore_index=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _w

gosdt reported successful execution
training completed. 0.000/0.000/1.644 (user, system, wall), mem=0 MB
bounds: [0.398131..0.398131] (0.000000) loss=0.348131, iterations=38294
Regularization: 0.05; Fold 1; Training Accuracy: 0.6519, Validation Accuracy: 0.6481, Number of Leaves: 1, Number of Nodes: 1

Regularization: 0.05; Fold 2; Train set: 429, Validation set: 107


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


gosdt reported successful execution
training completed. 0.000/0.000/0.973 (user, system, wall), mem=0 MB
bounds: [0.397319..0.397319] (0.000000) loss=0.347319, iterations=27845
Regularization: 0.05; Fold 2; Training Accuracy: 0.6527, Validation Accuracy: 0.6449, Number of Leaves: 1, Number of Nodes: 1

Regularization: 0.05; Fold 3; Train set: 429, Validation set: 107


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


gosdt reported successful execution
training completed. 0.000/0.000/1.214 (user, system, wall), mem=0 MB
bounds: [0.393706..0.393706] (0.000000) loss=0.293706, iterations=31069
Regularization: 0.05; Fold 3; Training Accuracy: 0.7063, Validation Accuracy: 0.6822, Number of Leaves: 2, Number of Nodes: 3

Regularization: 0.05; Fold 4; Train set: 429, Validation set: 107
gosdt reported successful execution
training completed. 0.000/0.000/1.254 (user, system, wall), mem=0 MB
bounds: [0.389044..0.389044] (0.000000) loss=0.289044, iterations=32427
Regularization: 0.05; Fold 4; Training Accuracy: 0.7110, Validation Accuracy: 0.6636, Number of Leaves: 2, Number of Nodes: 3

Regularization: 0.05; Fold 5; Train set: 429, Validation set: 107


In [None]:
""""from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import pandas as pd
import gosdt

# Step 1: Split the dataset into training, validation, and test sets
X_temp, X_test, y_temp, y_test = train_test_split(Xr, yr, test_size=0.15, random_state=42, stratify=yr)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.15/.85, random_state=42, stratify=y_temp)

# Stratified K-fold Cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Regularization values to test
regularization = [0.02]

# DataFrame to store results
hp_gridr = pd.DataFrame(columns=['regularization', 'average_test_accuracy'])

# Lists to store metrics
all_train_acc = []
all_train_precision = []
all_train_recall = []
all_val_acc = []
all_val_precision = []
all_val_recall = []

# Iterate over each regularization value
for reg in regularization:
    cnt = 1
    test_accuracyr = []

    # Cross-validation loop
    for train_indexr, val_indexr in kf.split(X_train, y_train):
        print(f'Regularization: {reg}; Fold {cnt}; Train set: {len(train_indexr)}, Validation set: {len(val_indexr)}')

        # Create training and validation sets for the current fold
        kfX_trainr = X_train.iloc[train_indexr, :]
        kfX_valr = X_train.iloc[val_indexr, :]
        kfy_trainr = y_train.iloc[train_indexr]
        kfy_valr = y_train.iloc[val_indexr]

        # Configuration for the GOSDT model
        config = {
            "regularization": reg,
            "depth_budget": 5
        }

        # Instantiate and train the model
        model = gosdt.GOSDT(config)
        model.fit(kfX_trainr, kfy_trainr)

        # Calculate training and validation accuracies
        train_accr = model.score(kfX_trainr, kfy_trainr)
        val_accr = model.score(kfX_valr, kfy_valr)
        n_leavesr = model.leaves()
        n_nodesr = model.nodes()

        # Predict for precision and recall
        train_preds = model.predict(kfX_trainr)
        val_preds = model.predict(kfX_valr)

        train_precision = precision_score(kfy_trainr, train_preds, average='macro')
        train_recall = recall_score(kfy_trainr, train_preds, average='macro')

        val_precision = precision_score(kfy_valr, val_preds, average='macro')
        val_recall = recall_score(kfy_valr, val_preds, average='macro')

        # Append metrics to lists
        all_train_acc.append(train_accr)
        all_train_precision.append(train_precision)
        all_train_recall.append(train_recall)
        
        all_val_acc.append(val_accr)
        all_val_precision.append(val_precision)
        all_val_recall.append(val_recall)

        test_accuracyr.append(val_accr)
        print(f'Regularization: {reg}; Fold {cnt}; Training Accuracy: {train_accr}, Validation Accuracy: {val_accr}, '
              f'Number of Leaves: {n_leavesr}, Number of Nodes: {n_nodesr}\n')

        cnt += 1

    # Calculate mean validation accuracy across folds
    mean_val_accr = sum(test_accuracyr) / len(test_accuracyr)
    print(f'Mean Validation Accuracy across folds: {mean_val_accr}----------------------------------------------------------------------\n')

    # Store the result for the current regularization
    hp_gridr = pd.concat([hp_gridr, pd.DataFrame([[reg, mean_val_accr]], columns=hp_gridr.columns)], ignore_index=True)

# Initialize variables for the final evaluation
best_model = None
best_val_acc = 0

# Final evaluation on the test set using the best model from cross-validation
for reg in regularization:
    cnt = 1
    for train_indexr, val_indexr in kf.split(X_train, y_train):
        # Create training and validation sets for the current fold
        kfX_trainr = X_train.iloc[train_indexr, :]
        kfX_valr = X_train.iloc[val_indexr, :]
        kfy_trainr = y_train.iloc[train_indexr]
        kfy_valr = y_train.iloc[val_indexr]

        # Configuration for the GOSDT model
        config = {
            "regularization": reg,
            "depth_budget": 5
        }

        # Instantiate and train the model
        model = gosdt.GOSDT(config)
        model.fit(kfX_trainr, kfy_trainr)

        # Calculate validation accuracy
        val_accr = model.score(kfX_valr, kfy_valr)

        # Track the best model based on validation accuracy
        if val_accr > best_val_acc:
            best_val_acc = val_accr
            best_model = model

        cnt += 1

# If the best model is found, evaluate it on the test set
if best_model is not None:
    # Get predictions on the test set
    test_preds = best_model.predict(X_test)

    # Calculate accuracy, precision, recall, and confusion matrix on the test set
    test_acc = best_model.score(X_test, y_test)
    test_precision = precision_score(y_test, test_preds, average='macro')  # Macro-average precision
    test_recall = recall_score(y_test, test_preds, average='macro')  # Macro-average recall
    cm = confusion_matrix(y_test, test_preds)  # Confusion matrix

    # Print final results
    print("\n--- Final Results ---")
    print(f'Final Test Accuracy with best regularization {reg}: {test_acc:.4f}')
    print(f'Test Precision (macro): {test_precision:.4f}')
    print(f'Test Recall (macro): {test_recall:.4f}')
    print(f'Confusion Matrix:\n{cm}')

    # Print training and validation metrics
    print("\n--- Summary of Train and Validation Metrics ---")
    print(f"Training Accuracy: {sum(all_train_acc) / len(all_train_acc):.4f}")
    print(f"Training Precision (macro): {sum(all_train_precision) / len(all_train_precision):.4f}")
    print(f"Training Recall (macro): {sum(all_train_recall) / len(all_train_recall):.4f}")
    
    print(f"Validation Accuracy: {sum(all_val_acc) / len(all_val_acc):.4f}")
    print(f"Validation Precision (macro): {sum(all_val_precision) / len(all_val_precision):.4f}")
    print(f"Validation Recall (macro): {sum(all_val_recall) / len(all_val_recall):.4f}")

    # Optionally, print out the best model's details
    print(f"# of leaves: {best_model.leaves()}")
    print(f"# of nodes: {best_model.nodes()}")
    print(f"Confusion Matrix: \n{best_model.confusion}")
    print(f"Tree structure: \n{best_model.tree}")


Regularization: 0.02; Fold 1; Train set: 482, Validation set: 54
gosdt reported successful execution
training completed. 0.000/0.000/1.146 (user, system, wall), mem=0 MB
bounds: [0.338755..0.338755] (0.000000) loss=0.298755, iterations=32421
Regularization: 0.02; Fold 1; Training Accuracy: 0.7012448132780082, Validation Accuracy: 0.7037037037037037, Number of Leaves: 2, Number of Nodes: 3

Regularization: 0.02; Fold 2; Train set: 482, Validation set: 54
gosdt reported successful execution
training completed. 0.000/0.000/1.158 (user, system, wall), mem=0 MB
bounds: [0.342905..0.342905] (0.000000) loss=0.302905, iterations=32745
Regularization: 0.02; Fold 2; Training Accuracy: 0.6970954356846473, Validation Accuracy: 0.7407407407407407, Number of Leaves: 2, Number of Nodes: 3

Regularization: 0.02; Fold 3; Train set: 482, Validation set: 54
gosdt reported successful execution
training completed. 0.000/0.000/0.997 (user, system, wall), mem=0 MB
bounds: [0.340830..0.340830] (0.000000) loss

NameError: name 'X' is not defined

In [4]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
import gosdt

# Step 1: Split the dataset into training, validation, and test sets
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1765, random_state=42, stratify=y_temp)

# Reduce the number of folds to 5
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

regularization = [0.01]
hp_grid = pd.DataFrame(columns=['regularization', 'average_validation_accuracy'])

# Lists to store metrics for final summary
all_train_acc = []
all_train_precision = []
all_train_recall = []
all_val_acc = []
all_val_precision = []
all_val_recall = []

for reg in regularization:
    cnt = 1
    val_accuracy = []
    best_model = None
    best_val_acc = 0

    for train_index, val_index in kf.split(X_train, y_train):
        print(f'Regularization: {reg}; Fold {cnt}; Train set: {len(train_index)}, Validation set: {len(val_index)}')

        kfX_train = X_train.iloc[train_index, :]
        kfX_val = X_train.iloc[val_index, :]
        kfy_train = y_train.iloc[train_index]
        kfy_val = y_train.iloc[val_index]

        config = {
            "regularization": reg,
            "depth_budget": 6,  # Increase this value to allow more depth and more leaves
            "min_samples_leaf": 2  # Decrease this value to allow more leaves
        }
        
        model = gosdt.GOSDT(config)
        model.fit(kfX_train, kfy_train)

        # Calculate training and validation accuracy, precision, recall
        train_acc = model.score(kfX_train, kfy_train)
        val_acc = model.score(kfX_val, kfy_val)

        # Predict for precision and recall
        train_preds = model.predict(kfX_train)
        val_preds = model.predict(kfX_val)

        train_precision = precision_score(kfy_train, train_preds, average='macro')
        train_recall = recall_score(kfy_train, train_preds, average='macro')

        val_precision = precision_score(kfy_val, val_preds, average='macro')
        val_recall = recall_score(kfy_val, val_preds, average='macro')

        # Append the metrics to the lists
        all_train_acc.append(train_acc)
        all_train_precision.append(train_precision)
        all_train_recall.append(train_recall)

        all_val_acc.append(val_acc)
        all_val_precision.append(val_precision)
        all_val_recall.append(val_recall)

        # Store validation accuracy and check for the best model
        val_accuracy.append(val_acc)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model = model  # Keep a reference to the best model

        cnt += 1
    
    # Calculate mean validation accuracy for current regularization value
    mean_val_acc = sum(val_accuracy) / len(val_accuracy)
    hp_grid = hp_grid.append({'regularization': reg, 'average_validation_accuracy': mean_val_acc}, ignore_index=True)

# Final evaluation on the test set
if best_model is not None:
    test_preds = best_model.predict(X_test)  # Make predictions on the test set
    test_acc = accuracy_score(y_test, test_preds)  # Calculate accuracy
    test_precision = precision_score(y_test, test_preds, average='macro')  # Macro precision
    test_recall = recall_score(y_test, test_preds, average='macro')  # Macro recall
    cm = confusion_matrix(y_test, test_preds)  # Confusion matrix

    # Store the final test metrics
    final_metrics = {
        'Test Precision': test_precision,
        'Test Accuracy': test_acc,
        'Test Recall': test_recall,
        'Confusion Matrix': cm
    }

    # Print Final Results
    print("\n--- Final Results ---")
    print(f'Final Test Accuracy with best regularization {reg}: {test_acc:.4f}')
    print(f'Test Precision (macro): {test_precision:.4f}')
    print(f'Test Recall (macro): {test_recall:.4f}')
    print(f'Confusion Matrix:\n{cm}')

    # Summary of metrics at the end (train and validation)
    print("\n--- Summary of Train and Validation Metrics ---")
    print(f"Training Precision (macro): {sum(all_train_precision) / len(all_train_precision):.4f}")
    print(f"Training Accuracy: {sum(all_train_acc) / len(all_train_acc):.4f}")
    print(f"Training Recall (macro): {sum(all_train_recall) / len(all_train_recall):.4f}")
    
    print(f"Validation Precision (macro): {sum(all_val_precision) / len(all_val_precision):.4f}")
    print(f"Validation Accuracy: {sum(all_val_acc) / len(all_val_acc):.4f}")
    print(f"Validation Recall (macro): {sum(all_val_recall) / len(all_val_recall):.4f}")

    # Optionally, print out the best model's details
    print(f"# of leaves: {best_model.leaves()}")
    print(f"# of nodes: {best_model.nodes()}")
    print(f"Confusion Matrix: \n{best_model.confusion}")
    print(f"Tree structure: \n{best_model.tree}")


Regularization: 0.01; Fold 1; Train set: 428, Validation set: 108
gosdt reported successful execution
training completed. 0.000/0.000/22.233 (user, system, wall), mem=0 MB
bounds: [0.322056..0.322056] (0.000000) loss=0.292056, iterations=341746
Regularization: 0.01; Fold 2; Train set: 429, Validation set: 107
gosdt reported successful execution
training completed. 0.000/0.000/17.323 (user, system, wall), mem=0 MB
bounds: [0.316713..0.316713] (0.000000) loss=0.286713, iterations=306173
Regularization: 0.01; Fold 3; Train set: 429, Validation set: 107
gosdt reported successful execution
training completed. 0.000/0.000/21.909 (user, system, wall), mem=0 MB
bounds: [0.310396..0.310396] (0.000000) loss=0.270396, iterations=333670
Regularization: 0.01; Fold 4; Train set: 429, Validation set: 107
gosdt reported successful execution
training completed. 0.000/0.000/21.745 (user, system, wall), mem=0 MB
bounds: [0.308741..0.308741] (0.000000) loss=0.258741, iterations=348094
Regularization: 0.01

  hp_grid = hp_grid.append({'regularization': reg, 'average_validation_accuracy': mean_val_acc}, ignore_index=True)


In [None]:
hp_grid.max()
print(hp_grid)

In [None]:
import gosdt

config = {
            "regularization": 0.06,
            "depth_budget": 0
        }
model = gosdt.GOSDT(config)
model.fit(X_train, y_train)

train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)
n_leaves = model.leaves()
n_nodes = model.nodes()

print(f'Training Accuracy: {train_acc}, Test Accuracy: {test_acc}, Number of Leaves: {n_leaves}, Number of Nodes: {n_nodes}\n')
print(model.tree)

In [40]:
from rashomon_importance_distribution import RashomonImportanceDistribution

mapping = {
    0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],  # Pregnancies columns
    1: [17, 18, 19, 20, 21, 22],  # Smoking columns
    2: [23],  # Glucose
    3: [24],  # BloodPressure
    4: [25],  # SkinThickness
    5: [26],  # Insulin
    6: [27],  # BMI
    7: [28],  # HbA1c
    8: [29]   # Outcome
}


RID = RashomonImportanceDistribution(
    input_df = df,
    binning_map = mapping,
    db = 5,
    lam = 0.05,
    eps = 0.2,
    vi_metric = 'sub_mr',
    dataset_name = 'binarized_data_set_all_sets_combined',
    n_resamples = 100,
    verbose = True,
    max_par_for_gosdt = 4
)

for v in range(15):
    print(f'Variable {v} ------------')

    print('Box and whiskers mean:', RID.mean(v))
    print('Box and whiskers median:', RID.median(v))
    print('Box and whiskers range:', RID.bwr(v))

Rashomon set has already been computed
Rashomon set has already been computed
Rashomon set has already been computedRashomon set has already been computed

Rashomon set has already been computedRashomon set has already been computed

Rashomon set has already been computed
Rashomon set has already been computedRashomon set has already been computedRashomon set has already been computedRashomon set has already been computed



Rashomon set has already been computedRashomon set has already been computedRashomon set has already been computed
Rashomon set has already been computed


Rashomon set has already been computedRashomon set has already been computedRashomon set has already been computedRashomon set has already been computed



Rashomon set has already been computedRashomon set has already been computedRashomon set has already been computed
Rashomon set has already been computed


Rashomon set has already been computedRashomon set has already been computedRashomon set has already be

Process ForkPoolWorker-3:
Process ForkPoolWorker-2:
Process ForkPoolWorker-26:
Process ForkPoolWorker-1:
Process ForkPoolWorker-54:
Process ForkPoolWorker-55:
Process ForkPoolWorker-46:
Process ForkPoolWorker-47:
Process ForkPoolWorker-52:
Process ForkPoolWorker-43:
Process ForkPoolWorker-53:
Process ForkPoolWorker-50:
Process ForkPoolWorker-36:
Process ForkPoolWorker-38:
Process ForkPoolWorker-45:
Process ForkPoolWorker-13:
Process ForkPoolWorker-44:
Process ForkPoolWorker-9:
Process ForkPoolWorker-32:
Process ForkPoolWorker-24:
Process ForkPoolWorker-29:
Process ForkPoolWorker-14:
Process ForkPoolWorker-37:
Process ForkPoolWorker-35:
Process ForkPoolWorker-48:
Process ForkPoolWorker-12:
Process ForkPoolWorker-39:
Process ForkPoolWorker-49:
Process ForkPoolWorker-4:
Process ForkPoolWorker-42:
Process ForkPoolWorker-21:
Process ForkPoolWorker-6:
Process ForkPoolWorker-31:
Process ForkPoolWorker-16:
Process ForkPoolWorker-10:
Process ForkPoolWorker-25:
Process ForkPoolWorker-41:
Process

Processing ours with counts
Starting var 0
Starting var 1
Starting var 2


KeyboardInterrupt
  File "/home/msr216/.conda/envs/dimacs2024/lib/python3.11/site-packages/multiprocess/synchronize.py", line 101, in __enter__
    return self._semlock.__enter__()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
KeyboardInterrupt


Starting var 3
Starting var 4
Starting var 5
Starting var 6
Starting var 7
Starting var 8
Variable 0 ------------
Box and whiskers mean: 1.2335811384723961e-17
Box and whiskers median: 0.0
Box and whiskers range: (-4.440892098500626e-16, 4.440892098500626e-16)
Variable 1 ------------
Box and whiskers mean: 1.2335811384723961e-17
Box and whiskers median: 0.0
Box and whiskers range: (-4.440892098500626e-16, 4.440892098500626e-16)
Variable 2 ------------
Box and whiskers mean: 0.06995260683410577


  File "/home/msr216/.conda/envs/dimacs2024/lib/python3.11/site-packages/multiprocess/synchronize.py", line 101, in __enter__
    return self._semlock.__enter__()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/msr216/.conda/envs/dimacs2024/lib/python3.11/site-packages/multiprocess/synchronize.py", line 101, in __enter__
    return self._semlock.__enter__()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
  File "/home/msr216/.conda/envs/dimacs2024/lib/python3.11/site-packages/multiprocess/synchronize.py", line 101, in __enter__
    return self._semlock.__enter__()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/msr216/.conda/envs/dimacs2024/lib/python3.11/site-packages/multiprocess/synchronize.py", line 101, in __enter__
    return self._semlock.__enter__()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/msr216/.conda/envs/dimacs2024/lib/python3.11/site-packages/multiprocess/synchronize.py", line 101, in __enter__
    return self._semlock.__enter__()
           ^^^^^^^^

Box and whiskers median: 0.058745874587458724
Box and whiskers range: (-0.09966996699669972, 0.2303630363036303)
Variable 3 ------------
Box and whiskers mean: 1.2335811384723961e-17
Box and whiskers median: 0.0
Box and whiskers range: (-4.440892098500626e-16, 4.440892098500626e-16)
Variable 4 ------------
Box and whiskers mean: 1.2335811384723961e-17
Box and whiskers median: 0.0
Box and whiskers range: (-4.440892098500626e-16, 4.440892098500626e-16)
Variable 5 ------------
Box and whiskers mean: 1.2335811384723961e-17
Box and whiskers median: 0.0
Box and whiskers range: (-4.440892098500626e-16, 4.440892098500626e-16)
Variable 6 ------------
Box and whiskers mean: 1.2335811384723961e-17
Box and whiskers median: 0.0
Box and whiskers range: (-4.440892098500626e-16, 4.440892098500626e-16)
Variable 7 ------------
Box and whiskers mean: 1.2335811384723961e-17
Box and whiskers median: 0.0
Box and whiskers range: (-4.440892098500626e-16, 4.440892098500626e-16)
Variable 8 ------------
Box and 

KeyError: 9

In [None]:
for v in range(14):
    print(f'Variable {v} ------------')

    print('Box and whiskers mean:', RID.mean(v))
    print('Box and whiskers median:', RID.median(v))
    print('Box and whiskers range:', RID.bwr(v))

In [37]:
dfr = pd.read_csv("diabetes_updated_combined_df.csv")

import pandas as pd

# Assuming `df` is your original DataFrame

# List of columns to be dropped
Xr = dfr.drop(columns=[
    'HbA1c',
    'SkinThickness',
    'smoking_0', 'smoking_1', 'smoking_2', 'smoking_3', 
    'smoking_4', 'smoking_5', 'Outcome','Insulin',
   
])



yr = dfr['Outcome']


In [38]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
import gosdt
X_temp, X_test, y_temp, y_test = train_test_split(Xr, yr, test_size=0.30, random_state=42, stratify=yr)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)



In [39]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import pandas as pd
import gosdt


# Stratified K-fold Cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Regularization values to test
regularization = [0.05]

# DataFrame to store results
hp_gridr = pd.DataFrame(columns=['regularization', 'average_test_accuracy'])

# Initialize variables for tracking the best model and metrics
best_model = None
best_val_acc = 0
all_train_acc = []
all_train_precision = []
all_train_recall = []
all_val_acc = []
all_val_precision = []
all_val_recall = []

# Iterate over each regularization value
for reg in regularization:
    cnt = 1
    test_accuracyr = []

    # Cross-validation loop
    for train_indexr, val_indexr in kf.split(X_train, y_train):
        print(f'Regularization: {reg}; Fold {cnt}; Train set: {len(train_indexr)}, Validation set: {len(val_indexr)}')

        # Create training and validation sets for the current fold
        kfX_trainr = X_train.iloc[train_indexr, :]
        kfX_valr = X_train.iloc[val_indexr, :]
        kfy_trainr = y_train.iloc[train_indexr]
        kfy_valr = y_train.iloc[val_indexr]

        # Configuration for the GOSDT model
        config = {
            "regularization": reg,
            "depth_budget": 7
        }

        # Instantiate and train the model
        model = gosdt.GOSDT(config)
        model.fit(kfX_trainr, kfy_trainr)

        # Calculate training and validation accuracies
        train_accr = model.score(kfX_trainr, kfy_trainr)
        val_accr = model.score(kfX_valr, kfy_valr)
        n_leavesr = model.leaves()
        n_nodesr = model.nodes()

        # Track the best model based on validation accuracy
        if val_accr > best_val_acc:
            best_val_acc = val_accr
            best_model = model

        # Collect metrics for later reporting
        all_train_acc.append(train_accr)
        all_train_precision.append(precision_score(kfy_trainr, model.predict(kfX_trainr), average='macro'))
        all_train_recall.append(recall_score(kfy_trainr, model.predict(kfX_trainr), average='macro'))

        all_val_acc.append(val_accr)
        all_val_precision.append(precision_score(kfy_valr, model.predict(kfX_valr), average='macro'))
        all_val_recall.append(recall_score(kfy_valr, model.predict(kfX_valr), average='macro'))

        # Store the result for the current regularization
        test_accuracyr.append(val_accr)
        print(f'Regularization: {reg}; Fold {cnt}; Training Accuracy: {train_accr}, Validation Accuracy: {val_accr}, '
              f'Number of Leaves: {n_leavesr}, Number of Nodes: {n_nodesr}\n')

        cnt += 1

    # Calculate mean validation accuracy across folds
    mean_val_accr = sum(test_accuracyr) / len(test_accuracyr)
    print(f'Mean Validation Accuracy across folds: {mean_val_accr}----------------------------------------------------------------------\n')

    # Store the result for the current regularization
    hp_gridr = pd.concat([hp_gridr, pd.DataFrame([[reg, mean_val_accr]], columns=hp_gridr.columns)], ignore_index=True)

# If the best model is found, evaluate it on the test set
if best_model is not None:
    # Get predictions on the test set
    test_preds = best_model.predict(X_test)

    # Calculate accuracy, precision, recall, and confusion matrix on the test set
    test_acc = best_model.score(X_test, y_test)
    test_precision = precision_score(y_test, test_preds, average='macro')
    test_recall = recall_score(y_test, test_preds, average='macro')
    cm = confusion_matrix(y_test, test_preds)

    # Print final results
    print("\n--- Final Results ---")
    print(f'Final Test Accuracy: {test_acc:.4f}')
    print(f'Test Precision (macro): {test_precision:.4f}')
    print(f'Test Recall (macro): {test_recall:.4f}')
    print(f'Confusion Matrix:\n{cm}')

    # Print training and validation metrics
    print("\n--- Summary of Train and Validation Metrics ---")
    print(f"Training Accuracy: {sum(all_train_acc) / len(all_train_acc):.4f}")
    print(f"Training Precision (macro): {sum(all_train_precision) / len(all_train_precision):.4f}")
    print(f"Training Recall (macro): {sum(all_train_recall) / len(all_train_recall):.4f}")
    
    print(f"Validation Accuracy: {sum(all_val_acc) / len(all_val_acc):.4f}")
    print(f"Validation Precision (macro): {sum(all_val_precision) / len(all_val_precision):.4f}")
    print(f"Validation Recall (macro): {sum(all_val_recall) / len(all_val_recall):.4f}")

    # Print model details
    print(f"# of leaves: {best_model.leaves()}")
    print(f"# of nodes: {best_model.nodes()}")
    print(f"Confusion Matrix: \n{best_model.confusion}")
    print(f"Tree structure: \n{best_model.tree}")

Regularization: 0.05; Fold 1; Train set: 241, Validation set: 27
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.378008..0.378008] (0.000000) loss=0.278008, iterations=33
Regularization: 0.05; Fold 1; Training Accuracy: 0.7219917012448133, Validation Accuracy: 0.6666666666666666, Number of Leaves: 2, Number of Nodes: 3

Regularization: 0.05; Fold 2; Train set: 241, Validation set: 27
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.390456..0.390456] (0.000000) loss=0.290456, iterations=33
Regularization: 0.05; Fold 2; Training Accuracy: 0.7095435684647303, Validation Accuracy: 0.7777777777777778, Number of Leaves: 2, Number of Nodes: 3

Regularization: 0.05; Fold 3; Train set: 241, Validation set: 27
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.390456..0.390456] (0.000000) loss=0.290

In [None]:
print(hp_gridr.max())
print(hp_gridr)

In [None]:
import gosdt

config = {
            "regularization": 0.002,
            "depth_budget": 0
        }
model = gosdt.GOSDT(config)
model.fit(X_trainr, y_trainr)

train_accr = model.score(X_trainr, y_trainr)
test_accr = model.score(X_testr, y_testr)
n_leavesr = model.leaves()
n_nodesr = model.nodes()

print(f'Training Accuracy: {train_accr}, Test Accuracy: {test_accr}, Number of Leaves: {n_leavesr}, Number of Nodes: {n_nodesr}\n')
print(model.tree)