In [1]:

import pandas as pd
import numpy as np
import pathlib
import gosdt
from sklearn.ensemble import GradientBoostingClassifier


In [2]:
def balance_check(df):
    count_0 = len(df[df.iloc[:,-1]==0])
    count_1 = len(df[df.iloc[:,-1]==1])
    pct_pos = count_1/(count_0+count_1)

    print("The number of positive samples is: ", count_1,
          "\nThe number of negative samples is: ", count_0,
          "\nThe percentage of positive samples is: ", pct_pos)
    return pct_pos

In [3]:
df = pd.read_csv("binarized_heart_disease.csv")
print(df)
balance_check(df)
y = df['num']
X = df.drop(columns=['num'])

     age  trestbps  chol  fbs  thalch  exang  oldpeak  ca  Heart rate  \
0      1         1     0    1       1      0        1   0           0   
1      1         1     1    0       0      1        1   1           1   
2      1         0     0    0       0      1        1   1           0   
3      0         0     1    0       1      0        1   0           0   
4      0         0     0    0       1      0        1   0           0   
..   ...       ...   ...  ...     ...    ...      ...  ..         ...   
294    1         1     0    1       0      0        1   1           0   
295    1         0     0    0       0      1        1   1           0   
296    1         0     0    0       1      0        0   1           0   
297    0         1     0    0       0      1        1   0           1   
298    1         0     0    0       0      1        1   0           1   

     Systolic blood pressure  ...  restecg_lv hypertrophy  restecg_normal  \
0                          1  ...             

In [4]:
""""from sklearn.metrics import precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
import gosdt


# Function to visualize the tree
def visualize_tree(model):
    dot_data = model.export_graph()  # This function needs to be supported by your GOSDT implementation
    graph = graphviz.Source(dot_data)
    return graph

# Step 1: Split the dataset into training, validation, and test sets
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)  # 0.50 * 0.30 = 0.15

# Reduce the number of folds to 5
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

regularization = [0.2,0.1,.05]
hp_grid = pd.DataFrame(columns=['regularization', 'average_validation_accuracy'])

# Lambda function to compute mean


for reg in regularization:
    cnt = 1
    val_accuracy = []
    best_model = None
    best_val_acc = 0

    for train_index, val_index in kf.split(X_train, y_train):
        print(f'Regularization: {reg}; Fold {cnt}; Train set: {len(train_index)}, Validation set: {len(val_index)}')

        kfX_train = X_train.iloc[train_index, :]
        kfX_val = X_train.iloc[val_index, :]
        kfy_train = y_train.iloc[train_index]
        kfy_val = y_train.iloc[val_index]

        config = {
            "regularization": reg,
            "depth_budget": 6,  # Increase this value to allow more depth and more leaves
            "min_samples_leaf": 2  # Decrease this value to allow more leaves
        }
        
        model = gosdt.GOSDT(config)
        model.fit(kfX_train, kfy_train)

        train_acc = model.score(kfX_train, kfy_train)
        val_acc = model.score(kfX_val, kfy_val)
        n_leaves = model.leaves()
        n_nodes = model.nodes()

        # Store validation accuracy and check for the best model
        val_accuracy.append(val_acc)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model = model  # Keep a reference to the best model

        print(f'Regularization: {reg}; Fold {cnt}; Training Accuracy: {train_acc:.4f}, Validation Accuracy: {val_acc:.4f}, Number of Leaves: {n_leaves}, Number of Nodes: {n_nodes}\n')
        cnt += 1
    
    # Calculate mean validation accuracy using the lambda function
  

# Final evaluation on the test set
# Final evaluation on the test set
if best_model is not None:
    # Get predictions on the test set
    test_preds = best_model.predict(X_test)

    # Calculate accuracy, precision, recall, and confusion matrix on the test set
    test_acc = best_model.score(X_test, y_test)
    test_precision = precision_score(y_test, test_preds, average='macro')  # Macro-average precision
    test_recall = recall_score(y_test, test_preds, average='macro')  # Macro-average recall
    cm = confusion_matrix(y_test, test_preds)  # Confusion matrix

    # Print results
    print(f'Final Test Accuracy with best regularization {reg}: {test_acc:.4f}')
    print(f'Test Precision (macro): {test_precision:.4f}')
    print(f'Test Recall (macro): {test_recall:.4f}')
    print(f'Confusion Matrix:\n{cm}')

    print(f"# of leaves: {best_model.leaves()}")
    print(f"# of nodes: {best_model.nodes()}")
    print(f"Confusion Matrix: \n{best_model.confusion}")
    print(f"Tree structure: \n{best_model.tree}")


Regularization: 0.2; Fold 1; Train set: 83, Validation set: 21
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.604819..0.604819] (0.000000) loss=0.204819, iterations=5
Regularization: 0.2; Fold 1; Training Accuracy: 0.7952, Validation Accuracy: 0.6190, Number of Leaves: 2, Number of Nodes: 3

Regularization: 0.2; Fold 2; Train set: 83, Validation set: 21
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.640964..0.640964] (0.000000) loss=0.240964, iterations=19
Regularization: 0.2; Fold 2; Training Accuracy: 0.7590, Validation Accuracy: 0.8571, Number of Leaves: 2, Number of Nodes: 3

Regularization: 0.2; Fold 3; Train set: 83, Validation set: 21
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.604819..0.604819] (0.000000) loss=0.204819, iterations=7
Regularization: 0.2; Fold 3; Training A

In [5]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
import gosdt
import graphviz

# Function to visualize the tree
def visualize_tree(model):
    dot_data = model.export_graph()  # This function needs to be supported by your GOSDT implementation
    graph = graphviz.Source(dot_data)
    return graph

# Step 1: Split the dataset into training, validation, and test sets
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)  # 0.50 * 0.30 = 0.15

# Reduce the number of folds to 5
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

regularization = [0.2, 0.1, 0.05]
hp_grid = pd.DataFrame(columns=['regularization', 'average_validation_accuracy'])

# Variables to store overall metrics
all_train_acc = []
all_train_precision = []
all_train_recall = []
all_val_acc = []
all_val_precision = []
all_val_recall = []

# Lambda function to compute mean
for reg in regularization:
    cnt = 1
    val_accuracy = []
    best_model = None
    best_val_acc = 0

    for train_index, val_index in kf.split(X_train, y_train):
        print(f'Regularization: {reg}; Fold {cnt}; Train set: {len(train_index)}, Validation set: {len(val_index)}')

        kfX_train = X_train.iloc[train_index, :]
        kfX_val = X_train.iloc[val_index, :]
        kfy_train = y_train.iloc[train_index]
        kfy_val = y_train.iloc[val_index]

        config = {
            "regularization": reg,
            "depth_budget": 6,  # Increase this value to allow more depth and more leaves
            "min_samples_leaf": 2  # Decrease this value to allow more leaves
        }
        
        model = gosdt.GOSDT(config)
        model.fit(kfX_train, kfy_train)

        train_preds = model.predict(kfX_train)
        val_preds = model.predict(kfX_val)

        # Calculate training metrics
        train_acc = model.score(kfX_train, kfy_train)
        train_precision = precision_score(kfy_train, train_preds, average='macro')
        train_recall = recall_score(kfy_train, train_preds, average='macro')
        
        # Calculate validation metrics
        val_acc = model.score(kfX_val, kfy_val)
        val_precision = precision_score(kfy_val, val_preds, average='macro')
        val_recall = recall_score(kfy_val, val_preds, average='macro')

        n_leaves = model.leaves()
        n_nodes = model.nodes()

        # Store validation accuracy and check for the best model
        val_accuracy.append(val_acc)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model = model  # Keep a reference to the best model

        # Print fold results
        print(f'Regularization: {reg}; Fold {cnt}; '
              f'Training Accuracy: {train_acc:.4f}, Training Precision: {train_precision:.4f}, '
              f'Training Recall: {train_recall:.4f}, '
              f'Validation Accuracy: {val_acc:.4f}, Validation Precision: {val_precision:.4f}, '
              f'Validation Recall: {val_recall:.4f}, '
              f'Number of Leaves: {n_leaves}, Number of Nodes: {n_nodes}\n')
        cnt += 1
    
    # Calculate mean validation accuracy
    avg_val_accuracy = sum(val_accuracy) / len(val_accuracy)
    hp_grid = hp_grid.append({'regularization': reg, 'average_validation_accuracy': avg_val_accuracy}, ignore_index=True)

    # Store metrics for final output
    all_train_acc.append(train_acc)
    all_train_precision.append(train_precision)
    all_train_recall.append(train_recall)
    all_val_acc.append(val_acc)
    all_val_precision.append(val_precision)
    all_val_recall.append(val_recall)

# Final evaluation on the test set
if best_model is not None:
    # Get predictions on the test set
    test_preds = best_model.predict(X_test)

    # Calculate accuracy, precision, recall on the test set
    test_acc = best_model.score(X_test, y_test)
    test_precision = precision_score(y_test, test_preds, average='macro')
    test_recall = recall_score(y_test, test_preds, average='macro')
    cm = confusion_matrix(y_test, test_preds)

    # Print final results
    print("\n--- Final Results ---")
    print(f'Final Test Accuracy with best regularization {reg}: {test_acc:.4f}')
    print(f'Test Precision (macro): {test_precision:.4f}')
    print(f'Test Recall (macro): {test_recall:.4f}')
    print(f'Confusion Matrix:\n{cm}')

    # Print training and validation metrics
    print("\n--- Summary of Train and Validation Metrics ---")
    print(f"Training Accuracy: {sum(all_train_acc) / len(all_train_acc):.4f}")
    print(f"Training Precision (macro): {sum(all_train_precision) / len(all_train_precision):.4f}")
    print(f"Training Recall (macro): {sum(all_train_recall) / len(all_train_recall):.4f}")
    
    print(f"Validation Accuracy: {sum(all_val_acc) / len(all_val_acc):.4f}")
    print(f"Validation Precision (macro): {sum(all_val_precision) / len(all_val_precision):.4f}")
    print(f"Validation Recall (macro): {sum(all_val_recall) / len(all_val_recall):.4f}")

    # Optionally, print out the best model's details
    print(f"# of leaves: {best_model.leaves()}")
    print(f"# of nodes: {best_model.nodes()}")
    print(f"Confusion Matrix: \n{best_model.confusion}")
    print(f"Tree structure: \n{best_model.tree}")



Regularization: 0.2; Fold 1; Train set: 83, Validation set: 21
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.604819..0.604819] (0.000000) loss=0.204819, iterations=5
Regularization: 0.2; Fold 1; Training Accuracy: 0.7952, Training Precision: 0.7958, Training Recall: 0.7906, Validation Accuracy: 0.6190, Validation Precision: 0.6182, Validation Recall: 0.6182, Number of Leaves: 2, Number of Nodes: 3

Regularization: 0.2; Fold 2; Train set: 83, Validation set: 21
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.640964..0.640964] (0.000000) loss=0.240964, iterations=19
Regularization: 0.2; Fold 2; Training Accuracy: 0.7590, Training Precision: 0.7596, Training Recall: 0.7614, Validation Accuracy: 0.8571, Validation Precision: 0.8846, Validation Recall: 0.8636, Number of Leaves: 2, Number of Nodes: 3

Regularization: 0.2; Fold 3; Train set: 83, Validation 

  hp_grid = hp_grid.append({'regularization': reg, 'average_validation_accuracy': avg_val_accuracy}, ignore_index=True)
  hp_grid = hp_grid.append({'regularization': reg, 'average_validation_accuracy': avg_val_accuracy}, ignore_index=True)


gosdt reported successful execution
training completed. 0.000/0.000/0.193 (user, system, wall), mem=0 MB
bounds: [0.306626..0.306626] (0.000000) loss=0.156626, iterations=5515
Regularization: 0.05; Fold 2; Training Accuracy: 0.8434, Training Precision: 0.8743, Training Recall: 0.8310, Validation Accuracy: 0.8571, Validation Precision: 0.8611, Validation Recall: 0.8545, Number of Leaves: 3, Number of Nodes: 5

Regularization: 0.05; Fold 3; Train set: 83, Validation set: 21
gosdt reported successful execution
training completed. 0.000/0.000/0.069 (user, system, wall), mem=0 MB
bounds: [0.284337..0.284337] (0.000000) loss=0.084337, iterations=2072
Regularization: 0.05; Fold 3; Training Accuracy: 0.9157, Training Precision: 0.9248, Training Recall: 0.9099, Validation Accuracy: 0.8571, Validation Precision: 0.8611, Validation Recall: 0.8545, Number of Leaves: 4, Number of Nodes: 7

Regularization: 0.05; Fold 4; Train set: 83, Validation set: 21
gosdt reported successful execution
training c

  hp_grid = hp_grid.append({'regularization': reg, 'average_validation_accuracy': avg_val_accuracy}, ignore_index=True)


In [5]:
"""from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import precision_score, recall_score, confusion_matrix
import pandas as pd
import gosdt


# Step 1: Split the dataset into training, validation, and test sets
X_temp, X_test, y_temp, y_test = train_test_split(X_train, y_train, test_size=0.30, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)  # 0.50 * 0.30 = 0.15

# Reduce the number of folds to 5
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

regularization = [0.2, 0.1]
hp_grid = pd.DataFrame(columns=['regularization', 'average_validation_accuracy'])

# Lambda function to compute mean
for reg in regularization:
    cnt = 1
    val_accuracy = []
    best_model = None
    best_val_acc = 0

    for train_index, val_index in kf.split(X_train, y_train):
        print(f'Regularization: {reg}; Fold {cnt}; Train set: {len(train_index)}, Validation set: {len(val_index)}')

        kfX_train = X_train.iloc[train_index, :]
        kfX_val = X_train.iloc[val_index, :]
        kfy_train = y_train.iloc[train_index]
        kfy_val = y_train.iloc[val_index]

        config = {
            "regularization": reg,
            "depth_budget": 7,  # Increase this value to allow more depth and more leaves
            "min_samples_leaf": 2  # Decrease this value to allow more leaves
        }
        
        model = gosdt.GOSDT(config)
        model.fit(kfX_train, kfy_train)

        train_acc = model.score(kfX_train, kfy_train)
        val_acc = model.score(kfX_val, kfy_val)
        n_leaves = model.leaves()
        n_nodes = model.nodes()

        # Store validation accuracy and check for the best model
        val_accuracy.append(val_acc)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model = model  # Keep a reference to the best model

        print(f'Regularization: {reg}; Fold {cnt}; Training Accuracy: {train_acc:.4f}, Validation Accuracy: {val_acc:.4f}, Number of Leaves: {n_leaves}, Number of Nodes: {n_nodes}\n')
        cnt += 1

# Final evaluation on the test set
if best_model is not None:
    # Get predictions on the test set
    test_preds = best_model.predict(X_test)

    # Calculate accuracy, precision, recall, and confusion matrix on the test set
    test_acc = best_model.score(X_test, y_test)
    test_precision = precision_score(y_test, test_preds, average='macro')  # Macro-average precision
    test_recall = recall_score(y_test, test_preds, average='macro')  # Macro-average recall
    cm = confusion_matrix(y_test, test_preds)  # Confusion matrix

    # Print results
    print(f'Final Test Accuracy with best regularization {reg}: {test_acc:.4f}')
    print(f'Test Precision (macro): {test_precision:.4f}')
    print(f'Test Recall (macro): {test_recall:.4f}')
    print(f'Confusion Matrix:\n{cm}')

    print(f"# of leaves: {best_model.leaves()}")
    print(f"# of nodes: {best_model.nodes()}")
    print(f"Confusion Matrix: \n{best_model.confusion}")
    print(f"Tree structure: \n{best_model.tree}")"

Regularization: 0.2; Fold 1; Train set: 28, Validation set: 8
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.578571..0.578571] (0.000000) loss=0.178571, iterations=0
Regularization: 0.2; Fold 1; Training Accuracy: 0.8214, Validation Accuracy: 0.5000, Number of Leaves: 2, Number of Nodes: 3

Regularization: 0.2; Fold 2; Train set: 29, Validation set: 7
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.606897..0.606897] (0.000000) loss=0.206897, iterations=11
Regularization: 0.2; Fold 2; Training Accuracy: 0.7931, Validation Accuracy: 0.8571, Number of Leaves: 2, Number of Nodes: 3

Regularization: 0.2; Fold 3; Train set: 29, Validation set: 7
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.606897..0.606897] (0.000000) loss=0.206897, iterations=9
Regularization: 0.2; Fold 3; Training Accu

In [16]:
""""from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import precision_score, recall_score
import pandas as pd
import gosdt
import graphviz

# Function to visualize the tree
def visualize_tree(model):
    dot_data = model.export_graph()  # This function needs to be supported by your GOSDT implementation
    graph = graphviz.Source(dot_data)
    return graph

# Step 1: Split the dataset into training, validation, and test sets
X_temp, X_test, y_temp, y_test = train_test_split(X_train, y_train, test_size=0.30, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)  # 0.50 * 0.30 = 0.15
# Reduce the number of folds to 5
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

regularization = [0.2, 0.1]
hp_grid = pd.DataFrame(columns=['regularization', 'average_validation_accuracy', 'average_precision', 'average_recall'])

# Lambda function to compute mean
for reg in regularization:
    cnt = 1
    val_accuracy = []
    val_precision = []
    val_recall = []
    best_model = None
    best_val_acc = 0

    for train_index, val_index in kf.split(X_train, y_train):
        print(f'Regularization: {reg}; Fold {cnt}; Train set: {len(train_index)}, Validation set: {len(val_index)}')

        kfX_train = X_train.iloc[train_index, :]
        kfX_val = X_train.iloc[val_index, :]
        kfy_train = y_train.iloc[train_index]
        kfy_val = y_train.iloc[val_index]

        config = {
            "regularization": reg,
            "depth_budget": 5,
            "min_samples_leaf": 5
        }
        
        model = gosdt.GOSDT(config)
        model.fit(kfX_train, kfy_train)

        train_acc = model.score(kfX_train, kfy_train)
        val_acc = model.score(kfX_val, kfy_val)
        val_pred = model.predict(kfX_val)  # Assuming your model has a predict method
        precision = precision_score(kfy_val, val_pred, average='weighted')
        recall = recall_score(kfy_val, val_pred, average='weighted')
        
        n_leaves = model.leaves()
        n_nodes = model.nodes()

        # Store validation accuracy, precision, and recall, and check for the best model
        val_accuracy.append(val_acc)
        val_precision.append(precision)
        val_recall.append(recall)

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model = model  # Keep a reference to the best model

        print(f'Regularization: {reg}; Fold {cnt}; Training Accuracy: {train_acc:.4f}, Validation Accuracy: {val_acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, Number of Leaves: {n_leaves}, Number of Nodes: {n_nodes}\n')
        cnt += 1
    
    # Calculate mean validation metrics
    hp_grid = hp_grid.append({
        'regularization': reg,
        'average_validation_accuracy': sum(val_accuracy) / len(val_accuracy),
        'average_precision': sum(val_precision) / len(val_precision),
        'average_recall': sum(val_recall) / len(val_recall)
    }, ignore_index=True)

# Final evaluation on the test set
if best_model is not None:
    test_pred = best_model.predict(X_test)
    test_acc = best_model.score(X_test, y_test)
    test_precision = precision_score(y_test, test_pred, average='weighted')
    test_recall = recall_score(y_test, test_pred, average='weighted')

    print(f'Final Test Accuracy with best regularization {reg}: {test_acc:.4f}')
    print(f'Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}')

print("Training accuracy: {}".format(train_acc))
print("Validation accuracy: {}".format(val_acc))
print("Test accuracy: {}".format(test_acc))
print("Test precision: {}".format(test_precision))
print("Test recall: {}".format(test_recall))
print("# of leaves: {}".format(n_leaves))
print("# of nodes: {}".format(n_nodes))
print(model.confusion)
print(model.tree)


Regularization: 0.2; Fold 1; Train set: 58, Validation set: 15
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.606897..0.606897] (0.000000) loss=0.206897, iterations=9
Regularization: 0.2; Fold 1; Training Accuracy: 0.7931, Validation Accuracy: 0.7333, Precision: 0.7467, Recall: 0.7333, Number of Leaves: 2, Number of Nodes: 3

Regularization: 0.2; Fold 2; Train set: 58, Validation set: 15
gosdt reported successful execution
training completed. 0.000/0.000/0.000 (user, system, wall), mem=0 MB
bounds: [0.606897..0.606897] (0.000000) loss=0.206897, iterations=9
Regularization: 0.2; Fold 2; Training Accuracy: 0.7931, Validation Accuracy: 0.7333, Precision: 0.7333, Recall: 0.7333, Number of Leaves: 2, Number of Nodes: 3

Regularization: 0.2; Fold 3; Train set: 58, Validation set: 15
gosdt reported successful execution
training completed. 0.000/0.000/0.001 (user, system, wall), mem=0 MB
bounds: [0.624138..0.624138] (0.000000

  hp_grid = hp_grid.append({
  hp_grid = hp_grid.append({


In [8]:
hp_grid.max()
print(hp_grid)

   regularization  average_validation_accuracy  average_precision  \
0             0.2                     0.741696            0.74341   
1             0.1                     0.741696            0.74341   

   average_recall  
0        0.741696  
1        0.741696  


In [9]:
import gosdt

config = {
            "regularization": 0.06,
            "depth_budget": 0
        }
model = gosdt.GOSDT(config)
model.fit(X_train, y_train)

train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)
n_leaves = model.leaves()
n_nodes = model.nodes()

print(f'Training Accuracy: {train_acc}, Test Accuracy: {test_acc}, Number of Leaves: {n_leaves}, Number of Nodes: {n_nodes}\n')
print(model.tree)

gosdt reported successful execution
training completed. 0.000/0.000/7.682 (user, system, wall), mem=0 MB
bounds: [0.349665..0.349665] (0.000000) loss=0.229665, iterations=80540
Training Accuracy: 0.7703349282296651, Test Accuracy: 0.7333333333333333, Number of Leaves: 2, Number of Nodes: 3

if thal_normal = 1 then:
    predicted class: 0
    misclassification penalty: 0.12
    complexity penalty: 0.06

else if thal_normal != 1 then:
    predicted class: 1
    misclassification penalty: 0.11
    complexity penalty: 0.06


In [5]:
from rashomon_importance_distribution import RashomonImportanceDistribution

mapping={
    0:[0],
    1:[1],
    2:[2],
    3:[3],
    4:[4],
    5:[5],
    6:[6],
    7:[7],
    8:[8],
    9:[9],
    10:[10],
    11:[11],
    12:[12]
   
}

RID = RashomonImportanceDistribution(
    input_df = df,
    binning_map = mapping,
    db = 5,
    lam = 0.05,
    eps = 0.2,
    vi_metric = 'sub_mr',
    dataset_name = 'binarized_data_set_all_sets_combined',
    n_resamples = 100,
    verbose = True,
    max_par_for_gosdt = 4
)

for v in range(15):
    print(f'Variable {v} ------------')

    print('Box and whiskers mean:', RID.mean(v))
    print('Box and whiskers median:', RID.median(v))
    print('Box and whiskers range:', RID.bwr(v))

Rashomon set has already been computed
Rashomon set has already been computed
Rashomon set has already been computed
Rashomon set has already been computedRashomon set has already been computed
Rashomon set has already been computed

Rashomon set has already been computed
Rashomon set has already been computed
Rashomon set has already been computed
Rashomon set has already been computed
Rashomon set has already been computed
Rashomon set has already been computedRashomon set has already been computed

Rashomon set has already been computedRashomon set has already been computed

Rashomon set has already been computed
Rashomon set has already been computed
Rashomon set has already been computed
Rashomon set has already been computed
Rashomon set has already been computed
Rashomon set has already been computed
Rashomon set has already been computed
Rashomon set has already been computed
Rashomon set has already been computed
Rashomon set has already been computed
Rashomon set has already 

KeyError: 13

In [None]:
for v in range(14):
    print(f'Variable {v} ------------')

    print('Box and whiskers mean:', RID.mean(v))
    print('Box and whiskers median:', RID.median(v))
    print('Box and whiskers range:', RID.bwr(v))

In [2]:
dfr = pd.read_csv("binarized_heart_disease.csv")

Xr = dfr.drop(columns=['num','Blood sugar', 'Diastolic blood pressure', 'Heart rate', 'sex_Male', 'thalch', 'Troponin', 'fbs', 'CK-MB', 'thal_reversable defect', 'slope_downsloping', 'restecg_lv hypertrophy', 'dataset_VA Long Beach', 'cp_atypical angina', 'thal_fixed defect', 'restecg_normal', 'dataset_Cleveland', 'cp_typical angina', 'slope_upsloping', 'dataset_Hungary', 'slope_flat', 'sex_Female', 'exang', 'restecg_st-t abnormality'])
yr = dfr['num']


In [3]:
from sklearn.model_selection import train_test_split

X_trainr, X_testr, y_trainr, y_testr = train_test_split(Xr,yr,test_size=0.2,random_state=42)

In [6]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import pandas as pd
import gosdt

# Step 1: Split the dataset into training, validation, and test sets
X_temp, X_test, y_temp, y_test = train_test_split(Xr, yr, test_size=0.15, random_state=42, stratify=yr)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.15/.85, random_state=42, stratify=y_temp)

# Stratified K-fold Cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Regularization values to test
regularization = [0.02]

# DataFrame to store results
hp_gridr = pd.DataFrame(columns=['regularization', 'average_test_accuracy'])

# Initialize variables for tracking the best model and metrics
best_model = None
best_val_acc = 0
all_train_acc = []
all_train_precision = []
all_train_recall = []
all_val_acc = []
all_val_precision = []
all_val_recall = []

# Iterate over each regularization value
for reg in regularization:
    cnt = 1
    test_accuracyr = []

    # Cross-validation loop
    for train_indexr, val_indexr in kf.split(X_train, y_train):
        print(f'Regularization: {reg}; Fold {cnt}; Train set: {len(train_indexr)}, Validation set: {len(val_indexr)}')

        # Create training and validation sets for the current fold
        kfX_trainr = X_train.iloc[train_indexr, :]
        kfX_valr = X_train.iloc[val_indexr, :]
        kfy_trainr = y_train.iloc[train_indexr]
        kfy_valr = y_train.iloc[val_indexr]

        # Configuration for the GOSDT model
        config = {
            "regularization": reg,
            "depth_budget": 5
        }

        # Instantiate and train the model
        model = gosdt.GOSDT(config)
        model.fit(kfX_trainr, kfy_trainr)

        # Calculate training and validation accuracies
        train_accr = model.score(kfX_trainr, kfy_trainr)
        val_accr = model.score(kfX_valr, kfy_valr)
        n_leavesr = model.leaves()
        n_nodesr = model.nodes()

        # Track the best model based on validation accuracy
        if val_accr > best_val_acc:
            best_val_acc = val_accr
            best_model = model

        # Collect metrics for later reporting
        all_train_acc.append(train_accr)
        all_train_precision.append(precision_score(kfy_trainr, model.predict(kfX_trainr), average='macro'))
        all_train_recall.append(recall_score(kfy_trainr, model.predict(kfX_trainr), average='macro'))

        all_val_acc.append(val_accr)
        all_val_precision.append(precision_score(kfy_valr, model.predict(kfX_valr), average='macro'))
        all_val_recall.append(recall_score(kfy_valr, model.predict(kfX_valr), average='macro'))

        # Store the result for the current regularization
        test_accuracyr.append(val_accr)
        print(f'Regularization: {reg}; Fold {cnt}; Training Accuracy: {train_accr}, Validation Accuracy: {val_accr}, '
              f'Number of Leaves: {n_leavesr}, Number of Nodes: {n_nodesr}\n')

        cnt += 1

    # Calculate mean validation accuracy across folds
    mean_val_accr = sum(test_accuracyr) / len(test_accuracyr)
    print(f'Mean Validation Accuracy across folds: {mean_val_accr}----------------------------------------------------------------------\n')

    # Store the result for the current regularization
    hp_gridr = pd.concat([hp_gridr, pd.DataFrame([[reg, mean_val_accr]], columns=hp_gridr.columns)], ignore_index=True)

# If the best model is found, evaluate it on the test set
if best_model is not None:
    # Get predictions on the test set
    test_preds = best_model.predict(X_test)

    # Calculate accuracy, precision, recall, and confusion matrix on the test set
    test_acc = best_model.score(X_test, y_test)
    test_precision = precision_score(y_test, test_preds, average='macro')
    test_recall = recall_score(y_test, test_preds, average='macro')
    cm = confusion_matrix(y_test, test_preds)

    # Print final results
    print("\n--- Final Results ---")
    print(f'Final Test Accuracy: {test_acc:.4f}')
    print(f'Test Precision (macro): {test_precision:.4f}')
    print(f'Test Recall (macro): {test_recall:.4f}')
    print(f'Confusion Matrix:\n{cm}')

    # Print training and validation metrics
    print("\n--- Summary of Train and Validation Metrics ---")
    print(f"Training Accuracy: {sum(all_train_acc) / len(all_train_acc):.4f}")
    print(f"Training Precision (macro): {sum(all_train_precision) / len(all_train_precision):.4f}")
    print(f"Training Recall (macro): {sum(all_train_recall) / len(all_train_recall):.4f}")
    
    print(f"Validation Accuracy: {sum(all_val_acc) / len(all_val_acc):.4f}")
    print(f"Validation Precision (macro): {sum(all_val_precision) / len(all_val_precision):.4f}")
    print(f"Validation Recall (macro): {sum(all_val_recall) / len(all_val_recall):.4f}")

    # Print model details
    print(f"# of leaves: {best_model.leaves()}")
    print(f"# of nodes: {best_model.nodes()}")
    print(f"Confusion Matrix: \n{best_model.confusion}")
    print(f"Tree structure: \n{best_model.tree}")


Regularization: 0.02; Fold 1; Train set: 188, Validation set: 21
gosdt reported successful execution
training completed. 0.000/0.000/0.012 (user, system, wall), mem=0 MB
bounds: [0.223617..0.223617] (0.000000) loss=0.143617, iterations=1072
Regularization: 0.02; Fold 1; Training Accuracy: 0.8563829787234043, Validation Accuracy: 0.8095238095238095, Number of Leaves: 4, Number of Nodes: 7

Regularization: 0.02; Fold 2; Train set: 188, Validation set: 21
gosdt reported successful execution
training completed. 0.000/0.000/0.011 (user, system, wall), mem=0 MB
bounds: [0.223617..0.223617] (0.000000) loss=0.143617, iterations=1019
Regularization: 0.02; Fold 2; Training Accuracy: 0.8563829787234043, Validation Accuracy: 0.8095238095238095, Number of Leaves: 4, Number of Nodes: 7

Regularization: 0.02; Fold 3; Train set: 188, Validation set: 21
gosdt reported successful execution
training completed. 0.000/0.000/0.012 (user, system, wall), mem=0 MB
bounds: [0.239574..0.239574] (0.000000) loss=0

In [25]:
print(hp_gridr.max())
print(hp_gridr)

regularization           0.020000
average_test_accuracy    0.851667
dtype: float64
   regularization  average_test_accuracy
0            0.02               0.851667


In [None]:
import gosdt

config = {
            "regularization": 0.002,
            "depth_budget": 0
        }
model = gosdt.GOSDT(config)
model.fit(X_trainr, y_trainr)

train_accr = model.score(X_trainr, y_trainr)
test_accr = model.score(X_testr, y_testr)
n_leavesr = model.leaves()
n_nodesr = model.nodes()

print(f'Training Accuracy: {train_accr}, Test Accuracy: {test_accr}, Number of Leaves: {n_leavesr}, Number of Nodes: {n_nodesr}\n')
print(model.tree)