# **Heart Failure Classification**

## **Data Preparation Step**

* Import the needed libraries

In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

* Load dataset from a CSV file

In [59]:
def load_data(file_path):
    """Read the CSV at ``file_path`` and return its contents as a DataFrame.

    ``file_path`` is handed straight to ``pd.read_csv``.
    """
    dataset = pd.read_csv(filepath_or_buffer=file_path)
    return dataset
# Module-level copy of the raw dataset. None of the cells shown here read `data`;
# main() loads 'heart.csv' again on its own.
data = load_data('heart.csv')

* Split the dataset into features (X) and target (y)

In [60]:
def split_features_target(df, target_column):
    """Separate a DataFrame into its feature columns and its target column.

    Returns a ``(features, target)`` pair: ``features`` is ``df`` without
    ``target_column`` and ``target`` is that column on its own. ``df`` itself
    is left untouched.
    """
    return df.drop(columns=[target_column]), df[target_column]

* Split dataset into training, validation, and test sets while maintaining class distribution

In [61]:
def split_train_validation_test(X, y, train_size=0.7, test_size=0.2, val_size=0.1, random_state=42):
    """Split ``X``/``y`` into stratified train, validation and test partitions.

    The data is first divided into a training part and a hold-out part of
    fraction ``1 - train_size``; the hold-out part is then divided between
    validation and test in the ratio ``val_size : test_size``. Both steps
    stratify on the labels and use the same ``random_state``.

    Returns ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # Step 1: carve the training partition off from everything else.
    holdout_fraction = 1 - train_size
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=holdout_fraction, stratify=y, random_state=random_state
    )

    # Step 2: share the hold-out rows between validation and test.
    test_share_of_holdout = test_size / (test_size + val_size)
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout,
        y_holdout,
        test_size=test_share_of_holdout,
        stratify=y_holdout,
        random_state=random_state,
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

In [62]:
def split_train_validation_test(X, y, train_size=0.7, test_size=0.2, val_size=0.1, random_state=42):
    """Stratified train / validation / test split.

    NOTE(review): this cell re-defines the function already defined in the
    previous cell, with the same signature; being the later definition, it is
    the one in effect once both cells have run. One of the two cells can
    probably be dropped.

    Returns ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # Training rows versus the remaining (validation + test) rows.
    first_split = train_test_split(
        X, y, test_size=(1 - train_size), stratify=y, random_state=random_state
    )
    X_train, X_rest, y_train, y_rest = first_split

    # Fraction of the remaining rows that becomes the test set.
    test_fraction = test_size / (test_size + val_size)
    second_split = train_test_split(
        X_rest, y_rest, test_size=test_fraction, stratify=y_rest, random_state=random_state
    )
    X_val, X_test, y_val, y_test = second_split

    return X_train, X_val, X_test, y_train, y_val, y_test


* Print class distributions for training, validation, and test sets

In [63]:
def print_class_distribution(y_train, y_val, y_test):
    """Print the relative class frequencies of the three label Series.

    Each Series is summarised with ``value_counts(normalize=True)`` and printed
    under its own heading, in the order training, validation, test.
    """
    sections = (
        ("Training Class Distribution:\n", y_train),
        ("Validation Class Distribution:\n", y_val),
        ("Test Class Distribution:\n", y_test),
    )
    for heading, labels in sections:
        print(heading, labels.value_counts(normalize=True))

* Perform one-hot encoding for categorical columns

In [64]:
def encode_categorical_columns(df, categorical_columns):
    """One-hot encode the listed columns of ``df``.

    Every column named in ``categorical_columns`` is replaced by indicator
    columns stored as ``uint8``; the remaining columns are passed through.
    """
    encoded = pd.get_dummies(data=df, columns=categorical_columns, dtype='uint8')
    return encoded

* Standardize numerical features using StandardScaler

In [65]:
def standardize_features(df):
    """Return ``df`` with every column rescaled by a freshly fitted StandardScaler.

    The scaler is fit on ``df`` itself, so the statistics used for scaling come
    from whatever rows are passed in.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric feature frame to scale.

    Returns
    -------
    pd.DataFrame
        Scaled values with the same column names and the same row index as ``df``.
    """
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df)
    # Keep the caller's row labels: without index=df.index the result gets a fresh
    # 0..n-1 index and no longer lines up with a target Series that still carries
    # the original labels (e.g. after a shuffled train/test split).
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

* Main

In [66]:
def main():
    """Run the data-preparation steps and return the six split objects.

    Loads 'heart.csv', one-hot encodes the categorical feature columns, makes
    the stratified train/validation/test split and prints a few diagnostics
    along the way (head of the raw data, encoded column names, class
    distributions, head of a standardized copy of the features).

    Returns ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # Raw data
    raw_df = load_data("heart.csv")
    print(raw_df.head())

    # Features / target
    features, target = split_features_target(raw_df, 'HeartDisease')

    # One-hot encode the categorical feature columns
    features_encoded = encode_categorical_columns(
        features,
        ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'],
    )
    print("Encoded Columns:", features_encoded.columns)

    # Stratified train / validation / test partitions
    X_train, X_val, X_test, y_train, y_val, y_test = split_train_validation_test(
        features_encoded, target
    )

    # Check that the class balance survived the split
    print_class_distribution(y_train, y_val, y_test)

    # Preview only: the standardized copy is built from the full encoded frame and
    # printed; the splits returned below are not standardized.
    features_standardized = standardize_features(features_encoded)
    print(features_standardized.head())

    return X_train, X_val, X_test, y_train, y_val, y_test


X_train, X_val, X_test, y_train, y_val, y_test = main()

   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  
Encoded Columns: Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
       'Sex_F', 'Sex_M', 'ChestPainType_ASY', 'ChestPainType_ATA',
       'ChestPainType_NAP', 'ChestPainType_TA', 

## **Decision Tree**

### **Building the model**

In [None]:
class DecisionTreeClassifier:
    class Node:
        def __init__(
            self,
            feature : int  | None = None,
            threshold: float | None = None,
            predicted_class: int | None  = None,
            depth: int | None = None,
            left: "Node | None" = None,
            right: "Node | None" = None,
        ):
            self.feature = feature
            self.threshold = threshold  # <= threshold goes to left, > threshold goes to right
            self.left = left
            self.right = right
            self.predicted_class = predicted_class
            self.depth = depth

    def __init__(self, max_depth: int = 4, min_samples_split: int =10):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X : pd.DataFrame, y : pd.Series):
        self.root = self._build_decision_tree(X, y, 0)

    def predict(self, X : pd.DataFrame) -> list[int]: # classification of each input
        predictions_list = [ self._traverse_tree(row, self.root) for _, row in X.iterrows() ]
        return predictions_list

    def _build_decision_tree(self, X : pd.DataFrame, y : pd.Series, curr_depth : int) -> Node:
        if (curr_depth == self.max_depth or len(set(y)) == 1 or self.min_samples_split > len(y)):
            return self.Node(predicted_class=Counter(y).most_common(1)[0][0], depth=curr_depth)

        feature, threshold = self._best_split(X, y)
        tree_node = self.Node(feature= feature,threshold=threshold,depth=curr_depth)

        X_left_split = X.loc[X.iloc[:, feature] <= threshold, :]
        y_left_split = y[X.iloc[:, feature] <= threshold]
        X_right_split = X.loc[X.iloc[:, feature] > threshold, :]
        y_right_split = y[X.iloc[:, feature] > threshold]

        tree_node.left = self._build_decision_tree(X_left_split, y_left_split, curr_depth + 1)
        tree_node.right = self._build_decision_tree(X_right_split, y_right_split, curr_depth + 1)
        return tree_node

    def _best_split(self, X : pd.DataFrame, y : pd.Series) -> tuple: # feature and thresold are integers
        unique_values_list = [np.unique(X.iloc[:, i].values) for i in range(X.shape[1])]
        mid_points_list = [values[:-1] + np.diff(values) / 2 for values in unique_values_list]

        max_info_gain = {"infoGain": -float("inf"), "feature": -1, "threshold": None}
        for i in range(X.shape[1]):
            local_max_info_gain = { "infoGain": -float("inf"), "feature": -1, "threshold": None}
            for split in mid_points_list[i]:
                left_split = y[X.iloc[:, i] <= split]
                right_split = y[X.iloc[:, i] > split]
                info_gain = self._information_gain(y, left_split, right_split)
                if info_gain > local_max_info_gain["infoGain"]:
                    local_max_info_gain = { "infoGain": info_gain, "feature": i, "threshold": split}
            if local_max_info_gain["infoGain"] > max_info_gain["infoGain"]:
                max_info_gain = local_max_info_gain

        return max_info_gain["feature"], max_info_gain["threshold"]

    def _traverse_tree(self, x : pd.Series, node : Node) -> int: # returns a class
        if node.predicted_class is not None:
            return node.predicted_class
        return self._traverse_tree(x, node.left) if x[node.feature] <= node.threshold else self._traverse_tree(x, node.right)

    def _information_gain(self, y : pd.Series, y1 : pd.Series, y2 : pd.Series) -> float:
        y_entropy = self._entropy(y)
        y_entropy_after_split_1 = self._entropy(y1)
        y_entropy_after_split_2 = self._entropy(y2)
        return (
            y_entropy
            - (len(y1) / len(y)) * y_entropy_after_split_1
            - (len(y2) / len(y)) * y_entropy_after_split_2
        )

    def _entropy(self, y : pd.Series) -> float | int:
        zeroes_number = (y == 0).sum()
        ones_number = (y == 1).sum()
        probs = np.array([ones_number / len(y), zeroes_number / len(y)])
        return -np.sum(probs * np.log2(np.where(probs == 0, 1, probs)))


def report_accuracy(model, X, y, split_name, set_label):
    """Predict ``X`` with ``model``, print the accuracy against ``y`` and return the details.

    Parameters
    ----------
    model : object with a ``predict(X)`` method returning one label per row.
    X, y : features and true labels of the split being evaluated.
    split_name : prefix of the printed line (e.g. "Validation").
    set_label : name used in the "... set accuracy" part of the printed line.

    Returns
    -------
    tuple
        ``(predictions, n_correct)``.
    """
    predictions = model.predict(X)
    # Predictions come back in row order, so compare them with y position by position.
    n_correct = sum(int(predicted == actual) for predicted, actual in zip(predictions, y))
    print(f"{split_name} => correct : {n_correct}, {set_label} set accuracy: {n_correct / len(y)}")
    return predictions, n_correct


cls = DecisionTreeClassifier()
cls.fit(X_train, y_train)

# Each split is reported under its own name (the validation and test lines used to
# be labelled "training set accuracy"). After the loop, predict_list and correct
# hold the test-set results.
for split_name, set_label, X_split, y_split in (
    ("Train", "training", X_train, y_train),
    ("Validation", "validation", X_val, y_val),
    ("Test", "test", X_test, y_test),
):
    predict_list, correct = report_accuracy(cls, X_split, y_split, split_name, set_label)

Train => correct : 549, training set accuracy: 0.8551401869158879
Validation => correct : 80, training set accuracy: 0.8695652173913043
Test => correct : 155, training set accuracy: 0.842391304347826


### **Testing the model to make predictions**

### **Evaluating the model**

### **Visualization**

In [68]:
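# NOTE: `df_encoded` only exists inside main(), so main() would have to return it
# (or the encoding step would have to be repeated here) before these plots can be enabled.
# plt.figure(figsize=(15,8))
# sns.heatmap(df_encoded.corr(), annot=True, fmt='.1f')
# df_encoded.hist(figsize=(20,20))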