# Data Preparation Step

* Import the needed libraries

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense
from tensorflow.keras import callbacks

In [7]:
class HeartDiseaseData:
    """Load a CSV dataset and prepare it for model training.

    The pipeline (see ``preprocess_data``) loads the CSV, separates the target
    column, one-hot encodes the categorical columns, creates stratified
    train/validation/test splits and builds a standardized copy of the encoded
    feature table.

    Attributes are ``None`` until the corresponding step has been run.
    """

    def __init__(self, data_path):
        """Remember ``data_path`` (passed to ``pd.read_csv``) and reset all state."""
        self.data_path = data_path
        self.df = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_val = None
        self.X_test = None
        self.y_train = None
        self.y_val = None
        self.y_test = None
        self.df_encoded = None
        self.df_standardized = None

    def load_data(self):
        """Read the CSV at ``self.data_path`` into ``self.df`` and print its head."""
        self.df = pd.read_csv(self.data_path)
        print(self.df.head())

    def split_features_target(self, target_column):
        """Store every column except ``target_column`` in ``self.X`` and the target in ``self.y``."""
        self.X = self.df.drop(columns=[target_column])
        self.y = self.df[target_column]

    def encode_categorical_columns(self, categorical_columns):
        """One-hot encode ``categorical_columns`` of ``self.X`` into ``self.df_encoded``."""
        self.df_encoded = pd.get_dummies(self.X, columns=categorical_columns, dtype='uint8')
        print("Encoded Columns:", self.df_encoded.columns)

    @staticmethod
    def _split_fractions(train_size, test_size, val_size):
        """Translate the three split sizes into the two fractions needed for a two-stage split.

        The sizes are treated as relative weights (they are normalised by their
        sum), so ``(0.7, 0.2, 0.1)`` and ``(7, 2, 1)`` are equivalent.

        Returns:
            ``(test_fraction, val_fraction)`` where ``test_fraction`` is the share
            of the whole dataset held out for testing and ``val_fraction`` is the
            share of the *remaining* rows held out for validation.

        Raises:
            ValueError: if any size is not strictly positive.
        """
        if min(train_size, test_size, val_size) <= 0:
            raise ValueError(
                "train_size, test_size and val_size must all be positive, got "
                f"{train_size!r}, {test_size!r}, {val_size!r}"
            )
        total = train_size + test_size + val_size
        test_fraction = test_size / total
        # After the test rows are removed only train + val remain, so the
        # validation share has to be expressed relative to that remainder.
        val_fraction = val_size / (train_size + val_size)
        return test_fraction, val_fraction

    def split_train_validation_test(self, train_size=0.7, test_size=0.2, val_size=0.1, random_state=42):
        """Create stratified train/validation/test splits of the encoded features.

        Args:
            train_size: relative share of rows used for training.
            test_size: relative share of rows used for testing.
            val_size: relative share of rows used for validation.
            random_state: seed forwarded to both ``train_test_split`` calls.

        Raises:
            ValueError: if any size is not strictly positive.
        """
        test_fraction, val_fraction = self._split_fractions(train_size, test_size, val_size)

        # Stage 1: hold out the test set from the full dataset.
        X_temp, self.X_test, y_temp, self.y_test = train_test_split(
            self.df_encoded, self.y, test_size=test_fraction, stratify=self.y, random_state=random_state
        )
        # Stage 2: carve the validation set out of what is left.
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            X_temp, y_temp, test_size=val_fraction, stratify=y_temp, random_state=random_state
        )

    def print_class_distribution(self):
        """Print the normalised target class frequencies of each split."""
        print("Training Class Distribution:\n", self.y_train.value_counts(normalize=True))
        print("Validation Class Distribution:\n", self.y_val.value_counts(normalize=True))
        print("Test Class Distribution:\n", self.y_test.value_counts(normalize=True))

    def standardize_features(self):
        """Store a ``StandardScaler``-transformed copy of ``self.df_encoded`` in ``self.df_standardized``.

        NOTE(review): the scaler is fit on the *entire* encoded table, so
        ``df_standardized`` carries statistics from validation/test rows, and the
        stored ``X_train``/``X_val``/``X_test`` splits are left unscaled. Confirm
        this is intended before training on ``df_standardized``.
        """
        scaler = StandardScaler()
        self.df_standardized = pd.DataFrame(
            scaler.fit_transform(self.df_encoded),
            columns=self.df_encoded.columns,
            # Keep row labels aligned with df_encoded / y instead of resetting them.
            index=self.df_encoded.index,
        )
        print(self.df_standardized.head())

    def preprocess_data(self, target_column, categorical_columns):
        """Run the full pipeline: load, split off target, encode, split sets, report, standardize."""
        self.load_data()
        self.split_features_target(target_column)
        self.encode_categorical_columns(categorical_columns)
        self.split_train_validation_test()
        self.print_class_distribution()
        self.standardize_features()


In [8]:
# Dataset location and the columns that need special handling
data_path = "heart.csv"
target_column = 'HeartDisease'
categorical_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# Build the processor and run the whole preprocessing pipeline in one call
data_processor = HeartDiseaseData(data_path)
data_processor.preprocess_data(target_column, categorical_columns)

# Expose the prepared splits as notebook-level variables
X_train, X_val, X_test = (
    data_processor.X_train,
    data_processor.X_val,
    data_processor.X_test,
)
y_train, y_val, y_test = (
    data_processor.y_train,
    data_processor.y_val,
    data_processor.y_test,
)

# Standardized copy of the full encoded feature table
df_standardized = data_processor.df_standardized

   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  
Encoded Columns: Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
       'Sex_F', 'Sex_M', 'ChestPainType_ASY', 'ChestPainType_ATA',
       'ChestPainType_NAP', 'ChestPainType_TA', 

# Bonus section

## Simple Feedforward Neural Network

In [None]:
class TrainingCallbacks:
    """Hold the settings for the early-stopping and learning-rate callbacks.

    Args:
        early_stopping_patience: epochs without improvement before training stops.
        reduce_lr_patience: epochs without improvement before the learning rate is reduced.
        reduce_lr_factor: ``factor`` passed to ``ReduceLROnPlateau``.
        min_lr: ``min_lr`` passed to ``ReduceLROnPlateau``.
    """

    def __init__(self, early_stopping_patience=20, reduce_lr_patience=3,
                  reduce_lr_factor=0.2, min_lr=1e-5):
        # Early-stopping setting
        self.early_stopping_patience = early_stopping_patience
        # Learning-rate reduction settings
        self.reduce_lr_patience, self.reduce_lr_factor = reduce_lr_patience, reduce_lr_factor
        self.min_lr = min_lr

    def get_callbacks(self):
        """Return ``[EarlyStopping, ReduceLROnPlateau]``, both monitoring validation accuracy."""
        # Both callbacks watch the same metric, which should be maximised.
        shared_settings = {"monitor": 'val_accuracy', "mode": 'max', "verbose": 1}

        # Stop once validation accuracy stalls and roll back to the best weights.
        early_stopper = callbacks.EarlyStopping(
            patience=self.early_stopping_patience,
            restore_best_weights=True,
            **shared_settings,
        )

        # Reduce the learning rate on a plateau, never going below min_lr.
        lr_reducer = callbacks.ReduceLROnPlateau(
            factor=self.reduce_lr_factor,
            patience=self.reduce_lr_patience,
            min_lr=self.min_lr,
            **shared_settings,
        )

        return [early_stopper, lr_reducer]


In [9]:
class FeedforwardNeuralNetwork:
    """Binary classifier with one ReLU hidden layer and a sigmoid output unit.

    Args:
        input_size: number of input features per sample.
        hidden_neurons: number of units in the hidden layer.
        learning_rate: learning rate given to the Adam optimizer.
    """

    def __init__(self, input_size, hidden_neurons = 16, learning_rate = 0.001):
        self.input_size = input_size
        self.hidden_neurons = hidden_neurons
        self.learning_rate = learning_rate
        # The model is built and compiled immediately so the instance is ready to train.
        self.model = self.build_model()

    def build_model(self):
        """Build and compile the Keras ``Sequential`` model.

        Returns:
            The compiled model (binary cross-entropy loss, accuracy metric).
        """
        model = Sequential()
        # Hidden layer
        model.add(Dense(self.hidden_neurons, activation='relu', input_shape=(self.input_size,)))
        # Output layer
        model.add(Dense(1, activation='sigmoid'))
        # Use an explicit optimizer instance so the configured learning rate is
        # honoured; the string 'adam' would always use Adam's default rate.
        optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
        return model

    def train(self, X_train, y_train, X_val, y_val, epochs=50, batch_size=32, callbacks=None):
        """Fit the model, validating on ``(X_val, y_val)`` after every epoch.

        Args:
            X_train, y_train: training features and labels.
            X_val, y_val: validation features and labels.
            epochs: number of training epochs.
            batch_size: samples per gradient update.
            callbacks: optional list of Keras callbacks forwarded to ``fit``
                (this parameter shadows any module-level ``callbacks`` name
                inside this method only).

        Returns:
            The history object returned by ``model.fit``.
        """
        history = self.model.fit(
            X_train, y_train,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val, y_val),
            callbacks=callbacks,
            verbose=1
        )
        return history

    def evaluate(self, X_test, y_test):
        """Evaluate on the given data, print the results and return ``(loss, accuracy)``."""
        loss, accuracy = self.model.evaluate(X_test, y_test, verbose=0)
        print("Test loss:", loss)
        print("Test accuracy:", accuracy)
        return loss, accuracy

    def predict(self, X):
        """Return the model's predictions for ``X`` (output of ``model.predict``)."""
        return self.model.predict(X)
