In [8]:
import numpy as np
import pandas as pd
from AdaBoost import AdaBoost 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
%run DecisionTree.py


In [9]:
# Load the heart-disease dataset from the working directory
data = pd.read_csv('heart.csv')

# Binary encoding for 'Sex' and 'ExerciseAngina'.
# Series.map replaces every value that is missing from the dict with NaN, so
# fail loudly when a non-null raw value has no code instead of letting NaN
# flow silently into the feature matrix.
binary_codes = {
    'Sex': {'M': 1, 'F': 0},
    'ExerciseAngina': {'Y': 1, 'N': 0},
}
for col, codes in binary_codes.items():
    mapped = data[col].map(codes)
    unmapped = data.loc[data[col].notna() & mapped.isna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Unexpected values in column '{col}': {list(unmapped)}"
        )
    data[col] = mapped

# Extract features (every column except the target) and labels (the target)
X = data.drop(columns=['HeartDisease'])
y = data['HeartDisease']

# Preview the first rows of the features and of the labels
print(X.head())
print(y.head())



   Age  Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  \
0   40    1           ATA        140          289          0     Normal   
1   49    0           NAP        160          180          0     Normal   
2   37    1           ATA        130          283          0         ST   
3   48    0           ASY        138          214          0     Normal   
4   54    1           NAP        150          195          0     Normal   

   MaxHR  ExerciseAngina  Oldpeak ST_Slope  
0    172               0      0.0       Up  
1    156               0      1.0     Flat  
2     98               0      0.0       Up  
3    108               1      1.5     Flat  
4    122               0      0.0       Up  
0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64


In [10]:
# One-Hot Encoding for non-binary categorical features.
# Note: this cell rebinds X and y to NumPy arrays at the end, so the cell that
# builds X and y must be re-run before this one is executed again.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
encoder = OneHotEncoder(sparse_output=False)

if categorical_cols:  # Only encode if there are categorical features
    encoded_array = encoder.fit_transform(X[categorical_cols])
    # Reuse X's index: pd.concat(axis=1) aligns rows by index label, so a
    # frame with a fresh 0..n-1 index would be misaligned (and NaN-padded)
    # whenever X's own index is anything else.
    encoded_df = pd.DataFrame(
        encoded_array,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=X.index,
    )
    X = pd.concat([X.drop(columns=categorical_cols), encoded_df], axis=1)

# Convert the feature frame and the label series to NumPy arrays
X = X.to_numpy()
y = y.to_numpy()

print(X)
print(y)

[[ 40.   1. 140. ...   0.   0.   1.]
 [ 49.   0. 160. ...   0.   1.   0.]
 [ 37.   1. 130. ...   0.   0.   1.]
 ...
 [ 57.   1. 130. ...   0.   1.   0.]
 [ 57.   0. 130. ...   0.   1.   0.]
 [ 38.   1. 138. ...   0.   0.   1.]]
[0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 1
 0 0 0 0 1 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 1 0
 1 0 1 0 0 1 0 0 1 0 1 1 1 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0
 1 0 0 0 1 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 0 1 1 1 1 1 0 1 0 0 0
 0 1 0 0 0 0 0 1 1 0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 0 0
 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 1 0 1 0 0 0 1 1
 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0 0 0 0
 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1
 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1

In [11]:
# Two-stage stratified split of (X, y) into train / validation / test.
#   Stage 1: half of the rows go to training, the other half is held out.
#   Stage 2: the held-out half is divided 1/3 validation : 2/3 test.
# Overall this is roughly 50% train, 17% validation and 33% test.
stage_one = train_test_split(X, y, test_size=0.5, stratify=y, random_state=42)
X_train, X_temp, y_train, y_temp = stage_one

stage_two = train_test_split(
    X_temp, y_temp, test_size=2/3, stratify=y_temp, random_state=42
)
X_val, X_test, y_val, y_test = stage_two


In [15]:
# Build the AdaBoost model and fit it on the training split.
# 50 is the constructor's only argument; presumably the number of boosting
# rounds / weak learners -- TODO confirm against AdaBoost.py.
adaboost_arg = 50
model = AdaBoost(adaboost_arg)
model.fit(X_train, y_train)

print("Training Complete!")

Training Complete!


In [16]:
# Predict on the validation split; accuracy is the share of predictions that
# equal the true label.
val_set = model.predict(X_val)
val_matches = val_set == y_val
accuracy = val_matches.sum() / len(y_val)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.8301


In [17]:
# Predict on the held-out test split; accuracy is the share of predictions
# that equal the true label.
test_set = model.predict(X_test)
test_matches = test_set == y_test
accuracy = test_matches.sum() / len(y_test)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.8660
