In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
df = pd.read_csv("heart_train.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734 entries, 0 to 733
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             734 non-null    int64  
 1   Sex             734 non-null    object 
 2   ChestPainType   734 non-null    object 
 3   RestingBP       734 non-null    int64  
 4   Cholesterol     734 non-null    int64  
 5   FastingBS       734 non-null    int64  
 6   RestingECG      734 non-null    object 
 7   MaxHR           734 non-null    int64  
 8   ExerciseAngina  734 non-null    object 
 9   Oldpeak         734 non-null    float64
 10  ST_Slope        734 non-null    object 
 11  HeartDisease    734 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 68.9+ KB


In [4]:
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']
# classify the variables between numerical and categorical
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

In [5]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [6]:
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore', drop='first')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

In [7]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

In [8]:
model_pipeline = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', XGBClassifier(eval_metric='logloss'))
])

In [9]:
y.value_counts(normalize=True)

HeartDisease
1    0.553134
0    0.446866
Name: proportion, dtype: float64

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [12]:
y_train.value_counts(normalize=True)

HeartDisease
1    0.553663
0    0.446337
Name: proportion, dtype: float64

In [13]:
print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")


Training set size: 587
Validation set size: 147


In [14]:
model_pipeline.fit(X_train, y_train)
print("Model training complete.")

Model training complete.


In [15]:
print("\nEvaluating model on the validation set...")
y_pred_val = model_pipeline.predict(X_val)
y_pred_proba_val = model_pipeline.predict_proba(X_val)[:, 1]


Evaluating model on the validation set...


In [16]:
from sklearn.metrics import ( # Changed imports for classification metrics
    accuracy_score, confusion_matrix, classification_report,
    roc_auc_score, precision_score, recall_score, f1_score
)

In [17]:
accuracy = accuracy_score(y_val, y_pred_val)
precision = precision_score(y_val, y_pred_val)
recall = recall_score(y_val, y_pred_val)
f1 = f1_score(y_val, y_pred_val)
roc_auc = roc_auc_score(y_val, y_pred_proba_val)
conf_matrix = confusion_matrix(y_val, y_pred_val)
class_report = classification_report(y_val, y_pred_val)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

Accuracy: 0.8707
Precision: 0.8523
Recall: 0.9259
F1 Score: 0.8876
ROC AUC Score: 0.9076

Confusion Matrix:
 [[53 13]
 [ 6 75]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.80      0.85        66
           1       0.85      0.93      0.89        81

    accuracy                           0.87       147
   macro avg       0.88      0.86      0.87       147
weighted avg       0.87      0.87      0.87       147

