> Baseline model

## BASE MODEL - Logistic Regression


In [108]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, classification_report
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

### Loading Data

In [109]:
# Read the processed dataset from disk and preview its first rows.
processed_csv_path = '../data/processed/data_abnormal_values_treated_scaled.csv'
data = pd.read_csv(processed_csv_path)
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,0,-1.43314,M,ATA,0.463654,0.876016,0,Normal,1.38408,N,-0.851276,Up,0
1,1,-0.478484,F,NAP,1.641229,-1.194233,0,Normal,0.75461,N,0.118532,Flat,1
2,2,-1.751359,M,ATA,-0.125133,0.762057,0,ST,-1.527219,N,-0.851276,Up,0
3,3,-0.584556,F,ASY,0.345897,-0.548467,0,Normal,-1.133801,Y,0.603436,Flat,1
4,4,0.051881,M,NAP,1.052442,-0.909337,0,Normal,-0.583014,N,-0.851276,Up,0


## Separate X and y

In [110]:
# Separate the feature matrix X from the target vector y.
#
# Columns are selected by name rather than by position, so the split does not
# silently change if the CSV layout changes: the target column and any index
# artefacts that pandas labels "Unnamed: ..." are excluded from the features.
target_column = 'HeartDisease'

is_target = data.columns == target_column
is_index_artifact = data.columns.astype(str).str.startswith('Unnamed')

# Kept as a pandas Index (same type as the previous positional slice).
feature_columns = data.columns[~(is_target | is_index_artifact)]

X = data[feature_columns]
y = data[target_column]

In [111]:
# Show how many rows fall into each target class.
y.value_counts()

HeartDisease
1    508
0    410
Name: count, dtype: int64


## Feature Engineering
Ordinal Encoding

Use OrdinalEncoder to assign an integer to each category 

Categorical features: Sex, ChestPainType, FastingBS, RestingECG, ExerciseAngina, ST_Slope


In [112]:
# Column groups routed to the two preprocessing branches below.
cat_cols = ['Sex','ChestPainType','FastingBS','RestingECG','ExerciseAngina','ST_Slope']
num_cols = [col for col in X.columns if col not in cat_cols]


# Numerical branch: fill missing values with the constant 0, then standardize
# (scaling is best practice for logistic regression).
# NOTE(review): the input file name says the data is already scaled, so 0 is
# presumably close to each column's mean -- confirm against the upstream
# preprocessing step before relying on this.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())
])

# Categorical branch: fill missing values with the most frequent category,
# then map each category to an integer code.
# Categories that were not seen during fit (e.g. absent from one
# cross-validation training fold, or new at inference time) are encoded as -1
# instead of raising an error at transform time.
# NOTE(review): integer codes impose an arbitrary order on nominal features
# such as ChestPainType; one-hot encoding is usually a better fit for a linear
# model. Switching would also require updating any code that rebuilds column
# names from cat_cols.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Combine both branches. Output columns follow the order of `transformers`:
# the categorical block first, then the numerical block.
preproc = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, cat_cols),
        ('num', numerical_transformer, num_cols)
    ],
    # Columns listed in neither group are dropped.
    remainder='drop'
)
preproc

In [113]:
# Fit the preprocessor on the full feature frame and transform it so the
# encoded / scaled values can be inspected.
X_transformed = preproc.fit_transform(X)

# The transformer outputs the categorical block first, followed by the
# remaining columns, so the labels are rebuilt in that same order.
non_categorical = [col for col in X.columns if col not in cat_cols]
new_columns = cat_cols + non_categorical
X_transformed_df = pd.DataFrame(data=X_transformed, columns=new_columns)

X_transformed_df.head()

Unnamed: 0,Sex,ChestPainType,FastingBS,RestingECG,ExerciseAngina,ST_Slope,Age,RestingBP,Cholesterol,MaxHR,Oldpeak
0,1.0,1.0,0.0,1.0,0.0,2.0,-1.43314,0.463654,0.971771,1.38408,-0.851276
1,0.0,2.0,0.0,1.0,0.0,1.0,-0.478484,1.641229,-1.324772,0.75461,0.118532
2,1.0,1.0,0.0,2.0,0.0,2.0,-1.751359,-0.125133,0.845356,-1.527219,-0.851276
3,0.0,0.0,0.0,1.0,1.0,1.0,-0.584556,0.345897,-0.608419,-1.133801,0.603436
4,1.0,2.0,0.0,1.0,0.0,2.0,0.051881,1.052442,-1.008734,-0.583014,-0.851276


## Pipeline

In [114]:
# Chain preprocessing and the logistic-regression classifier into a single
# estimator so both are always fitted together.
baseline_steps = [
    ('Feature Engineering', preproc),
    ('classifier', LogisticRegression(random_state=42)),
]
model_pipeline = Pipeline(steps=baseline_steps)

model_pipeline

## Train / Test Split

In [115]:
# Hold out a test set: 80% of rows go to training, rows are shuffled to avoid
# ordering effects, and the split is stratified on y so both parts keep the
# same class distribution. The seed makes the split reproducible.
split_options = dict(
    train_size=0.8,
    shuffle=True,
    stratify=y,
    random_state=42,
)

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Cross Validation

In [116]:

# Metrics computed on every cross-validation fold.
scoring = [
    'accuracy',
    'precision',
    'recall',
    'f1',
    'roc_auc'
]

# 5-fold cross-validation on the training split only; the held-out test set
# is not touched here.
result_dict = cross_validate(model_pipeline, X_train, y_train, cv=5, scoring=scoring)

result = pd.DataFrame(result_dict)
print("\nCross-Validation Results:")
# Leave the frame as the cell's last expression so the notebook renders it as
# a table; print() wraps wide frames across several text blocks.
result


Cross-Validation Results:
   fit_time  score_time  test_accuracy  test_precision  test_recall   test_f1  \
0  0.037488    0.020547       0.877551        0.890244     0.890244  0.890244   
1  0.012473    0.019775       0.857143        0.884615     0.851852  0.867925   
2  0.104532    0.050429       0.829932        0.868421     0.814815  0.840764   
3  0.009674    0.032005       0.843537        0.871795     0.839506  0.855346   
4  0.016666    0.016723       0.787671        0.784091     0.851852  0.816568   

   test_roc_auc  
0      0.953659  
1      0.927984  
2      0.920127  
3      0.895249  
4      0.844255  


## Model Performance

In [117]:
# Fit the full pipeline (preprocessing + classifier) on the training split.
print(f"\nTraining Logistic Regression model...")
model_pipeline.fit(X_train, y_train)
print("Model training complete.")

# Score the held-out test split: hard labels for accuracy / F1, and the
# second predict_proba column as the score for ROC AUC.
y_pred = model_pipeline.predict(X_test)
y_proba = model_pipeline.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
f1 = f1_score(y_test, y_pred)

# Separator lines reused throughout the report.
heavy_rule = "=" * 40
light_rule = "-" * 40

print("\n" + heavy_rule)
print("BASELINE MODEL PERFORMANCE (Logistic Regression)")
print(heavy_rule)
print(f"Accuracy:        {accuracy:.4f}")
print(f"AUC Score:       {auc:.4f}")
print(f"F1 Score:        {f1:.4f}")
print(light_rule)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print(light_rule)




Training Logistic Regression model...
Model training complete.

BASELINE MODEL PERFORMANCE (Logistic Regression)
Accuracy:        0.8424
AUC Score:       0.9002
F1 Score:        0.8599
----------------------------------------

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.80      0.82        82
           1       0.85      0.87      0.86       102

    accuracy                           0.84       184
   macro avg       0.84      0.84      0.84       184
weighted avg       0.84      0.84      0.84       184

----------------------------------------
