In [1]:
import pandas as pd
import sklearn
import numpy as np

In [2]:
df = pd.read_csv("heart_train.csv")

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734 entries, 0 to 733
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             734 non-null    int64  
 1   Sex             734 non-null    object 
 2   ChestPainType   734 non-null    object 
 3   RestingBP       734 non-null    int64  
 4   Cholesterol     734 non-null    int64  
 5   FastingBS       734 non-null    int64  
 6   RestingECG      734 non-null    object 
 7   MaxHR           734 non-null    int64  
 8   ExerciseAngina  734 non-null    object 
 9   Oldpeak         734 non-null    float64
 10  ST_Slope        734 non-null    object 
 11  HeartDisease    734 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 68.9+ KB


In [3]:
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']
# classify the variables between numerical and categorical
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

In [None]:
# Preprocessing

In [4]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [5]:
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore', drop='first') # drop='first' can help reduce multicollinearity

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough' # Keep any columns not specified (though we specified all here)
)


Creating Linear regression model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [None]:
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),('classifier', LogisticRegression(random_state=42, max_iter=1000, solver='liblinear'))])

Splitting the dataset into smaller testcases to train our model on

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
y.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
HeartDisease,Unnamed: 1_level_1
1,0.553134
0,0.446866


In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
y_train.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
HeartDisease,Unnamed: 1_level_1
1,0.553663
0,0.446337


In [None]:
y_val.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
HeartDisease,Unnamed: 1_level_1
1,0.55102
0,0.44898


In [None]:
print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")

print("\nTraining the Logistic Regression model...")
model_pipeline.fit(X_train, y_train)
print("Model training complete.")


Training set size: 587
Validation set size: 147

Training the Logistic Regression model...
Model training complete.


In [None]:
print("\nEvaluating model on the validation set...")
y_pred_val = model_pipeline.predict(X_val)
y_pred_proba_val = model_pipeline.predict_proba(X_val)[:, 1]


Evaluating model on the validation set...


In [None]:
from sklearn.metrics import ( # Changed imports for classification metrics
    accuracy_score, confusion_matrix, classification_report,
    roc_auc_score, precision_score, recall_score, f1_score
)

In [None]:
accuracy = accuracy_score(y_val, y_pred_val)
precision = precision_score(y_val, y_pred_val)
recall = recall_score(y_val, y_pred_val)
f1 = f1_score(y_val, y_pred_val)
roc_auc = roc_auc_score(y_val, y_pred_proba_val)
conf_matrix = confusion_matrix(y_val, y_pred_val)
class_report = classification_report(y_val, y_pred_val)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

Accuracy: 0.8163
Precision: 0.8140
Recall: 0.8642
F1 Score: 0.8383
ROC AUC Score: 0.9031

Confusion Matrix:
 [[50 16]
 [11 70]]

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.76      0.79        66
           1       0.81      0.86      0.84        81

    accuracy                           0.82       147
   macro avg       0.82      0.81      0.81       147
weighted avg       0.82      0.82      0.82       147



In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__max_iter'  : [100,1000,2500,5000]
}

grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'classifier__C': 0.1, 'classifier__max_iter': 100, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Best score: 0.8568912496505451


In [None]:
from sklearn.feature_selection import SelectFromModel
selector_model = LogisticRegression(penalty='l1', solver='liblinear', C=0.1, random_state=42, max_iter=100)
feature_selector = SelectFromModel(estimator=selector_model)

In [None]:
selector = model_pipeline.named_steps['feature_selection']
mask = selector.get_support()

# Assuming you're using pandas and your preprocessor outputs feature names
feature_names = model_pipeline.named_steps['preprocessor'].get_feature_names_out()
selected_features = feature_names[mask]

print("Selected Features:\n", selected_features)

Selected Features:
 ['num__Cholesterol' 'num__FastingBS' 'num__MaxHR' 'num__Oldpeak'
 'cat__Sex_M' 'cat__ChestPainType_ATA' 'cat__ChestPainType_NAP'
 'cat__ExerciseAngina_Y' 'cat__ST_Slope_Flat' 'cat__ST_Slope_Up']


In [None]:
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selector),
    ('classifier', LogisticRegression(random_state=42, max_iter=100, solver='liblinear'))
])

In [None]:
print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")

print("\nTraining the Logistic Regression model...")
model_pipeline.fit(X_train, y_train)
print("Model training complete.")


Training set size: 587
Validation set size: 147

Training the Logistic Regression model...
Model training complete.


In [None]:
print("\nEvaluating model on the validation set...")
y_pred_val = model_pipeline.predict(X_val)
y_pred_proba_val = model_pipeline.predict_proba(X_val)[:, 1]


Evaluating model on the validation set...


In [None]:
accuracy = accuracy_score(y_val, y_pred_val)
precision = precision_score(y_val, y_pred_val)
recall = recall_score(y_val, y_pred_val)
f1 = f1_score(y_val, y_pred_val)
roc_auc = roc_auc_score(y_val, y_pred_proba_val)
conf_matrix = confusion_matrix(y_val, y_pred_val)
class_report = classification_report(y_val, y_pred_val)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

Accuracy: 0.8299
Precision: 0.8256
Recall: 0.8765
F1 Score: 0.8503
ROC AUC Score: 0.9067

Confusion Matrix:
 [[51 15]
 [10 71]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.77      0.80        66
           1       0.83      0.88      0.85        81

    accuracy                           0.83       147
   macro avg       0.83      0.82      0.83       147
weighted avg       0.83      0.83      0.83       147

