In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler



### Heart Disease Prediction Project

In [3]:
def inspect_dataset(heart_data:pd.DataFrame)->None:
    """Print a quick exploratory summary of the heart-disease dataset.

    Writes to stdout, in order: the first rows, the shape, the
    ``DataFrame.info()`` report, per-column null counts, ``describe()``
    statistics and the value counts of the ``'target'`` column.

    Args:
        heart_data: Dataset to summarise; must contain a ``'target'`` column.

    Raises:
        KeyError: If ``heart_data`` has no ``'target'`` column.
    """
    # Inspect what the dataset looks like
    print("Data head:\n",heart_data.head(),'\n')
    print("Data shape:\n",heart_data.shape,"\n\nData info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it must not be wrapped in print() (that would emit a stray "None").
    heart_data.info()
    print()

    # Check for missing values
    print("Null values:\n",heart_data.isnull().sum(),'\n')

    # Statistical measures about the data
    print("Statistics:\n",heart_data.describe(),'\n')

    # Distribution of the target variable
    print("Target distribution:\n",heart_data['target'].value_counts(),'\n')
    print("1: defective heart\n0: healthy heart")
    print("===========================================================================================")


In [4]:
def split_features_targets(
    heart_data: pd.DataFrame,
    target_col: str = 'target',
    test_size: float = 0.2,
    random_state: int = 2,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Separate features from the target and make a stratified train/test split.

    Args:
        heart_data: Full dataset, including the target column.
        target_col: Name of the label column (default ``'target'``).
        test_size: Fraction of rows held out for testing (default ``0.2``).
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible (default ``2``).

    Returns:
        ``(X_train, X_test, Y_train, Y_test)``: the feature splits are
        DataFrames, the target splits are Series.
    """
    # `columns=` already selects the axis, so no separate `axis=1` is needed.
    X = heart_data.drop(columns=target_col)
    Y = heart_data[target_col]

    # stratify=Y asks train_test_split to keep the class proportions of Y in both splits.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, stratify=Y, random_state=random_state
    )
    return X_train, X_test, Y_train, Y_test

In [5]:
def develop_model(X_train:pd.DataFrame, X_test:pd.DataFrame, Y_train:pd.Series, Y_test:pd.Series)->None:
    """Fit a scaled logistic-regression pipeline and print its test accuracy.

    Args:
        X_train: Training features.
        X_test: Held-out test features.
        Y_train: Training labels.
        Y_test: Held-out test labels.
    """
    # Keeping the scaler inside the pipeline means it is fitted on the
    # training split only; the test split is merely transformed with those
    # training statistics, so no test information leaks into the fit.
    pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=500))
    pipe.fit(X_train, Y_train)

    # Always evaluate through `pipe`, never the bare estimator, so the
    # features are scaled the same way they were during training.
    print("Accuracy on test data: ",pipe.score(X_test, Y_test))

In [6]:
def main():
    """Run the whole workflow: load the CSV, summarise it, split it, fit and score.

    Reads ``heart_disease_data.csv`` relative to the current working directory.
    """
    dataset = pd.read_csv('heart_disease_data.csv')

    # Exploratory printout first, then the modelling steps.
    inspect_dataset(dataset)
    splits = split_features_targets(dataset)
    develop_model(*splits)

In [7]:
# Entry point: run the workflow when this cell/script is executed directly.
if __name__ == '__main__':
    main()

Data head:
    age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   
3   56    1   1       120   236    0        1      178      0      0.8      2   
4   57    0   0       120   354    0        1      163      1      0.6      2   

   ca  thal  target  
0   0     1       1  
1   0     2       1  
2   0     2       1  
3   0     2       1  
4   0     2       1   

Data shape:
 (303, 14) 

Data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    