In [1]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

## Step 1: Read in the dataset

In [23]:
# Read the usage_stats.csv file from the Resources folder into a Pandas DataFrame
heart_df = pd.read_csv(
    Path("Resources/heart.csv")
)

# Review the DataFrame
display(heart_df.head())
display(heart_df.tail())

Unnamed: 0,age,sex (0= F ;1=M),chest_pain_type,resting_blood_pressure (in mm Hg),cholesterol,fasting_blood_sugar > 120 mg/dl,resting_ecg_result,max_heart_rate_achieved,exercise_induced_angina (1 = Y ; 0= N),oldpeak,slp,caa,thall,output (0 = less chance of HA ; 1 = more chance of HA)
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Unnamed: 0,age,sex (0= F ;1=M),chest_pain_type,resting_blood_pressure (in mm Hg),cholesterol,fasting_blood_sugar > 120 mg/dl,resting_ecg_result,max_heart_rate_achieved,exercise_induced_angina (1 = Y ; 0= N),oldpeak,slp,caa,thall,output (0 = less chance of HA ; 1 = more chance of HA)
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0
302,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0


In [24]:
## Rename long column names 
heart_df.rename(columns={'sex (0= F ;1=M) ': 'sex', 
                         'exercise_induced_angina (1 = Y ; 0= N)': 'exercise_induced_angina', 
                         'output (0 = less chance of HA ; 1 = more chance of HA)': 'output', 
                         'resting_blood_pressure (in mm Hg)': 'resting_blood_pressure', 
                         'fasting_blood_sugar > 120 mg/dl': 'is_high_blood_sugar'}, 
                         inplace=True)

heart_df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholesterol,is_high_blood_sugar,resting_ecg_result,max_heart_rate_achieved,exercise_induced_angina,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [25]:
heart_df.dtypes

age                          int64
sex                          int64
chest_pain_type              int64
resting_blood_pressure       int64
cholesterol                  int64
is_high_blood_sugar          int64
resting_ecg_result           int64
max_heart_rate_achieved      int64
exercise_induced_angina      int64
oldpeak                    float64
slp                          int64
caa                          int64
thall                        int64
output                       int64
dtype: object

In [26]:
## Convert columns to categorical 
heart_df_clean = heart_df
heart_df_clean['chest_pain_type'] = heart_df_clean['chest_pain_type'].astype('category')
heart_df_clean['is_high_blood_sugar'] = heart_df_clean['is_high_blood_sugar'].astype('category')
heart_df_clean['resting_ecg_result'] = heart_df_clean['resting_ecg_result'].astype('category')
heart_df_clean['exercise_induced_angina'] = heart_df_clean['exercise_induced_angina'].astype('category')
heart_df_clean['slp'] = heart_df_clean['slp'].astype('category')
heart_df_clean['caa'] = heart_df_clean['caa'].astype('category')
heart_df_clean['thall'] = heart_df_clean['thall'].astype('category')
heart_df_clean['output'] = heart_df_clean['output'].astype('category')

display(heart_df_clean.head())

print(heart_df_clean.dtypes)

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholesterol,is_high_blood_sugar,resting_ecg_result,max_heart_rate_achieved,exercise_induced_angina,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


age                           int64
sex                           int64
chest_pain_type            category
resting_blood_pressure        int64
cholesterol                   int64
is_high_blood_sugar        category
resting_ecg_result         category
max_heart_rate_achieved       int64
exercise_induced_angina    category
oldpeak                     float64
slp                        category
caa                        category
thall                      category
output                     category
dtype: object


In [30]:
# Seperate the features, X,  from the target variable, y
y = heart_df_clean['output']
X = heart_df_clean.drop(columns='output')

In [31]:
# Preview the features data
X.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholesterol,is_high_blood_sugar,resting_ecg_result,max_heart_rate_achieved,exercise_induced_angina,oldpeak,slp,caa,thall
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [32]:
# Preview the first five entries for the target variable
y[:5]

0    1
1    1
2    1
3    1
4    1
Name: output, dtype: category
Categories (2, int64): [0, 1]

## Step 2: Split the data into X and y and then into testing and training sets.

In [33]:
# Split the data into X (features) and y (target)
# The y variable should focus on the target column
y = heart_df_clean['output']

# The X variable should include all features except the target
X = heart_df_clean.drop(columns=['output'])


In [34]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Step 3: Fit a logistic regression classifier.

In [37]:
# Declare a logistic regression model.
# Apply a random_state of 9 to the model
logistic_regression_model = LogisticRegression(random_state=9, max_iter=1000)

# Fit and save the logistic regression model using the training data
lr_model = logistic_regression_model.fit(X_train, y_train)

## Step 4: Create the predicted values for the testing and the training data.

In [38]:
#Generate training predictions
training_predictions = lr_model.predict(X_train)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)


## Step 5: Print a confusion matrix for the training data.

In [39]:
# Import the model for sklearn confusion matrix
from sklearn.metrics import confusion_matrix

# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

[[ 84  22]
 [ 11 110]]


## Step 6: Pring a confusion matrix for the testing data.

In [40]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[23  9]
 [ 5 39]]


## Step 7: Print the training classification report.

In [41]:
# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

           0       0.88      0.79      0.84       106
           1       0.83      0.91      0.87       121

    accuracy                           0.85       227
   macro avg       0.86      0.85      0.85       227
weighted avg       0.86      0.85      0.85       227



## Step 8: Print the testing classification report.

In [42]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.82      0.72      0.77        32
           1       0.81      0.89      0.85        44

    accuracy                           0.82        76
   macro avg       0.82      0.80      0.81        76
weighted avg       0.82      0.82      0.81        76



## Step 9: Hyperparameter Tuning

Use hyperparameter tuning (through 5-fold cross validation and grid search) to find the best combination of hyperparameters that optimize model performance. 

In [56]:
# Import the necessary libraries
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Define the parameter grid to search over
param_grid = {
    'penalty': ['l1', 'l2', 'none'], 
    'C': [0.001, 0.01, 0.1, 1, 10, 100]
}

# Create a logistic regression model
model = LogisticRegression(max_iter=1000)

# Define the cross validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create the grid search object
grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)

# Fit the grid search object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding accuracy score
print('Best hyperparameters:', grid_search.best_params_)
print('Best cross-validation accuracy score:', grid_search.best_score_)


Best hyperparameters: {'C': 0.1, 'penalty': 'l2'}
Best cross-validation accuracy score: 0.8408695652173914


30 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sakib\anaconda3\envs\PythonData\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sakib\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\sakib\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 449, in _check_solver
    % (solver, penalty)
ValueError: Solver lbfgs supports only 'l2' or 'none' penalt

In [57]:
## Fit new model using the "best" hyperparameters found in the above grid search 
best_model = LogisticRegression(C = 0.1, penalty = 'l2', max_iter=1000)

best_model.fit(X_train, y_train)

LogisticRegression(C=0.1, max_iter=1000)

In [59]:
#Generate training predictions using the "best model"
training_predictions = best_model.predict(X_train)

#Generate testing predictions using the "best model"
testing_predictions = best_model.predict(X_test)

In [54]:
# Create and save the testing classification report
training_report = classification_report(y_train, training_predictions)

print(training_report)

              precision    recall  f1-score   support

           0       0.91      0.77      0.84       106
           1       0.82      0.93      0.88       121

    accuracy                           0.86       227
   macro avg       0.87      0.85      0.86       227
weighted avg       0.87      0.86      0.86       227



In [55]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

print(testing_report)

              precision    recall  f1-score   support

           0       0.85      0.72      0.78        32
           1       0.82      0.91      0.86        44

    accuracy                           0.83        76
   macro avg       0.83      0.81      0.82        76
weighted avg       0.83      0.83      0.83        76



The "best" hyperparameters improved test set accuracy by 0.01 (e.g., 1%). Although this increase in prediction accuracy might seem immaterial, the hyperparameter tuning resulted in improved F1 scores for both the `0` class and the `1` class. 