In [1]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

## Read In Data

In [2]:
# Read in the data
heart_df = pd.read_csv(
    Path("Resources/heart.csv")
)

# Review the DataFrame
display(heart_df.head())
display(heart_df.tail())

Unnamed: 0,age,sex (0= F ;1=M),chest_pain_type,resting_blood_pressure (in mm Hg),cholesterol,fasting_blood_sugar > 120 mg/dl,resting_ecg_result,max_heart_rate_achieved,exercise_induced_angina (1 = Y ; 0= N),oldpeak,slp,caa,thall,output (0 = less chance of HA ; 1 = more chance of HA)
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Unnamed: 0,age,sex (0= F ;1=M),chest_pain_type,resting_blood_pressure (in mm Hg),cholesterol,fasting_blood_sugar > 120 mg/dl,resting_ecg_result,max_heart_rate_achieved,exercise_induced_angina (1 = Y ; 0= N),oldpeak,slp,caa,thall,output (0 = less chance of HA ; 1 = more chance of HA)
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0
302,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0


In [3]:
## Rename long column names 
heart_df.rename(columns={'sex (0= F ;1=M) ': 'sex', 
                         'exercise_induced_angina (1 = Y ; 0= N)': 'exercise_induced_angina', 
                         'output (0 = less chance of HA ; 1 = more chance of HA)': 'output', 
                         'resting_blood_pressure (in mm Hg)': 'resting_blood_pressure', 
                         'fasting_blood_sugar > 120 mg/dl': 'is_high_blood_sugar'}, 
                         inplace=True)

heart_df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholesterol,is_high_blood_sugar,resting_ecg_result,max_heart_rate_achieved,exercise_induced_angina,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
heart_df.dtypes

age                          int64
sex                          int64
chest_pain_type              int64
resting_blood_pressure       int64
cholesterol                  int64
is_high_blood_sugar          int64
resting_ecg_result           int64
max_heart_rate_achieved      int64
exercise_induced_angina      int64
oldpeak                    float64
slp                          int64
caa                          int64
thall                        int64
output                       int64
dtype: object

In [5]:
## Convert columns to categorical 
heart_df_clean = heart_df
heart_df_clean['chest_pain_type'] = heart_df_clean['chest_pain_type'].astype('category')
heart_df_clean['is_high_blood_sugar'] = heart_df_clean['is_high_blood_sugar'].astype('category')
heart_df_clean['resting_ecg_result'] = heart_df_clean['resting_ecg_result'].astype('category')
heart_df_clean['exercise_induced_angina'] = heart_df_clean['exercise_induced_angina'].astype('category')
heart_df_clean['slp'] = heart_df_clean['slp'].astype('category')
heart_df_clean['caa'] = heart_df_clean['caa'].astype('category')
heart_df_clean['thall'] = heart_df_clean['thall'].astype('category')
heart_df_clean['output'] = heart_df_clean['output'].astype('category')

display(heart_df_clean.head())

print(heart_df_clean.dtypes)

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholesterol,is_high_blood_sugar,resting_ecg_result,max_heart_rate_achieved,exercise_induced_angina,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


age                           int64
sex                           int64
chest_pain_type            category
resting_blood_pressure        int64
cholesterol                   int64
is_high_blood_sugar        category
resting_ecg_result         category
max_heart_rate_achieved       int64
exercise_induced_angina    category
oldpeak                     float64
slp                        category
caa                        category
thall                      category
output                     category
dtype: object


## Split Data Into Training/Testing

In [6]:
# Split the data into X (features) and y (target)
y = heart_df_clean['output']
X = heart_df_clean.drop(columns='output')

In [7]:
# Preview the features data
X.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholesterol,is_high_blood_sugar,resting_ecg_result,max_heart_rate_achieved,exercise_induced_angina,oldpeak,slp,caa,thall
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [8]:
# Preview the first five entries for the target variable
y[:5]

0    1
1    1
2    1
3    1
4    1
Name: output, dtype: category
Categories (2, int64): [0, 1]

In [9]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Apply Feature Selection

In [10]:
# SelectKBest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectKBest, chi2
# Create an instance of SelectKBest
selector = SelectKBest(chi2, k=5)

In [11]:
# Fit the selector to the training data
X_train_kbest = selector.fit_transform(X_train, y_train)

In [12]:
# Transform the testing data
X_test_kbest = selector.transform(X_test)

In [13]:
# Recursive feature elimination
# Create an instance of the logistic regression model
logistic_regression_model = LogisticRegression(random_state=42)

In [14]:
# Create an instance of RFE
from sklearn.feature_selection import RFE
rfe = RFE(logistic_regression_model, n_features_to_select=5)

In [15]:
# Fit RFE to the training data
X_train_rfe = rfe.fit_transform(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [16]:
# Transform the testing data
X_test_rfe = rfe.transform(X_test)

## Apply regularization

In [17]:
# L1 penalty
logistic_regression_model_l1 = LogisticRegression(penalty='l1', solver='liblinear')

In [18]:
# Fit the L1 model to the training data
l1_model = logistic_regression_model_l1.fit(X_train, y_train)


In [19]:
# L2 penalty
logistic_regression_model_l2 = LogisticRegression(penalty='l2')

In [20]:
# Fit the L2 model to the training data
l2_model = logistic_regression_model_l2.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# Fit and save the logistic regression model using the training data
logistic_regression_model = LogisticRegression(random_state=42)
lr_model = logistic_regression_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
# Generate training predictions
training_predictions = lr_model.predict(X_train)

In [23]:
# Generate testing predictions
testing_predictions = lr_model.predict(X_test)


In [24]:
# Create and print the confusion matrix for the training data
from sklearn.metrics import confusion_matrix
training_matrix = confusion_matrix(y_train, training_predictions)
print(training_matrix)

[[ 81  22]
 [ 10 114]]


In [25]:
# Create and print the confusion matrix for the testing data
testing_matrix = confusion_matrix(y_test, testing_predictions)
print(testing_matrix)

[[29  6]
 [ 3 38]]


In [26]:
# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)
print(training_report)

              precision    recall  f1-score   support

           0       0.89      0.79      0.84       103
           1       0.84      0.92      0.88       124

    accuracy                           0.86       227
   macro avg       0.86      0.85      0.86       227
weighted avg       0.86      0.86      0.86       227



In [27]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)
print(testing_report)

              precision    recall  f1-score   support

           0       0.91      0.83      0.87        35
           1       0.86      0.93      0.89        41

    accuracy                           0.88        76
   macro avg       0.88      0.88      0.88        76
weighted avg       0.88      0.88      0.88        76



## Step 9: Hyperparameter Tuning

In [28]:
# Import the necessary libraries
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Define the parameter grid to search over
param_grid = {
    'penalty': ['l1', 'l2', 'none'], 
    'C': [0.001, 0.01, 0.1, 1, 10, 100]
}

# Create a logistic regression model
model = LogisticRegression(max_iter=1000)

# Define the cross validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create the grid search object
grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)

# Fit the grid search object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding accuracy score
print('Best hyperparameters:', grid_search.best_params_)
print('Best cross-validation accuracy score:', grid_search.best_score_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best hyperparameters: {'C': 0.1, 'penalty': 'l2'}
Best cross-validation accuracy score: 0.8150724637681158


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
30 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.9/site

In [29]:
## Fit new model using the "best" hyperparameters found in the above grid search 
best_model = LogisticRegression(C = 0.1, penalty = 'l2', max_iter=1000)

best_model.fit(X_train, y_train)

LogisticRegression(C=0.1, max_iter=1000)

In [30]:
#Generate training predictions using the "best model"
training_predictions = best_model.predict(X_train)

#Generate testing predictions using the "best model"
testing_predictions = best_model.predict(X_test)


In [31]:
# Create and save the testing classification report
training_report = classification_report(y_train, training_predictions)

print(training_report)

              precision    recall  f1-score   support

           0       0.90      0.74      0.81       103
           1       0.81      0.94      0.87       124

    accuracy                           0.85       227
   macro avg       0.86      0.84      0.84       227
weighted avg       0.85      0.85      0.84       227



In [32]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

print(testing_report)

              precision    recall  f1-score   support

           0       0.91      0.86      0.88        35
           1       0.88      0.93      0.90        41

    accuracy                           0.89        76
   macro avg       0.90      0.89      0.89        76
weighted avg       0.90      0.89      0.89        76

