In [5]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [7]:

# Load the dataset
dataset = pd.read_csv('heart-dataset.csv')

In [9]:
dataset

Unnamed: 0,age,sex,chest_pain_type,resting_bp,serum_cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_induced_angina,oldpeak,slope,num_major_vessels,thal,smoking,diabetes,obesity,family_history,target
0,58,1,4,150,247,0,0,148,0,2.127886,1,0,1,0,0,0,1,0
1,52,1,3,123,266,0,1,149,0,0.274316,1,0,0,0,0,0,0,0
2,59,1,3,116,153,0,0,149,0,0.589638,1,0,0,0,0,0,0,0
3,67,1,2,125,274,0,1,178,0,0.000000,2,0,2,0,0,0,0,1
4,51,1,2,130,169,0,0,125,1,0.000000,2,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3547,67,1,3,131,223,1,0,132,0,2.522862,2,0,0,1,0,0,0,0
3548,43,0,2,142,236,0,1,172,0,3.169873,2,0,2,0,0,0,0,0
3549,53,1,3,146,226,0,2,138,1,0.660851,1,2,0,0,1,0,0,1
3550,49,1,4,144,324,1,0,146,0,2.042293,3,0,0,0,0,0,0,1


In [11]:
# Handling missing values (imputing with the mean for numerical values)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
dataset.iloc[:, :] = imputer.fit_transform(dataset)

In [13]:
# Removing duplicate rows
dataset.drop_duplicates(inplace=True)

In [15]:
dataset

Unnamed: 0,age,sex,chest_pain_type,resting_bp,serum_cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_induced_angina,oldpeak,slope,num_major_vessels,thal,smoking,diabetes,obesity,family_history,target
0,58,1,4,150,247,0,0,148,0,2.127886,1,0,1,0,0,0,1,0
1,52,1,3,123,266,0,1,149,0,0.274316,1,0,0,0,0,0,0,0
2,59,1,3,116,153,0,0,149,0,0.589638,1,0,0,0,0,0,0,0
3,67,1,2,125,274,0,1,178,0,0.000000,2,0,2,0,0,0,0,1
4,51,1,2,130,169,0,0,125,1,0.000000,2,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3547,67,1,3,131,223,1,0,132,0,2.522862,2,0,0,1,0,0,0,0
3548,43,0,2,142,236,0,1,172,0,3.169873,2,0,2,0,0,0,0,0
3549,53,1,3,146,226,0,2,138,1,0.660851,1,2,0,0,1,0,0,1
3550,49,1,4,144,324,1,0,146,0,2.042293,3,0,0,0,0,0,0,1


In [17]:
# Splitting the features and target variable
X = dataset.drop('target', axis=1)
y = dataset['target']

In [19]:
# Encoding categorical data (if necessary)
# Let's assume 'sex', 'chest_pain_type', and 'thal' are categorical
label_encoder = LabelEncoder()
X['sex'] = label_encoder.fit_transform(X['sex'])
X['chest_pain_type'] = label_encoder.fit_transform(X['chest_pain_type'])
X['thal'] = label_encoder.fit_transform(X['thal'])

In [21]:
# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [23]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [25]:
# Initialize a dictionary to store model accuracies and confusion matrices
results_dict = {}

In [27]:
# Logistic Regression
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
logistic_accuracy = accuracy_score(y_test, y_pred_logistic)
results_dict['Logistic Regression'] = {
    'accuracy': logistic_accuracy,
    'conf_matrix': confusion_matrix(y_test, y_pred_logistic),
    'classification_report': classification_report(y_test, y_pred_logistic)
}

In [29]:
# Support Vector Classifier (SVC)
svc_model = SVC(kernel='linear', random_state=42)
svc_model.fit(X_train, y_train)
y_pred_svc = svc_model.predict(X_test)
svc_accuracy = accuracy_score(y_test, y_pred_svc)
results_dict['SVC'] = {
    'accuracy': svc_accuracy,
    'conf_matrix': confusion_matrix(y_test, y_pred_svc),
    'classification_report': classification_report(y_test, y_pred_svc)
}

In [30]:
# Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
results_dict['Decision Tree'] = {
    'accuracy': dt_accuracy,
    'conf_matrix': confusion_matrix(y_test, y_pred_dt),
    'classification_report': classification_report(y_test, y_pred_dt)
}

In [33]:
# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
results_dict['Random Forest'] = {
    'accuracy': rf_accuracy,
    'conf_matrix': confusion_matrix(y_test, y_pred_rf),
    'classification_report': classification_report(y_test, y_pred_rf)
}

In [34]:
# Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
results_dict['Gradient Boosting'] = {
    'accuracy': gb_accuracy,
    'conf_matrix': confusion_matrix(y_test, y_pred_gb),
    'classification_report': classification_report(y_test, y_pred_gb)
}

In [36]:
# Display results
print("\nBaseline Accuracy of all algorithms:")
for model, result in results_dict.items():
    print(f"{model}: {result['accuracy'] * 100:.2f}%")
    print(f"Confusion Matrix for {model}:\n{result['conf_matrix']}")
    print(f"Classification Report for {model}:\n{result['classification_report']}")


Baseline Accuracy of all algorithms:
Logistic Regression: 80.45%
Confusion Matrix for Logistic Regression:
[[391  47]
 [ 92 181]]
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       438
           1       0.79      0.66      0.72       273

    accuracy                           0.80       711
   macro avg       0.80      0.78      0.79       711
weighted avg       0.80      0.80      0.80       711

SVC: 80.73%
Confusion Matrix for SVC:
[[395  43]
 [ 94 179]]
Classification Report for SVC:
              precision    recall  f1-score   support

           0       0.81      0.90      0.85       438
           1       0.81      0.66      0.72       273

    accuracy                           0.81       711
   macro avg       0.81      0.78      0.79       711
weighted avg       0.81      0.81      0.80       711

Decision Tree: 63.99%
Confusion Matrix for Decision Tree:
[[313 125]
 [131 1

In [39]:
# Hyperparameter Tuning to Improve Accuracy:

from sklearn.model_selection import GridSearchCV

In [41]:
# Logistic Regression - Hyperparameter tuning
param_grid_lr = {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}
grid_search_lr = GridSearchCV(LogisticRegression(random_state=42), param_grid_lr, cv=5, scoring='accuracy')
grid_search_lr.fit(X_train, y_train)
best_lr_model = grid_search_lr.best_estimator_

In [43]:
# SVC - Hyperparameter tuning
param_grid_svc = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
grid_search_svc = GridSearchCV(SVC(random_state=42), param_grid_svc, cv=5, scoring='accuracy')
grid_search_svc.fit(X_train, y_train)
best_svc_model = grid_search_svc.best_estimator_

In [44]:
# Decision Tree - Hyperparameter tuning
param_grid_dt = {'max_depth': [3, 5, 10], 'min_samples_split': [2, 5, 10], 'criterion': ['gini', 'entropy']}
grid_search_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, cv=5, scoring='accuracy')
grid_search_dt.fit(X_train, y_train)
best_dt_model = grid_search_dt.best_estimator_

In [45]:
# Random Forest - Hyperparameter tuning
param_grid_rf = {'n_estimators': [100, 200], 'max_depth': [5, 10], 'min_samples_split': [2, 5, 10]}
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)
best_rf_model = grid_search_rf.best_estimator_

In [46]:
# Gradient Boosting - Hyperparameter tuning
param_grid_gb = {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.5], 'max_depth': [3, 5, 10]}
grid_search_gb = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid_gb, cv=5, scoring='accuracy')
grid_search_gb.fit(X_train, y_train)
best_gb_model = grid_search_gb.best_estimator_

In [47]:
# Assuming the tuned models are already obtained using GridSearchCV
best_lr_model = grid_search_lr.best_estimator_
best_svc_model = grid_search_svc.best_estimator_
best_dt_model = grid_search_dt.best_estimator_
best_rf_model = grid_search_rf.best_estimator_
best_gb_model = grid_search_gb.best_estimator_

In [48]:
# Create a dictionary of tuned models
models = {
    'Logistic Regression': best_lr_model,
    'SVC': best_svc_model,
    'Decision Tree': best_dt_model,
    'Random Forest': best_rf_model,
    'Gradient Boosting': best_gb_model
}


In [49]:
# Now evaluate the tuned models and store results
tuned_results_dict = {}
for model_name, model in models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    tuned_results_dict[model_name] = {
        'accuracy': accuracy,
        'conf_matrix': conf_matrix,
        'classification_report': report
    }

In [50]:
# You can now display the results
print("\nAccuracy after Hyperparameter Tuning:")
for model, result in tuned_results_dict.items():
    print(f"{model}: {result['accuracy'] * 100:.2f}%")
    print(f"Confusion Matrix for {model}:\n{result['conf_matrix']}")
    print(f"Classification Report for {model}:\n{result['classification_report']}")



Accuracy after Hyperparameter Tuning:
Logistic Regression: 80.73%
Confusion Matrix for Logistic Regression:
[[392  46]
 [ 91 182]]
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       438
           1       0.80      0.67      0.73       273

    accuracy                           0.81       711
   macro avg       0.80      0.78      0.79       711
weighted avg       0.81      0.81      0.80       711

SVC: 80.73%
Confusion Matrix for SVC:
[[396  42]
 [ 95 178]]
Classification Report for SVC:
              precision    recall  f1-score   support

           0       0.81      0.90      0.85       438
           1       0.81      0.65      0.72       273

    accuracy                           0.81       711
   macro avg       0.81      0.78      0.79       711
weighted avg       0.81      0.81      0.80       711

Decision Tree: 66.81%
Confusion Matrix for Decision Tree:
[[350  88]
 [148 

In [51]:
# Create a DataFrame for metrics before hyperparameter tuning
import pandas as pd

metrics_before_tuning = []

for model, result in results_dict.items():
    conf_matrix = result['conf_matrix']
    TP = conf_matrix[1, 1]
    TN = conf_matrix[0, 0]
    FP = conf_matrix[0, 1]
    FN = conf_matrix[1, 0]
    metrics_before_tuning.append({
        'Model': model,
        'Accuracy': f"{result['accuracy'] * 100:.2f}%",
        'True Positives (TP)': TP,
        'True Negatives (TN)': TN,
        'False Positives (FP)': FP,
        'False Negatives (FN)': FN
    })

df_before_tuning = pd.DataFrame(metrics_before_tuning)
print("\nMetrics Before Hyperparameter Tuning:")
print(df_before_tuning)



Metrics Before Hyperparameter Tuning:
                 Model Accuracy  True Positives (TP)  True Negatives (TN)  \
0  Logistic Regression   80.45%                  181                  391   
1                  SVC   80.73%                  179                  395   
2        Decision Tree   63.99%                  142                  313   
3        Random Forest   72.71%                  134                  383   
4    Gradient Boosting   74.68%                  149                  382   

   False Positives (FP)  False Negatives (FN)  
0                    47                    92  
1                    43                    94  
2                   125                   131  
3                    55                   139  
4                    56                   124  


In [52]:
# Create a DataFrame for metrics after hyperparameter tuning
metrics_after_tuning = []

for model, result in tuned_results_dict.items():
    conf_matrix = result['conf_matrix']
    TP = conf_matrix[1, 1]
    TN = conf_matrix[0, 0]
    FP = conf_matrix[0, 1]
    FN = conf_matrix[1, 0]
    metrics_after_tuning.append({
        'Model': model,
        'Accuracy': f"{result['accuracy'] * 100:.2f}%",
        'True Positives (TP)': TP,
        'True Negatives (TN)': TN,
        'False Positives (FP)': FP,
        'False Negatives (FN)': FN
    })

df_after_tuning = pd.DataFrame(metrics_after_tuning)
print("\nMetrics After Hyperparameter Tuning:")
print(df_after_tuning)



Metrics After Hyperparameter Tuning:
                 Model Accuracy  True Positives (TP)  True Negatives (TN)  \
0  Logistic Regression   80.73%                  182                  392   
1                  SVC   80.73%                  178                  396   
2        Decision Tree   66.81%                  125                  350   
3        Random Forest   73.00%                  135                  384   
4    Gradient Boosting   74.68%                  149                  382   

   False Positives (FP)  False Negatives (FN)  
0                    46                    91  
1                    42                    95  
2                    88                   148  
3                    54                   138  
4                    56                   124  
