In [13]:
# import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error, precision_score, recall_score, f1_score, roc_auc_score

# Read the dataset from heart.csv (path is relative to the working directory)
df = pd.read_csv("heart.csv")

# The label column is 'target'; every remaining column is used as a predictor
target = 'target'
features = list(df.columns)
features.remove(target)

# Feature matrix (X) and label vector (y) reused by the cells that follow
X, y = df[features], df[target]

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a logistic-regression baseline on the training portion
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out rows and report plain accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8688524590163934


In [5]:
# Cross validation
# 5-fold stratified cross-validation of a random forest on the training split.
# The forest is seeded as well as the splitter: with only the splitter seeded,
# the forest's own randomness would make the mean accuracy change on every run.
model = RandomForestClassifier(random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring="accuracy")

print("Cross Validation Accuracy: ", scores.mean())


Cross Validation Accuracy:  0.81828231292517


In [None]:
# Training and Evaluating the model
# accuracy_score is already imported in the notebook's first cell, so it is not
# re-imported here.
# The forest is seeded so the reported test accuracy is reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Test score accuracy: ", accuracy_score(y_test, y_pred))

In [8]:
# Classification Accuracy of the algorithms

# initialize models
# Display name -> unfitted classifier, in the order the results are reported.
# Estimators that accept a seed all use random_state=42; SVC is created with
# probability=True so that it exposes predict_proba.
# NOTE(review): no feature scaling is applied before KNN / SVM in this
# notebook — confirm that is intended.
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=10000, random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC(probability=True, random_state=42)),
])

# Function to evaluate models
def evaluate_models(X_train, X_test, y_train, y_test, models):
    """Fit every model on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Features and labels passed to each model's ``fit``.
    X_test, y_test
        Features passed to each model's ``predict`` and the labels the
        predictions are compared against.
    models
        Mapping of display name to an estimator exposing ``fit`` and
        ``predict``. Each estimator is fitted in place.

    Returns
    -------
    dict
        Maps each model name to its ``accuracy_score`` on the test data.
    """
    accuracy_by_model = {}
    for label, estimator in models.items():
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        accuracy_by_model[label] = accuracy_score(y_test, predictions)
    return accuracy_by_model


# TRAINING AND TESTING SPLIT
_Purpose_
The primary purpose of splitting the dataset into training and testing sets is to evaluate the model's performance on unseen data. This helps in understanding how well the model generalizes to new data.

_How it works_
1. Training Set: A portion of the dataset used to train the model. The model learns the underlying patterns and relationships from this data.

2. Testing Set: A separate portion of the dataset used to evaluate the model's performance. This data is not seen by the model during training, providing an unbiased assessment of its generalization capability.

_Typical split ratio_
A common split ratio is 70% for training and 30% for testing. Other popular ratios include 80/20 and 75/25.
The choice of ratio can depend on the size of the dataset. For very large datasets, even a small percentage can provide a substantial test set.

In [9]:
# 1. Training and Testing Split
# 70/30 hold-out split with a fixed seed, then one test accuracy per model
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
train_test_results = evaluate_models(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    models=models,
)
print("Training and Testing Split Results:", train_test_results)

Training and Testing Split Results: {'Logistic Regression': 0.8131868131868132, 'Decision Tree': 0.7362637362637363, 'Random Forest': 0.8241758241758241, 'KNN': 0.6593406593406593, 'SVM': 0.7032967032967034}


# CROSS VALIDATION
_Purpose_:
Cross-validation is used to assess the generalizability of a model. Instead of splitting the dataset into a single training set and a single test set, cross-validation divides the dataset into multiple folds, trains the model on some folds, and tests it on the remaining fold. This process is repeated several times, and the results are averaged to provide a more reliable estimate of model performance.

_How it works_
1. The dataset is divided into k subsets (folds).
2. The model is trained on k-1 folds and tested on the remaining fold.
3. This process is repeated k times, with each fold used exactly once as the test set.
4. The performance metrics (e.g., accuracy, precision, recall) are averaged over all k runs to get a final evaluation score.


In [10]:
# 2. Cross-Validation
# Mean 5-fold score per model on the full dataset (the default score for
# these classifiers is accuracy).
# Note: with an integer cv and a classifier, cross_val_score splits with
# unshuffled StratifiedKFold, so these figures coincide with the manual
# StratifiedKFold(n_splits=5) loop in the stratified section.
cross_val_results = {
    name: cross_val_score(model, X, y, cv=5).mean()
    for name, model in models.items()
}
print("Cross-Validation Results:", cross_val_results)

Cross-Validation Results: {'Logistic Regression': 0.8282513661202187, 'Decision Tree': 0.755464480874317, 'Random Forest': 0.8381967213114756, 'KNN': 0.643879781420765, 'SVM': 0.6434972677595628}


# STRATIFIED SPLITTING / STRATIFIED CROSS VALIDATION
_Purpose_:
Stratified splitting ensures that each fold of the dataset maintains the same proportion of classes as the original dataset. This is particularly important for classification problems where the class distribution is imbalanced.

_How it works_
The dataset is divided into k folds in such a way that each fold has approximately the same percentage of samples of each target class as the original dataset.
The model is trained and validated using these stratified folds.

Note: An imbalanced dataset is one where the distribution of classes is not uniform; in other words, one class significantly outnumbers the other class(es).
      e.g. medical diagnosis: in medical datasets, the number of patients with a rare disease (positive class) is often much smaller than the number of healthy patients (negative class).

In [11]:
# 3. Stratified Splitting
# Manual 5-fold stratified cross-validation: every model is refitted on each
# training fold and scored on the matching held-out fold; the per-fold
# accuracies are then averaged.
skf = StratifiedKFold(n_splits=5)
stratified_results = {}
for name, model in models.items():
    accuracies = []
    for train_index, test_index in skf.split(X, y):
        # Fold-local names on purpose: reusing X_train / X_test / y_train /
        # y_test / y_pred here would silently replace the notebook-level
        # hold-out split and predictions with those of the last fold.
        X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
        y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_fold_train, y_fold_train)
        fold_pred = model.predict(X_fold_test)
        accuracies.append(accuracy_score(y_fold_test, fold_pred))
    stratified_results[name] = np.mean(accuracies)
print("Stratified Cross-Validation Results:", stratified_results)

Stratified Cross-Validation Results: {'Logistic Regression': 0.8282513661202187, 'Decision Tree': 0.755464480874317, 'Random Forest': 0.8381967213114756, 'KNN': 0.643879781420765, 'SVM': 0.6434972677595628}


In [15]:
# Evaluate the metrics
# Metric display name -> scikit-learn scoring function
metrics = dict(zip(
    ('Accuracy', 'Precision', 'Recall', 'F1_score', 'ROC AUC'),
    (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score),
))

In [17]:
"""_summary_
steps:
1.Train the Models: Fit each model to the training data.
2.Predict on Test Data: Use the trained model to predict the target variable for the test data.
3.Calculate Metrics: Calculate the performance metrics using the true and predicted values of the target variable.
"""

# Split the data into training and testing sets
# X and y are rebuilt from df so this cell does not depend on what earlier
# cells left in those names.
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Dictionary to store the results: model name -> {metric name -> value}
results = {}

for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)
    # ROC AUC is computed from a score rather than hard labels; column 1 of
    # predict_proba is used as that score. Models without predict_proba get None.
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    # Calculate metrics
    model_results = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1_score': f1_score(y_test, y_pred),
        # NaN instead of the string 'N/A': every value stays numeric, so the
        # f"{value:.4f}" formatting in the reporting cell cannot raise ValueError.
        'ROC AUC': roc_auc_score(y_test, y_prob) if y_prob is not None else float('nan')
    }

    # Store the results
    results[name] = model_results


In [18]:
# Print every metric of every model; numeric values are shown with four decimals.
for model_name, model_results in results.items():
    print(f"Results for {model_name}:")
    for metric_name, value in model_results.items():
        # A non-numeric placeholder (e.g. the 'N/A' string used when ROC AUC
        # cannot be computed) raises ValueError under the '.4f' format spec,
        # so only real numbers are formatted; anything else is printed as-is.
        if isinstance(value, (int, float, np.floating)):
            print(f"  {metric_name}: {value:.4f}")
        else:
            print(f"  {metric_name}: {value}")
    print()

Results for Logistic Regression:
  Accuracy: 0.8132
  Precision: 0.8235
  Recall: 0.8400
  F1_score: 0.8317
  ROC AUC: 0.8820

Results for Decision Tree:
  Accuracy: 0.7363
  Precision: 0.7955
  Recall: 0.7000
  F1_score: 0.7447
  ROC AUC: 0.7402

Results for Random Forest:
  Accuracy: 0.8242
  Precision: 0.8400
  Recall: 0.8400
  F1_score: 0.8400
  ROC AUC: 0.9095

Results for KNN:
  Accuracy: 0.6593
  Precision: 0.6792
  Recall: 0.7200
  F1_score: 0.6990
  ROC AUC: 0.7132

Results for SVM:
  Accuracy: 0.7033
  Precision: 0.6716
  Recall: 0.9000
  F1_score: 0.7692
  ROC AUC: 0.7946



In [None]:
# Actionable insight of the models
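# LR
# Logistic Regression is a close second overall on the 70/30 split:
# Accuracy (0.81), Precision (0.82), Recall (0.84), F1 score (0.83), ROC AUC (0.88)

# Balance of model performance:
#
# KNN

# SVM

# DT

# RF

# Conclusion
# Random Forest is the best performing model, with the highest accuracy (0.82), precision (0.84),
# F1 score (0.84) and ROC AUC (0.91); SVM has the highest recall (0.90) but low precision (0.67).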

#

# Recommendation
# Models considered for deployment

# Further tuning of the models

# Ensemble methods, enhancing performance and robustness