In [None]:
# Introduction to MLOps with scikit-learn Pipelines (Heart Disease dataset)
# Explores how scikit-learn pipelines can be used in MLOps workflows.
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from joblib import dump, load

In [None]:
# Step 1: Load the Heart Disease dataset
# Original source: https://archive.ics.uci.edu/ml/datasets/heart+Disease
# Alternative CSV mirror (not used here): https://raw.githubusercontent.com/datasciencedojo/datasets/master/HeartDisease.csv
# This notebook reads the Kaggle copy: https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset
# The path assumes the CSV sits under /content (presumably a Colab runtime -- adjust when running elsewhere).
heart_csv_path = r'/content/heart.csv'
data = pd.read_csv(filepath_or_buffer=heart_csv_path)
# Columns noted by the original author: Age, Sex, ChestPainType, RestingBP, Cholesterol, FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak, ST_Slope, etc
# NOTE(review): later cells rely on a 'target' column -- confirm these names against the actual CSV header.

In [None]:
# Step 2: Data preparation
# Discard every row that contains at least one missing value (no-op when the data is complete).
data = data.dropna(axis=0, how='any')

In [None]:
# Separate the label column from the predictor columns.
# 'target' is the heart-disease indicator (0 or 1); every other column is treated as a feature.
y = data['target']
X = data.drop(labels=['target'], axis=1)

In [None]:
# Step 3: Hold out 20% of the rows for testing.
# The fixed random_state keeps the shuffle (and therefore the split) reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Step 4: Assemble the pipeline: feature standardization followed by the classifier.
# Bundling both steps means the scaler is fitted and applied together with the model.
pipeline_steps = [
    ('scaler', StandardScaler()),      # standardize the features
    ('model', LogisticRegression()),   # logistic regression for binary classification
]
pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Step 5: Fit every pipeline step (scaler, then model) on the training split
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Step 6: Score the fitted pipeline on the held-out test split
y_pred = pipeline.predict(X_test)

# Compute both summaries first, then print them.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report_text = classification_report(y_true=y_test, y_pred=y_pred)

print(f"Accuracy on Test Data: {accuracy:.2f}")
print("\nClassification Report:")
print(report_text)

Accuracy on Test Data: 0.80

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.72      0.78       102
           1       0.76      0.87      0.81       103

    accuracy                           0.80       205
   macro avg       0.80      0.79      0.79       205
weighted avg       0.80      0.80      0.79       205



In [None]:
# Step 7: Persist the fitted pipeline to disk (the deployment artifact)
model_path = 'heart_disease_pipeline.joblib'
dump(value=pipeline, filename=model_path)
print(f"Pipeline saved to {model_path}")

Pipeline saved to heart_disease_pipeline.joblib


In [None]:
# Step 8: Reload the saved pipeline and confirm it can produce predictions
loaded_pipeline = load(filename=model_path)
first_test_rows = X_test.iloc[:5]  # first 5 samples in the test set
sample_predictions = loaded_pipeline.predict(first_test_rows)
print(f"Sample Predictions: {sample_predictions}")

Sample Predictions: [1 1 0 1 0]


In [None]:
# Step 8 (continued): reload the pipeline again and predict on a second batch
loaded_pipeline = load(filename=model_path)
next_test_rows = X_test.iloc[5:10]  # the next 5 samples in the test set
sample_predictions = loaded_pipeline.predict(next_test_rows)
print(f"Sample Predictions: {sample_predictions}")

Sample Predictions: [1 0 0 1 0]


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [None]:
# Predict labels for the whole test split using the reloaded pipeline
y_pred = loaded_pipeline.predict(X=X_test)

In [None]:
# Summarize test-set performance of the reloaded pipeline's predictions
conf_matrix = confusion_matrix(y_test, y_pred)
# Each scorer takes (true labels, predicted labels); unpack the results in the same order as the scorers.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report the scalar metrics (four decimal places) followed by the confusion matrix
metric_lines = [
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    "Confusion Matrix:",
]
# One print call with a newline separator emits the same lines as separate print statements.
print(*metric_lines, conf_matrix, sep="\n")

Accuracy: 0.7951
Precision: 0.7563
Recall: 0.8738
F1 Score: 0.8108
Confusion Matrix:
[[73 29]
 [13 90]]
