In [1]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, confusion_matrix
import joblib
import mlflow
import mlflow.sklearn
from sqlalchemy import create_engine
import streamlit as st
from fastapi import FastAPI
import docker
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Configure MLflow for DagsHub
os.environ["MLFLOW_TRACKING_USERNAME"] = "HarshithReddy-Audipudi"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "Qwerty@123"
mlflow.set_tracking_uri("https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow")
mlflow.set_experiment("Heart Disease Prediction")

<Experiment: artifact_location='mlflow-artifacts:/77c639e548884420900845f5f83a5b10', creation_time=1734605230090, experiment_id='2', last_update_time=1734605230090, lifecycle_stage='active', name='Heart Disease Prediction', tags={}>

In [5]:
# Load the dataset
data_path = "heart.csv"
df = pd.read_csv(data_path)


In [7]:
# Normalize and Create Database
engine = create_engine('sqlite:///heart_disease.db')
# Split data into normalized tables for 3NF
patients = df[['age', 'sex', 'cp']].drop_duplicates()
heart_stats = df[['age', 'trestbps', 'chol', 'thalach', 'target']]
# Save normalized tables
patients.to_sql('patients', engine, if_exists='replace', index=False)
heart_stats.to_sql('heart_stats', engine, if_exists='replace', index=False)

303

In [9]:
# Fetch Data with SQL Join Example
query = """
SELECT p.age, p.sex, p.cp, hs.trestbps, hs.chol, hs.thalach, hs.target 
FROM patients p
JOIN heart_stats hs ON p.age = hs.age
"""
df = pd.read_sql(query, engine)


In [11]:
# Data Exploration
def explore_data(df):
    print(df.info())
    print(df.describe())
    print(df.isnull().sum())

    # Correlation matrix
    correlation_matrix = df.corr()
    sns.heatmap(correlation_matrix, annot=True, fmt=".2f")
    plt.title("Correlation Matrix")
    plt.show()

    # Distributions
    df.hist(bins=20, figsize=(15, 10))
    plt.tight_layout()
    plt.show()

    # Capped values
    capped_features = [col for col in df.columns if df[col].max() > df[col].quantile(0.99)]
    print("Potential capped features:", capped_features)

In [142]:
# Cleanup tasks
print("Recommended cleanup tasks:")
print("1. Handle missing values.")
print("2. Investigate and address capped features.")
print("3. Standardize numerical features.")
explore_data(df)

Recommended cleanup tasks:
1. Handle missing values.
2. Investigate and address capped features.
3. Standardize numerical features.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1402 entries, 0 to 1401
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       1402 non-null   int64
 1   sex       1402 non-null   int64
 2   cp        1402 non-null   int64
 3   trestbps  1402 non-null   int64
 4   chol      1402 non-null   int64
 5   thalach   1402 non-null   int64
 6   target    1402 non-null   int64
dtypes: int64(7)
memory usage: 76.8 KB
None
               age          sex           cp     trestbps         chol  \
count  1402.000000  1402.000000  1402.000000  1402.000000  1402.000000   
mean     54.611983     0.586305     1.186876   131.679743   247.375178   
std       8.132684     0.492671     1.036465    17.767095    51.294664   
min      29.000000     0.000000     0.000000    94.000000   126.000000   
25%      49.000

  plt.show()


Potential capped features: ['age', 'trestbps', 'chol', 'thalach']


  plt.show()


In [13]:
# Train-Test Split
df['target'] = df['target'].astype('int')  # Ensure target column is integer
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [15]:
# Apply Feature Engineering to Both Train and Test Data
X_train['bmi_age_comb'] = X_train['age'] * X_train['trestbps']
X_test['bmi_age_comb'] = X_test['age'] * X_test['trestbps']


In [17]:
# Ensure active MLflow runs are closed
def close_active_run():
    if mlflow.active_run():
        mlflow.end_run()

In [19]:
def run_pipeline(classifier, experiment_name, cv_folds=3):
    close_active_run()  # Ensure no active runs before starting
    with mlflow.start_run(run_name=experiment_name):
        numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
        numeric_transformer = Pipeline(steps=[
            ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features)
            ])

        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('classifier', classifier)])

        # Cross-validation
        skf = StratifiedKFold(n_splits=cv_folds)
        cv_results = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring='f1')
        print(f"CV Results (mean/std): {cv_results.mean()} / {cv_results.std()}")

        pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_test)
        f1 = f1_score(y_test, y_pred)

        cm = confusion_matrix(y_test, y_pred)
        tn, fp, fn, tp = cm.ravel()

        # Debugging log statements
        print(f"Logging metrics for experiment: {experiment_name}")
        print(f"F1-score: {f1}, TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")

        # Log Parameters
        if hasattr(classifier, 'get_params'):
            params = classifier.get_params()
            mlflow.log_params(params)


        mlflow.log_metric("F1-score", f1)
        mlflow.log_metric("True Positives", tp)
        mlflow.log_metric("True Negatives", tn)
        mlflow.log_metric("False Positives", fp)
        mlflow.log_metric("False Negatives", fn)

        mlflow.sklearn.log_model(pipeline, "model")
    close_active_run()


In [21]:
# Experiment #1: Logistic Regression with Preprocessing
run_pipeline(LogisticRegression(), "Logistic Regression Experiment")

CV Results (mean/std): 0.7284314361287723 / 0.01970018548163284
Logging metrics for experiment: Logistic Regression Experiment
F1-score: 0.7094594594594594, TP: 105, TN: 90, FP: 41, FN: 45




🏃 View run Logistic Regression Experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2/runs/cd1bc1ee06824bd18aef53594ce63b1f
🧪 View experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2


In [25]:
# Experiment #2: Multiple Classifiers
for clf, name in [(RidgeClassifier(), "Ridge Classifier Experiment"),
                  (RandomForestClassifier(), "Random Forest Experiment"),
                  (XGBClassifier(), "XGBoost Experiment")]:
    run_pipeline(clf, name)


CV Results (mean/std): 0.7262400050047231 / 0.01905962474061427
Logging metrics for experiment: Ridge Classifier Experiment
F1-score: 0.7070707070707071, TP: 105, TN: 89, FP: 42, FN: 45




🏃 View run Ridge Classifier Experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2/runs/8e1c31b4380f44d7ae2ae91cae23b7e8
🧪 View experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2
CV Results (mean/std): 0.963323216743785 / 0.0025011108755076516
Logging metrics for experiment: Random Forest Experiment
F1-score: 0.9966777408637874, TP: 150, TN: 130, FP: 1, FN: 0




🏃 View run Random Forest Experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2/runs/abfa3f25154b43e1bace9ebe4e95abbc
🧪 View experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2
CV Results (mean/std): 0.9741671319560766 / 0.0066630483258624815
Logging metrics for experiment: XGBoost Experiment
F1-score: 0.9966555183946488, TP: 149, TN: 131, FP: 0, FN: 1




🏃 View run XGBoost Experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2/runs/8ecf6ff5b9284848a3df0dfe0d85ff75
🧪 View experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2


In [26]:
# Experiment #3: Feature Engineering
run_pipeline(LogisticRegression(), "Feature Engineering Experiment")

CV Results (mean/std): 0.7284314361287723 / 0.01970018548163284
Logging metrics for experiment: Feature Engineering Experiment
F1-score: 0.7094594594594594, TP: 105, TN: 90, FP: 41, FN: 45




🏃 View run Feature Engineering Experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2/runs/889c2a85eff8460fbe63acd04e7b489a
🧪 View experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2


In [28]:
# Experiment #4: Feature Selection
correlation_threshold = 0.8
corr_matrix = X_train.corr()
high_corr_features = [column for column in corr_matrix.columns if any(corr_matrix[column] > correlation_threshold)]
X_train_selected = X_train.drop(high_corr_features, axis=1)
X_test_selected = X_test.drop(high_corr_features, axis=1)
run_pipeline(LogisticRegression(), "Feature Selection Experiment")

CV Results (mean/std): 0.7284314361287723 / 0.01970018548163284
Logging metrics for experiment: Feature Selection Experiment
F1-score: 0.7094594594594594, TP: 105, TN: 90, FP: 41, FN: 45




🏃 View run Feature Selection Experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2/runs/0c5fb70969ab4136b802289c587eb1ab
🧪 View experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2


In [31]:
# Experiment #5: PCA
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(StandardScaler().fit_transform(X_train))
X_test_pca = pca.transform(StandardScaler().fit_transform(X_test))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.title("Scree Plot")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.show()
run_pipeline(LogisticRegression(), "PCA Experiment")

  plt.show()


CV Results (mean/std): 0.7284314361287723 / 0.01970018548163284
Logging metrics for experiment: PCA Experiment
F1-score: 0.7094594594594594, TP: 105, TN: 90, FP: 41, FN: 45




🏃 View run PCA Experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2/runs/c3077f7a90bf4ef88ada74d3ecac7051
🧪 View experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2


In [32]:
# Experiment #6: Custom Experiment (e.g., Hyperparameter Tuning)
from sklearn.model_selection import GridSearchCV
param_grid = {'classifier__C': [0.1, 1, 10]}
search = GridSearchCV(Pipeline(steps=[('preprocessor', StandardScaler()),
                                       ('classifier', LogisticRegression())]),
                      param_grid, cv=3)
search.fit(X_train, y_train)
mlflow.log_params(search.best_params_)
run_pipeline(search.best_estimator_, "Custom Experiment 1")


🏃 View run clean-shad-804 at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2/runs/9d362ad7bc85439998c6a62d3b963caa
🧪 View experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2
CV Results (mean/std): 0.7284314361287723 / 0.01970018548163284
Logging metrics for experiment: Custom Experiment 1
F1-score: 0.7094594594594594, TP: 105, TN: 90, FP: 41, FN: 45




🏃 View run Custom Experiment 1 at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2/runs/d10e4715283d45f68d78d12c3d20089c
🧪 View experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2


In [34]:
# Experiment #7: Another Custom Experiment
run_pipeline(RandomForestClassifier(n_estimators=200, max_depth=10), "Custom Experiment 2")

CV Results (mean/std): 0.9596049431126917 / 0.0011288158409103557
Logging metrics for experiment: Custom Experiment 2
F1-score: 0.9933774834437086, TP: 150, TN: 129, FP: 2, FN: 0




🏃 View run Custom Experiment 2 at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2/runs/6c1ad8d551974ed683b71cc937f0d5b6
🧪 View experiment at: https://dagshub.com/HarshithReddy-Audipudi/HAUDIPUD-P.mlflow/#/experiments/2


In [35]:
# Select and Save Best Model
def select_and_save_best_model():
    experiments = [
        "Logistic Regression Experiment", "Ridge Classifier Experiment", "Random Forest Experiment",
        "XGBoost Experiment", "Feature Engineering Experiment", "Feature Selection Experiment",
        "PCA Experiment", "Custom Experiment 1", "Custom Experiment 2"
    ]
    client = mlflow.tracking.MlflowClient()

    best_f1_score = 0
    best_model_uri = None
    best_experiment = None

    # Fetch the F1-scores for all experiments
    for exp in experiments:
        runs = client.search_runs(
            experiment_ids=[mlflow.get_experiment_by_name("Heart Disease Prediction").experiment_id],
            filter_string=f"tags.mlflow.runName = '{exp}'"
        )

        if runs:
            metrics = runs[0].data.metrics
            f1_score = metrics.get("F1-score", 0)
            if f1_score > best_f1_score:
                best_f1_score = f1_score
                best_model_uri = runs[0].info.artifact_uri + "/model"
                best_experiment = exp

    if best_model_uri:
        print(f"Best Model: {best_experiment} with F1-score: {best_f1_score}")
        # Load the best model and save it locally
        best_model = mlflow.sklearn.load_model(best_model_uri)
        joblib.dump(best_model, 'best_heart_model.pkl')
        print("Best model saved as 'best_heart_model.pkl'")

select_and_save_best_model()


Best Model: Random Forest Experiment with F1-score: 0.9966777408637874


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Best model saved as 'best_heart_model.pkl'


In [36]:
# Generate F1-score comparison plot
def generate_f1_plot():
    experiments = [
        "Logistic Regression Experiment", "Ridge Classifier Experiment", "Random Forest Experiment",
        "XGBoost Experiment", "Feature Engineering Experiment", "Feature Selection Experiment",
        "PCA Experiment", "Custom Experiment 1", "Custom Experiment 2"
    ]
    f1_scores = []
    client = mlflow.tracking.MlflowClient()

    # Fetching run IDs for the given experiment names
    for exp in experiments:
        runs = client.search_runs(
            experiment_ids=[mlflow.get_experiment_by_name("Heart Disease Prediction").experiment_id],
            filter_string=f"tags.mlflow.runName = '{exp}'"
        )

        # If there are runs for the experiment, get the latest one
        if runs:
            metrics = runs[0].data.metrics
            f1_scores.append(metrics.get("F1-score", 0))
        else:
            f1_scores.append(0)  # Default if no run found

    # Plot F1-scores
    plt.figure(figsize=(10, 6))
    plt.barh(experiments, f1_scores, color='skyblue')
    plt.xlabel("F1-score")
    plt.title("F1-score Comparison Across Experiments")
    plt.tight_layout()
    plt.savefig("f1_score_comparison.png")
    print("F1-score comparison plot saved as 'f1_score_comparison.png'")
    plt.show()

generate_f1_plot()

F1-score comparison plot saved as 'f1_score_comparison.png'


  plt.show()


In [37]:
def generate_f1_plot():
    experiments = [
        "Logistic Regression Experiment", "Ridge Classifier Experiment", "Random Forest Experiment",
        "XGBoost Experiment", "Feature Engineering Experiment", "Feature Selection Experiment",
        "PCA Experiment", "Custom Experiment 1", "Custom Experiment 2"
    ]
    f1_scores = []
    client = mlflow.tracking.MlflowClient()

    # Fetching run IDs for the given experiment names
    for exp in experiments:
        print(f"Fetching metrics for experiment: {exp}")
        runs = client.search_runs(
            experiment_ids=[mlflow.get_experiment_by_name("Heart Disease Prediction").experiment_id],
            filter_string=f"tags.mlflow.runName = '{exp}'"
        )

        # If there are runs for the experiment, get the latest one
        if runs:
            metrics = runs[0].data.metrics
            print(f"Metrics for {exp}: {metrics}")
            f1_scores.append(metrics.get("F1-score", 0))
        else:
            print(f"No runs found for {exp}")
            f1_scores.append(0)  # Default if no run found

    # Plot F1-scores
    plt.figure(figsize=(10, 6))
    plt.barh(experiments, f1_scores, color='skyblue')
    plt.xlabel("F1-score")
    plt.title("F1-score Comparison Across Experiments")
    plt.tight_layout()
    plt.savefig("f1_score_comparison_debug.png")
    print("F1-score comparison plot saved as 'f1_score_comparison_debug.png'")
    plt.show()

generate_f1_plot()


Fetching metrics for experiment: Logistic Regression Experiment
Metrics for Logistic Regression Experiment: {'F1-score': 0.7094594594594594, 'True Positives': 105.0, 'False Positives': 41.0, 'True Negatives': 90.0, 'False Negatives': 45.0}
Fetching metrics for experiment: Ridge Classifier Experiment
Metrics for Ridge Classifier Experiment: {'False Negatives': 45.0, 'False Positives': 42.0, 'True Negatives': 89.0, 'True Positives': 105.0, 'F1-score': 0.7070707070707071}
Fetching metrics for experiment: Random Forest Experiment
Metrics for Random Forest Experiment: {'True Negatives': 130.0, 'True Positives': 150.0, 'F1-score': 0.9966777408637874, 'False Positives': 1.0, 'False Negatives': 0.0}
Fetching metrics for experiment: XGBoost Experiment
Metrics for XGBoost Experiment: {'True Negatives': 131.0, 'False Positives': 0.0, 'False Negatives': 1.0, 'True Positives': 149.0, 'F1-score': 0.9966555183946488}
Fetching metrics for experiment: Feature Engineering Experiment
Metrics for Feature 

  plt.show()


2024-12-19 07:54:42.027 
  command:

    streamlit run /opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py [ARGUMENTS]
