In [7]:
# Step 1: Create a normalized database (3NF)
import sqlite3
import pandas as pd

# Define database connection
db_conn = sqlite3.connect('health.db')

# SQLite parses FOREIGN KEY clauses but does not enforce them unless the
# pragma below is switched on for the connection. The pragma is a no-op while
# a transaction is open, so it is issued before any other statement.
db_conn.execute('PRAGMA foreign_keys = ON')

# Create Patients table: one row per patient, keyed by an auto-generated id
db_conn.execute('''CREATE TABLE IF NOT EXISTS Patients (
    patient_id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT,
    age INTEGER,
    sex TEXT
)''')

# Create TestResults table: the raw test features recorded for a patient/disease
db_conn.execute('''CREATE TABLE IF NOT EXISTS TestResults (
    result_id INTEGER PRIMARY KEY AUTOINCREMENT,
    patient_id INTEGER,
    disease TEXT,
    test_features TEXT,
    FOREIGN KEY (patient_id) REFERENCES Patients(patient_id)
)''')

# Create DiseaseStatus table: the label (target_status) for a patient/disease
db_conn.execute('''CREATE TABLE IF NOT EXISTS DiseaseStatus (
    status_id INTEGER PRIMARY KEY AUTOINCREMENT,
    patient_id INTEGER,
    disease TEXT,
    target_status INTEGER,
    FOREIGN KEY (patient_id) REFERENCES Patients(patient_id)
)''')

db_conn.commit()

In [9]:
# Load datasets
diabetes_df = pd.read_csv('diabetes.csv')
heart_df = pd.read_csv('heart.csv')
parkinsons_df = pd.read_csv('parkinsons.csv')

# Insert data into tables (example for diabetes dataset).
# The whole load runs as one transaction: `with db_conn` commits when the loop
# finishes and rolls back if any insert raises, so a failure cannot leave a
# half-loaded database behind. The connection is closed either way.
try:
    with db_conn:
        for _, row in diabetes_df.iterrows():
            # The CSV carries no name or sex, so placeholders are stored.
            db_conn.execute("INSERT INTO Patients (name, age, sex) VALUES (?, ?, ?)",
                           ("Unknown", row['Age'], "Unknown"))
            # Id SQLite just generated for the Patients row inserted above.
            patient_id = db_conn.execute("SELECT last_insert_rowid()").fetchone()[0]
            # NOTE(review): the stored dict holds every CSV column, including
            # 'Age' and 'Outcome' -- confirm that is intended before parsing
            # this text back into model features.
            db_conn.execute("INSERT INTO TestResults (patient_id, disease, test_features) VALUES (?, ?, ?)",
                           (patient_id, "Diabetes", str(row.to_dict())))
            db_conn.execute("INSERT INTO DiseaseStatus (patient_id, disease, target_status) VALUES (?, ?, ?)",
                           (patient_id, "Diabetes", row['Outcome']))
finally:
    db_conn.close()

In [11]:
# Step 2: SQL join statement to fetch data into Pandas DataFrame
def fetch_data():
    """Load the joined patient / test-result / disease-status rows from health.db.

    Returns:
        pandas.DataFrame with the columns patient_id, name, age, sex, disease,
        test_features and target_status.

    TestResults and DiseaseStatus are matched on both patient_id and disease.
    Joining on patient_id alone would pair every test result of a patient with
    every status of that patient, so a patient recorded for two diseases would
    yield four rows, two of them carrying the wrong label.
    """
    db_conn = sqlite3.connect('health.db')
    query = '''
    SELECT p.patient_id, p.name, p.age, p.sex, t.disease, t.test_features, d.target_status
    FROM Patients p
    JOIN TestResults t ON p.patient_id = t.patient_id
    JOIN DiseaseStatus d ON p.patient_id = d.patient_id AND t.disease = d.disease
    '''
    try:
        df = pd.read_sql_query(query, db_conn)
    finally:
        # Release the connection even when the query fails.
        db_conn.close()
    return df

# Run the join once and keep the resulting DataFrame for the cells that follow.
data = fetch_data()

In [13]:
# Step 3: Explore data for stratification
from sklearn.model_selection import train_test_split

# Pull the label column out first, then keep every other column as X.
y = data['target_status']
X = data.drop('target_status', axis=1)

In [19]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [27]:
# Step 4: Data exploration with ydata-profiling and a correlation matrix
from ydata_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt

# Generate the ydata-profiling report for the training features
profile = ProfileReport(X_train, title="Data Profile Report")
profile.to_file("eda_report.html")

# Correlation matrix
# Select every numeric column. 'number' matches all integer and float widths,
# whereas listing only float64/int64 silently skips e.g. int32 or float32 columns.
numeric_columns = X_train.select_dtypes(include='number')
corr_matrix = numeric_columns.corr()

# Plot correlation matrix on a figure of its own, so the heatmap is never
# drawn over a figure an earlier cell left open.
plt.figure()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix for Numeric Features")
plt.savefig("correlation_matrix.png")
plt.close()

print("Correlation matrix heatmap saved as 'correlation_matrix.png'")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Correlation matrix heatmap saved as 'correlation_matrix.png'


In [31]:
# These names are used below but imported nowhere else in the notebook;
# importing them here keeps the cell working after Restart Kernel -> Run All.
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Ensure no missing values in X or y
print(f"Shape of X before dropping NaNs: {X.shape}")
print(f"Shape of y before dropping NaNs: {y.shape}")

# Drop rows with a NaN in any feature column, then rows with a NaN target,
# and re-align X so both keep exactly the same index.
X = X.dropna()
y = y.loc[X.index].dropna()
X = X.loc[y.index]

# Stratified train-test split with alignment
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"Shape of X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Shape of X_test: {X_test.shape}, y_test: {y_test.shape}")

# Updated Preprocessing Pipeline and Logistic Regression
# Only the columns listed here reach the classifier; every other column of X
# is dropped by the ColumnTransformer (its default remainder is 'drop').
numerical_features = ['age']  # Add relevant numerical features
categorical_features = ['sex']  # Add relevant categorical features

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_features),
    # handle_unknown='ignore' encodes a category missing from the fitted data
    # as all zeros instead of raising, which matters in CV folds and at
    # prediction time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Cross-validation
cv_results = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='f1')
print(f"F1-score: Mean={cv_results.mean():.4f}, Std={cv_results.std():.4f}")


Shape of X before dropping NaNs: (768, 6)
Shape of y before dropping NaNs: (768,)
Shape of X_train: (614, 6), y_train: (614,)
Shape of X_test: (154, 6), y_test: (154,)
F1-score: Mean=0.2537, Std=0.0961


In [38]:
# This cell runs before the cell that imports mlflow, so import it here;
# otherwise a fresh kernel stops with a NameError on this line.
import mlflow

mlflow.set_tracking_uri("https://dagshub.com/HarshithReddy-Audipudi/MDP.mlflow")


In [40]:
import os
import getpass

os.environ["MLFLOW_TRACKING_USERNAME"] = "HarshithReddy-Audipudi"

# Never store the tracking password in the notebook: anyone who can read the
# file (or its git history) can use it. Take it from the environment when it
# is already set, otherwise prompt for it without echoing.
# SECURITY: the password that was previously committed on this line must be
# treated as leaked and rotated on DagsHub.
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass.getpass("DagsHub password or access token: ")


In [42]:
# Step 6: Log results in MLFlow
import mlflow
import mlflow.sklearn

# Point MLflow at the remote tracking server and pick the experiment to log under.
mlflow.set_tracking_uri("https://dagshub.com/HarshithReddy-Audipudi/MDP.mlflow")
mlflow.set_experiment("Multiple Disease Prediction")

# One run: fit the current pipeline on the training split, then record the
# model type, the cross-validation F1 summary and the fitted pipeline itself.
with mlflow.start_run():
    pipeline.fit(X_train, y_train)
    mlflow.log_param("model_type", "Logistic Regression")
    for metric_name, metric_value in (
        ("cv_f1_mean", cv_results.mean()),
        ("cv_f1_std", cv_results.std()),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, "logistic_pipeline")

2024/12/17 23:19:50 INFO mlflow.tracking.fluent: Experiment with name 'Multiple Disease Prediction' does not exist. Creating a new experiment.


🏃 View run illustrious-lamb-347 at: https://dagshub.com/HarshithReddy-Audipudi/MDP.mlflow/#/experiments/0/runs/9e731eded9dd468696ee31649123aa1c
🧪 View experiment at: https://dagshub.com/HarshithReddy-Audipudi/MDP.mlflow/#/experiments/0


In [44]:
# Step 7: Experiment #2 with multiple classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from xgboost import XGBClassifier

# Candidate models, keyed by the label logged as "model_type".
# Random Forest and XGBoost are stochastic; seeding them (with the same seed
# as the train/test split) makes the logged scores repeatable across re-runs.
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Ridge Classifier": RidgeClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost Classifier": XGBClassifier(random_state=42)
}

for name, clf in classifiers.items():
    # Same preprocessing for every candidate; only the final estimator changes.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', clf)
    ])

    # 10-fold cross-validated F1 on the training split only.
    cv_results = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='f1')

    # One MLflow run per classifier: CV summary plus the pipeline refitted on
    # the full training split.
    with mlflow.start_run():
        pipeline.fit(X_train, y_train)
        mlflow.log_param("model_type", name)
        mlflow.log_metric("cv_f1_mean", cv_results.mean())
        mlflow.log_metric("cv_f1_std", cv_results.std())
        mlflow.sklearn.log_model(pipeline, f"{name}_pipeline")

# NOTE: after the loop, `pipeline` and `cv_results` refer to the last entry of
# `classifiers`, not to the best-scoring model.



🏃 View run fortunate-gnat-846 at: https://dagshub.com/HarshithReddy-Audipudi/MDP.mlflow/#/experiments/0/runs/c6303df1537348c29e5b02e1d5c4f537
🧪 View experiment at: https://dagshub.com/HarshithReddy-Audipudi/MDP.mlflow/#/experiments/0




🏃 View run thoughtful-jay-903 at: https://dagshub.com/HarshithReddy-Audipudi/MDP.mlflow/#/experiments/0/runs/3fcbb03df0b24ab0bcaccc2e54f6bc4b
🧪 View experiment at: https://dagshub.com/HarshithReddy-Audipudi/MDP.mlflow/#/experiments/0




🏃 View run kindly-hound-390 at: https://dagshub.com/HarshithReddy-Audipudi/MDP.mlflow/#/experiments/0/runs/04a48f0f12fa44608066155ba407e8fc
🧪 View experiment at: https://dagshub.com/HarshithReddy-Audipudi/MDP.mlflow/#/experiments/0




🏃 View run inquisitive-gull-841 at: https://dagshub.com/HarshithReddy-Audipudi/MDP.mlflow/#/experiments/0/runs/405002a2c2c74ef7839bb1f41251e723
🧪 View experiment at: https://dagshub.com/HarshithReddy-Audipudi/MDP.mlflow/#/experiments/0


In [46]:
# Step 8: Feature Engineering and PCA (Experiment #3 and #5)
from sklearn.decomposition import PCA

# Fit PCA (all components kept) on the int64/float64 columns of the training split.
pca = PCA()
X_train_pca = pca.fit_transform(
    X_train.select_dtypes(include=['float64', 'int64'])
)
explained_variance = pca.explained_variance_ratio_

# Scree plot: share of variance explained by each component, numbered from 1.
plt.figure(figsize=(10, 6))
plt.plot(
    range(1, len(explained_variance) + 1),
    explained_variance,
    marker='o',
)
plt.title("Scree Plot")
plt.xlabel("Principal Component")
plt.ylabel("Explained Variance Ratio")
plt.savefig("scree_plot.png")
plt.close()

print("Scree plot saved as 'scree_plot.png'")

# Record every component's explained-variance ratio as its own metric in a new run.
with mlflow.start_run():
    for component, ratio in enumerate(explained_variance, start=1):
        mlflow.log_metric(f"PCA_Component_{component}", ratio)


Scree plot saved as 'scree_plot.png'
🏃 View run classy-asp-494 at: https://dagshub.com/HarshithReddy-Audipudi/MDP.mlflow/#/experiments/0/runs/f99bfb86b37f4cb0821e8071efc38fb3
🧪 View experiment at: https://dagshub.com/HarshithReddy-Audipudi/MDP.mlflow/#/experiments/0


In [48]:
# Step 9: Save the final model
import joblib

# `pipeline` is whichever pipeline the preceding cells bound last; refit it on
# the training split and persist it. fit() returns the pipeline itself, so
# `final_model` and `pipeline` are the same object.
pipeline.fit(X_train, y_train)
final_model = pipeline
joblib.dump(final_model, "final_model.joblib")


['final_model.joblib']

In [62]:
# Step 10: FastAPI application
from fastapi import FastAPI
from pydantic import BaseModel
import joblib

import pandas as pd
from fastapi import HTTPException

app = FastAPI()
# joblib files are pickles: only load model files this project produced itself.
model = joblib.load("final_model.joblib")

# The pipeline saved by this notebook selects its inputs by column name
# ('age' for scaling, 'sex' for one-hot encoding), so it must be given a
# DataFrame carrying those names; a bare nested list makes predict() raise.
# NOTE(review): assumes final_model.joblib is that pipeline -- update this
# list if a model with different input columns is deployed.
FEATURE_COLUMNS = ["age", "sex"]


class ModelInput(BaseModel):
    """Request body for /predict: one sample's values, in FEATURE_COLUMNS order."""
    features: list


@app.post("/predict")
def predict(input: ModelInput):
    """Predict the target status for a single sample.

    Args:
        input: request body whose `features` list holds one value per entry of
            FEATURE_COLUMNS, in that order.

    Returns:
        {"prediction": [...]} with the model output converted to a plain list.

    Raises:
        HTTPException(422): when the number of values does not match
            FEATURE_COLUMNS, or the model rejects the values.
    """
    if len(input.features) != len(FEATURE_COLUMNS):
        raise HTTPException(
            status_code=422,
            detail=f"Expected {len(FEATURE_COLUMNS)} features in the order {FEATURE_COLUMNS}, "
                   f"got {len(input.features)}.",
        )
    sample = pd.DataFrame([input.features], columns=FEATURE_COLUMNS)
    try:
        prediction = model.predict(sample)
    except ValueError as exc:
        # Report bad feature values as a client error instead of an opaque 500.
        raise HTTPException(status_code=422, detail=str(exc))
    return {"prediction": prediction.tolist()}


In [52]:
# Step 11: Dockerize the application
# Create Dockerfile as follows:
# FROM python:3.9-slim
# WORKDIR /app
# COPY . .
# RUN pip install fastapi uvicorn scikit-learn joblib pydantic pandas xgboost
# CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

# Step 12: Deploy the containerized API
# Push the Docker container to Docker Hub and deploy it on a cloud platform like AWS/GCP/Azure.

In [64]:
# Step 13: Streamlit app for real-time interaction
import streamlit as st
import requests

st.title("Multiple Disease Prediction")

features = st.text_input("Enter test features (comma-separated):")

if st.button("Predict"):
    # Input parsing is handled separately from the HTTP call, so a problem on
    # the service side is never reported as invalid user input.
    try:
        # Convert input features to a list of floats
        feature_list = [float(x.strip()) for x in features.split(",")]
    except ValueError:
        st.error("Please enter valid numeric inputs separated by commas.")
    else:
        try:
            # Send request to FastAPI endpoint; the timeout keeps the app from
            # hanging forever when the service does not answer.
            response = requests.post(
                "http://localhost:8000/predict",
                json={"features": feature_list},
                timeout=10
            )

            # Display prediction result
            if response.status_code == 200:
                prediction = response.json()['prediction']
                st.success(f"Prediction: {prediction}")
            else:
                st.error(f"Error: {response.text}")
        except requests.RequestException as exc:
            # Connection refused, timeout, and similar transport failures.
            st.error(f"Error: could not reach the prediction service ({exc})")
        except (ValueError, KeyError):
            # The body was not JSON, or it had no 'prediction' field.
            st.error("Error: unexpected response from the prediction service.")



In [56]:
# Final Steps: Create presentation video and JupyterBook
# Record a video walkthrough of the project and compile the code, results, and video into a JupyterBook website.
