In [1]:
# notebooks/02_Model_Training.ipynb

In [2]:
# Cell 1: Setup path and imports
import logging
import os
import sys

import numpy as np
import pandas as pd
from joblib import dump, load  # For saving/loading pipeline

# Make the local `src` package importable. The path is resolved relative to the
# current working directory (its parent), so this assumes the kernel was started
# from the directory that holds this notebook -- TODO confirm for other launchers.
# The membership check keeps sys.path free of duplicate entries when the cell is re-run.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

from src import config
from src.data_processing import fetch_and_prepare_data
from src.train import train_model

# NOTE(review): no handler/format is configured in this cell; presumably logging is
# set up when `src` is imported -- confirm, otherwise INFO records will not be shown.
logger = logging.getLogger(__name__)
logger.info("Notebook 02: Model Training Started")

2025-04-23 20:37:00,727 - INFO - 3239302400 - Notebook 02: Model Training Started


In [3]:
# Cell 2: Load Data (or re-fetch)
# Fetch again to ensure consistency, especially if the EDA notebook changed things.
try:
    # Only the call that can actually fail with a data problem is guarded.
    X_train, X_test, y_train, y_test = fetch_and_prepare_data()
except Exception as e:
    # logger.exception logs at ERROR level and attaches the active traceback;
    # %-style arguments defer string formatting to the logging framework.
    logger.exception("Failed to load data for training: %s", e)
    # Stop notebook execution; `from e` records the original failure as the
    # explicit cause so it is still visible via %tb.
    raise SystemExit("Data loading failed, stopping notebook.") from e
else:
    logger.info("Data loaded/fetched for training.")

2025-04-23 20:37:00,842 - INFO - data_processing - Fetching dataset ID: 45
2025-04-23 20:37:04,839 - INFO - data_processing - Dataset fetched.
2025-04-23 20:37:04,867 - INFO - data_processing - Raw data saved to C:\Users\CCLeyton\bio-ml-jupyter\data\heart_disease_raw.csv
2025-04-23 20:37:04,883 - INFO - data_processing - Created binary target column 'target'. Distribution:
target
0    0.541254
1    0.458746
Name: proportion, dtype: float64
ca      4
thal    2
dtype: int64
2025-04-23 20:37:04,899 - INFO - data_processing - Splitting data (test ratio=0.2, random_state=42)
2025-04-23 20:37:04,919 - INFO - data_processing - Split complete. Train shape: (242, 13), Test shape: (61, 13)
2025-04-23 20:37:04,922 - INFO - 1711198423 - Data loaded/fetched for training.


In [4]:
# Cell 3: Train the Model
# Per the original note, train_model includes preprocessing and saving of the
# pipeline (implemented in src.train, not shown in this notebook).
model_type_to_train = "logistic_regression"  # Or change to test others
try:
    # Guard only the training call so that unrelated failures in the reporting
    # below are not mislabelled as a training failure.
    trained_pipeline = train_model(X_train, y_train, model_name=model_type_to_train)
except Exception as e:
    # logger.exception logs at ERROR level and attaches the active traceback.
    logger.exception("Failed to train model: %s", e)
    # Stop notebook execution, keeping the original failure as the explicit cause.
    raise SystemExit("Model training failed, stopping notebook.") from e
else:
    logger.info("'%s' model trained and pipeline saved.", model_type_to_train)
    print("Trained Pipeline object:", trained_pipeline)

2025-04-23 20:37:04,954 - INFO - train - Starting model training: logistic_regression
2025-04-23 20:37:04,958 - INFO - train - Fitting the full pipeline...
2025-04-23 20:37:05,032 - INFO - train - Pipeline training completed.
2025-04-23 20:37:05,051 - INFO - train - Trained pipeline saved to: C:\Users\CCLeyton\bio-ml-jupyter\models\logistic_regression_pipeline.joblib
2025-04-23 20:37:05,060 - INFO - 1946664891 - 'logistic_regression' model trained and pipeline saved.


Trained Pipeline object: Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                                 ('scaler', StandardScaler())])),
                ('classifier',
                 LogisticRegression(class_weight='balanced', max_iter=1000,
                                    random_state=42, solver='liblinear'))])
