In [None]:
#Overarching importation
import sys, os

# Resolve the project root idempotently.
# The project packages are imported as `src.*`, so a working directory that
# already contains `src/` is the project root and must not be left again.
# Without this guard, re-running the cell climbs one more directory each time,
# because os.chdir() changes what ".." refers to.
_cwd = os.getcwd()
if os.path.isdir(os.path.join(_cwd, "src")):
    PROJECT_ROOT = _cwd
else:
    PROJECT_ROOT = os.path.abspath("..")  # go up one directory from notebooks/

# Make project packages importable and anchor relative paths at the root.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
os.chdir(PROJECT_ROOT)

In [None]:
from src.Data_ingestor import DataIngestorFactory
from src.Missing_value_handling import MissingValueHandler,FillMissingValue,DropMissingValue

#-------#
# To help with reading and manipulating data
import pandas as pd
import numpy as np

# To help with data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# To be used for missing value imputation
from sklearn.impute import SimpleImputer

# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
from xgboost import XGBClassifier

# To get different metric scores, and split data
from sklearn import metrics
#---#
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score
)

# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from lightgbm import LGBMClassifier

# To use statistical functions
import scipy.stats as stats

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)

# To supress scientific notations for a dataframe
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# To supress warnings
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Data source configuration: base address (loopback host, port 8000) and the
# train/test endpoints derived from it.
BASE_URL = "http://127.0.0.1:8000"
TRAIN_ENDPOINT, TEST_ENDPOINT = f"{BASE_URL}/train", f"{BASE_URL}/test"

In [None]:
#Load Data
# BASE_URL, TRAIN_ENDPOINT and TEST_ENDPOINT come from the configuration cell
# directly above; they are intentionally not redefined here so the endpoints
# have a single source of truth.
# For each endpoint, ask the factory for a matching ingestor and ingest from
# that same endpoint (train first, then test).
df_train, df_test = (
    DataIngestorFactory.get_data_ingestor(endpoint).ingest(endpoint)
    for endpoint in (TRAIN_ENDPOINT, TEST_ENDPOINT)
)

In [None]:
# Handle missing values in the training frame with the fill strategy, method='mean'.
handler = MissingValueHandler(FillMissingValue, method='mean')

# Keep the handler's result instead of discarding it. Previously the return
# value was only displayed, so if the handler returns a new frame (rather than
# mutating its argument) the later cells kept working on the unfilled data.
# The None guard keeps this correct if the handler works in place and returns nothing.
_filled_train = handler.handle_missing_values(df_train)
if _filled_train is not None:
    df_train = _filled_train

In [None]:
# Create a sampler through the factory using the 'smoteenn' key and apply it to
# the training frame, with 'Target' as the label column.
from src.Data_Samplier import SamplerFactory

samplier = SamplerFactory.create('smoteenn')
# NOTE(review): the result is stored as `df_scaled`, but this cell resamples;
# no scaling happens here. Consider a more accurate name.
df_scaled = samplier.impute(df_train,target_col= 'Target')

In [None]:
# Alternative to the factory: resample the training set with the concrete
# SMOTEENN sampler class, then show the resulting count per 'Target' class.
from src.Data_Samplier import SMOTEENNSampler

df_scaled = SMOTEENNSampler().impute(df_train, target_col='Target')
df_scaled['Target'].value_counts()

In [None]:
from src.Model_Selector import CrossValidationEvaluation,ModelEvaluator

In [None]:
# Separate features and label on the training frame *before* any resampling.
# Splitting the already-resampled frame (df_scaled) puts resampled rows into
# the validation fold, which leaks information and inflates validation scores.
X = df_train.drop(columns='Target')
y = df_train['Target']

# Split into training and validation sets: stratified 80/20, fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Resample the training fold only; the validation fold keeps the original
# class distribution so it reflects real-world performance.
train_resampled = SamplerFactory.create('smoteenn').impute(
    X_train.assign(Target=y_train),
    target_col='Target',
)
X_train = train_resampled.drop(columns='Target')
y_train = train_resampled['Target']


In [None]:
# Model selection: candidate classifiers as (name, estimator) pairs.
# Every estimator is seeded with random_state=42.
models = [
    (
        "log",
        LogisticRegression(solver="newton-cg", random_state=42),
    ),
    (
        "DecisionTreeClassifier",
        DecisionTreeClassifier(random_state=42),
    ),
    (
        "XGBClassifier",
        XGBClassifier(random_state=42, eval_metric="logloss", device='cpu'),
    ),
]

In [None]:
# Evaluation strategy (n_splits=5, seeded) wrapped in an evaluator.
strategy = CrossValidationEvaluation(n_splits=5, random_state=42)
evaluator = ModelEvaluator(strategy=strategy)

# Score every candidate on the training split.
results = evaluator.evaluate_models(models=models, X=X_train, y=y_train)

# Ask the evaluator for the winning candidate's name and estimator.
best_model_name, best_model = evaluator.get_best_model(models)


In [None]:
# Model tuning
from src.Model_Tuner import OptunaTuning

# Tuning configuration: config directory "config", 50 trials, cv_folds=5.
tuner = OptunaTuning(config_dir="config", n_trials=50, cv_folds=5)

# Run the tuner; it returns the final model, its parameters and the best score.
# NOTE(review): XGBoost is tuned unconditionally here, whatever
# `best_model_name` the selection cell produced; confirm this is intended.
best_model, best_params, best_score = tuner.tune(
    "xgbclassifier",
    XGBClassifier,
    X_train,
    y_train,
)


In [None]:
# Save the tuned model through the tuner under the "xgbclassifier" key.
# NOTE(review): the destination path/format is decided inside
# OptunaTuning.save_tuned_model and is not visible from this notebook.
tuner.save_tuned_model("xgbclassifier", best_model)

In [None]:
from src.Model_Evaluator import ClassificationModelEvaluator

In [None]:
# Maps a task type to its evaluator.
# TODO(review): '123' looks like a placeholder; presumably this should point at
# ClassificationModelEvaluator (imported in the previous cell). Confirm.
# TODO: add the "regression" entry.
eval_map = dict(classification='123')

In [1]:
# Data source configuration: base address (loopback host, port 8000) and the
# train/test endpoints derived from it.
BASE_URL = "http://127.0.0.1:8000"
TRAIN_ENDPOINT, TEST_ENDPOINT = f"{BASE_URL}/train", f"{BASE_URL}/test"

In [2]:
#Overarching importation
import sys, os

# Resolve the project root idempotently.
# The project packages are imported as `src.*`, so a working directory that
# already contains `src/` is the project root and must not be left again.
# Without this guard, re-running the cell climbs one more directory each time,
# because os.chdir() changes what ".." refers to.
_cwd = os.getcwd()
if os.path.isdir(os.path.join(_cwd, "src")):
    PROJECT_ROOT = _cwd
else:
    PROJECT_ROOT = os.path.abspath("..")  # go up one directory from notebooks/

# Make project packages importable and anchor relative paths at the root.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
os.chdir(PROJECT_ROOT)
from zenml import pipeline
from steps.Data_Ingestion_Steps import data_ingestion_step
from steps.Data_Split_Steps import data_split_step
from steps.Data_Sampling_Steps import data_sampling_step
from steps.Model_Selection_Steps import model_selection_step
from steps.Model_Tuning_Steps import model_tuning_step
from steps.Missing_data_Handling_Steps import missing_value_step
import os
import sys
import time
import traceback
from datetime import datetime
from src.Get_Logging_Config import get_logger
import joblib

In [3]:
# Notebook-level logger from the project's logging helper; __name__ is
# "__main__" here, which is the logger name shown in the cell outputs.
logger = get_logger(__name__)

In [4]:
#Data Ingestion Step
# Runs the ingestion step against the training endpoint and logs the shape of
# what it returns.
logger.info("Starting data ingestion")
raw_data = data_ingestion_step(TRAIN_ENDPOINT)
logger.info(f"Data ingestion completed: {raw_data.shape}")

2025-10-22 21:26:06 [INFO] __main__: Starting data ingestion
[1;35mStarting data ingestion[0m
[1;35mRunning single step pipeline to execute step [0m[1;36mdata_ingestion_step[1;35m[0m
[1;35mInitiating a new run for the pipeline: [0m[1;36mdata_ingestion_step[1;35m.[0m
[33mIn a future release, the default Python package installer used by ZenML to build container images for your containerized pipelines will change from 'pip' to 'uv'. To maintain current behavior, you can explicitly set [0m[1;36mpython_package_installer=PythonPackageInstaller.PIP[33m in your DockerSettings.[0m
[1;35mCaching is disabled by default for [0m[1;36mdata_ingestion_step[1;35m.[0m
[1;35mUsing user: [0m[1;36mdefault[1;35m[0m
[1;35mUsing stack: [0m[1;36mdefault[1;35m[0m
[1;35m  artifact_store: [0m[1;36mdefault[1;35m[0m
[1;35m  orchestrator: [0m[1;36mdefault[1;35m[0m
[1;35mYou can visualize your pipeline runs in the [0m[1;36mZenML Dashboard[1;35m. In order to try it locally

In [12]:
# === Step 2: Missing Data Handling ===
# Fill missing values in the raw data using the 'fill' strategy with the
# 'mode' method, then log the resulting shape.
logger.info(" Handling missing data...")
clean_data = missing_value_step(
    raw_data,
    strategy='fill',
    method='mode',
)
logger.info(f"Missing data handled: {clean_data.shape}")

2025-10-22 21:56:09 [INFO] __main__:  Handling missing data...
[1;35m Handling missing data...[0m
2025-10-22 21:56:09 [INFO] src.Missing_value_handling: Executing missing value handling strategy.
[1;35mExecuting missing value handling strategy.[0m
2025-10-22 21:56:09 [INFO] src.Missing_value_handling: Filling value with mode strategy
[1;35mFilling value with mode strategy[0m
2025-10-22 21:56:10 [INFO] src.Missing_value_handling: Missing values filled.
[1;35mMissing values filled.[0m
2025-10-22 21:56:10 [INFO] __main__: Missing data handled: (40000, 41)
[1;35mMissing data handled: (40000, 41)[0m


In [15]:
# === Step 3: Sampling (if needed) ===
# Resample the cleaned data with the 'smoteenn' method, using 'Target' as the
# label column, then log the resulting shape.
logger.info("Performing data sampling...")
sampled_data = data_sampling_step(
    method='smoteenn',
    df=clean_data,
    target_col='Target',
)
logger.info(f"Sampling completed: {sampled_data.shape}")


2025-10-22 22:02:21 [INFO] __main__: Performing data sampling...
[1;35mPerforming data sampling...[0m
2025-10-22 22:02:21 [INFO] src.Data_Samplier: Applying SMOTEENN hybrid resampling. Input size: (40000, 41)
[1;35mApplying SMOTEENN hybrid resampling. Input size: (40000, 41)[0m
2025-10-22 22:02:26 [INFO] src.Data_Samplier: SMOTEENN complete. Output size: (73091, 41)
[1;35mSMOTEENN complete. Output size: (73091, 41)[0m
2025-10-22 22:02:26 [INFO] src.Data_Samplier: y Class: Target
1.0    37613
0.0    35478
Name: count, dtype: int64
[1;35my Class: Target
1.0    37613
0.0    35478
Name: count, dtype: int64[0m
2025-10-22 22:02:26 [INFO] steps.Data_Sampling_Steps: Applying sampling. using smoteenn
[1;35mApplying sampling. using smoteenn[0m
2025-10-22 22:02:26 [INFO] __main__: Sampling completed: (73091, 41)
[1;35mSampling completed: (73091, 41)[0m


In [None]:
# === Step 4: Data Split ===
# Split the sampled frame into train/test parts, holding out 20% for testing.
# NOTE(review): `sampled_data` was resampled before this split, so the test
# part can contain resampled rows and scores may be optimistic. Consider
# splitting `clean_data` first and resampling only the training part.
logger.info(" Splitting data into train/test sets...")
X_train, X_test, y_train, y_test = data_split_step(
    df=sampled_data,
    target_col='Target',
    test_size=0.2,
)
logger.info(f"Split done: Train={len(X_train)}, Test={len(X_test)}")

2025-10-22 22:17:00 [INFO] __main__:  Splitting data into train/test sets...
[1;35m Splitting data into train/test sets...[0m
[1;35mRunning single step pipeline to execute step [0m[1;36mdata_split_step[1;35m[0m
[33mUsing an external artifact as step input currently invalidates caching for the step and all downstream steps. Future releases will introduce hashing of artifacts which will improve this behavior.[0m
[1;35mInitiating a new run for the pipeline: [0m[1;36mdata_split_step[1;35m.[0m
[1;35mUploading external artifact to 'external_artifacts/external_5514d922-a640-40a3-8bc9-1276465e2f8a'.[0m
[1;35mFinished uploading external artifact 1cce898e-53a7-4417-9706-b764695aaba2.[0m
[1;35mCaching is disabled by default for [0m[1;36mdata_split_step[1;35m.[0m
[1;35mUsing user: [0m[1;36mdefault[1;35m[0m
[1;35mUsing stack: [0m[1;36mdefault[1;35m[0m
[1;35m  artifact_store: [0m[1;36mdefault[1;35m[0m
[1;35m  orchestrator: [0m[1;36mdefault[1;35m[0m
[1;35mY

In [19]:
# === Step 5: Model Selection ===
# Runs the selection step on the training split; it returns the winning
# model's name together with the model object, both of which are passed to the
# tuning step later in the notebook.
logger.info("🤖 Selecting the best model...")
best_model_name, best_model = model_selection_step(X_train,y_train)
logger.info(f"Best model: {best_model_name}")


2025-10-22 23:01:47 [INFO] __main__: 🤖 Selecting the best model...
[1;35m🤖 Selecting the best model...[0m
[1;35mRunning single step pipeline to execute step [0m[1;36mmodel_selection_step[1;35m[0m
[33mUsing an external artifact as step input currently invalidates caching for the step and all downstream steps. Future releases will introduce hashing of artifacts which will improve this behavior.[0m
[33mUsing an external artifact as step input currently invalidates caching for the step and all downstream steps. Future releases will introduce hashing of artifacts which will improve this behavior.[0m
[1;35mInitiating a new run for the pipeline: [0m[1;36mmodel_selection_step[1;35m.[0m
[1;35mUploading external artifact to 'external_artifacts/external_98ae98ec-14dc-4bd1-a972-85cc5d3ede52'.[0m
[1;35mFinished uploading external artifact ec44fa61-d82b-4364-bf34-ff0c11af3fcc.[0m
[1;35mUploading external artifact to 'external_artifacts/external_2c40aada-2f50-48e3-a1c0-e4ed864845

In [None]:
#Model Tuning (manual XGBoost run, independent of the pipeline steps)
# XGBClassifier is imported here because the import cell of this pipeline
# section does not bring it into scope.
from xgboost import XGBClassifier
from src.Model_Tuner import OptunaTuning

#Step 1 Define Tuning configuration
tuner = OptunaTuning(config_dir="config", n_trials=50, cv_folds=5)

#Step 2 Run Tuning to get final model, parameter and best score
# The results use their own names so they do not overwrite `best_model`, which
# the Step 6 cell passes to model_tuning_step together with `best_model_name`
# from model selection; overwriting it could pair the selected name with a
# different model.
xgb_tuned_model, xgb_best_params, xgb_best_score = tuner.tune(
    "xgbclassifier",
    XGBClassifier,
    X_train,
    y_train,
)


In [None]:
# === Step 6: Model Tuning ===
# Tune the selected model on the training split; the step returns the tuned
# model, its parameters and the best score, which is logged to two decimals.
logger.info("Tuning best model...")
tuned_model, best_params, best_score = model_tuning_step(
    best_model_name=best_model_name,
    best_model=best_model,
    X_train=X_train,
    y_train=y_train,
)
logger.info(f"Model tuning done: {tuned_model.__class__.__name__} - Final score: {best_score:.2f}")


2025-10-22 23:17:03 [INFO] __main__: Tuning best model...
[1;35mTuning best model...[0m
[1;35mRunning single step pipeline to execute step [0m[1;36mmodel_tuning_step[1;35m[0m
[33mUsing an external artifact as step input currently invalidates caching for the step and all downstream steps. Future releases will introduce hashing of artifacts which will improve this behavior.[0m
[33mUsing an external artifact as step input currently invalidates caching for the step and all downstream steps. Future releases will introduce hashing of artifacts which will improve this behavior.[0m
[33mUsing an external artifact as step input currently invalidates caching for the step and all downstream steps. Future releases will introduce hashing of artifacts which will improve this behavior.[0m
[1;35mInitiating a new run for the pipeline: [0m[1;36mmodel_tuning_step[1;35m.[0m
[1;35mUploading external artifact to 'external_artifacts/external_df8fb76f-4b93-40fb-bf32-a8a42df43d7a'.[0m
[1;35

[model_tuning_step] [I 2025-10-22 23:17:06,295] A new study created in memory with name: xgbclassifier_optuna_tuning


  0%|          | 0/50 [00:00<?, ?it/s]

[model_tuning_step] [I 2025-10-22 23:17:11,984] Trial 0 finished with value: 0.984499892403494 and parameters: {'max_depth': 10, 'learning_rate': 0.1745205335405329, 'n_estimators': 1425, 'subsample': 0.6188338544440873, 'colsample_bytree': 0.9549710323520944, 'gamma': 0.18462668954740136, 'min_child_weight': 8, 'reg_alpha': 0.5411819568860157, 'reg_lambda': 0.234058363334949}. Best is trial 0 with value: 0.984499892403494.
[model_tuning_step] [I 2025-10-22 23:17:13,868] Trial 1 finished with value: 0.9585875724452763 and parameters: {'max_depth': 3, 'learning_rate': 0.29087979044703577, 'n_estimators': 448, 'subsample': 0.9941911803922239, 'colsample_bytree': 0.7841158667666361, 'gamma': 0.17559777740408233, 'min_child_weight': 4, 'reg_alpha': 0.9232564061243445, 'reg_lambda': 0.9255221629903605}. Best is trial 0 with value: 0.984499892403494.
[model_tuning_step] [I 2025-10-22 23:17:17,664] Trial 2 finished with value: 0.9583598901523706 and parameters: {'max_depth': 11, 'learning_rat

In [23]:
# Directory for saved models: "artifacts" under the current working directory.
# It is created on first use and reused if it already exists.
artifacts_dir = os.path.join(os.getcwd(), "artifacts")
os.makedirs(name=artifacts_dir, exist_ok=True)


In [24]:
# Persist the tuned model under a timestamped file name (YYYYMMDD_HHMMSS, local
# time), so each run writes its own file instead of overwriting the previous one.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_path = os.path.join(artifacts_dir, f"best_model_{timestamp}.pkl")
# joblib.dump serialises with pickle; only load such files from trusted sources.
joblib.dump(tuned_model, model_path)
logger.info(f"💾 Model saved at {model_path}")

2025-10-22 23:23:21 [INFO] __main__: 💾 Model saved at /Users/hanli/cost_ml_202509/Cost_Minimasation_ML/artifacts/best_model_20251022_232321.pkl
[1;35m💾 Model saved at /Users/hanli/cost_ml_202509/Cost_Minimasation_ML/artifacts/best_model_20251022_232321.pkl[0m
