In [1]:
#Overarching importation
import sys, os
PROJECT_ROOT = os.path.abspath("..")  # go up one directory from notebooks/
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
os.chdir(PROJECT_ROOT)

In [2]:
from src.Data_ingestor import DataIngestorFactory
from src.Missing_value_handling import MissingValueHandler,FillMissingValue,DropMissingValue

#-------#
# To help with reading and manipulating data
import pandas as pd
import numpy as np

# To help with data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# To be used for missing value imputation
from sklearn.impute import SimpleImputer

# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
from xgboost import XGBClassifier

# To get different metric scores, and split data
from sklearn import metrics
#---#
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score
)

# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from lightgbm import LGBMClassifier

# To use statistical functions
import scipy.stats as stats

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)

# To supress scientific notations for a dataframe
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# To supress warnings
import warnings

warnings.filterwarnings("ignore")

In [3]:
#Load Data
BASE_URL = "http://127.0.0.1:8000"
TRAIN_ENDPOINT = f"{BASE_URL}/train"
TEST_ENDPOINT = f"{BASE_URL}/test"

In [5]:
#Load Data
BASE_URL = "http://127.0.0.1:8000"
TRAIN_ENDPOINT = f"{BASE_URL}/train"
TEST_ENDPOINT = f"{BASE_URL}/test"
df_train = DataIngestorFactory.get_data_ingestor(TRAIN_ENDPOINT).ingest(TRAIN_ENDPOINT)
df_test = DataIngestorFactory.get_data_ingestor(TEST_ENDPOINT).ingest(TEST_ENDPOINT)

2025-10-12 21:45:41 [INFO] src.Data_ingestor: Fetching data from API endpoint: http://127.0.0.1:8000/train
2025-10-12 21:45:43 [INFO] src.Data_ingestor: Received 40000 records from API
2025-10-12 21:45:43 [INFO] src.Data_ingestor: Fetching data from API endpoint: http://127.0.0.1:8000/test
2025-10-12 21:45:44 [INFO] src.Data_ingestor: Received 10000 records from API


In [9]:
from src.Data_Samplier import SamplerFactory

samplier = SamplerFactory.create('smoteenn')
df_scaled = samplier.impute(df_train,target_col= 'Target')

2025-10-12 21:47:24 [INFO] src.Data_Samplier: Applying SMOTEENN hybrid resampling. Input size: (40000, 41)
2025-10-12 21:47:28 [INFO] src.Data_Samplier: SMOTEENN complete. Output size: (73091, 41)
2025-10-12 21:47:28 [INFO] src.Data_Samplier: y Class: Target
1.000    37613
0.000    35478
Name: count, dtype: int64


In [9]:
#If we want to scale up Trainig Set
from src.Data_Samplier import SMOTEENNSampler
df_scaled = SMOTEENNSampler().impute(df_train,target_col='Target')
df_scaled['Target'].value_counts()

2025-10-11 23:28:31 [INFO] src.Data_Samplier: Applying SMOTEENN hybrid resampling. Input size: (40000, 41)
2025-10-11 23:28:35 [INFO] src.Data_Samplier: SMOTEENN complete. Output size: (73091, 41)
2025-10-11 23:28:35 [INFO] src.Data_Samplier: y Class: Target
1.000    37613
0.000    35478
Name: count, dtype: int64


Target
1.000    37613
0.000    35478
Name: count, dtype: int64

In [10]:
from src.Model_Selector import CrossValidationEvaluation,ModelEvaluator

In [11]:
X = df_scaled.drop('Target', axis=1)
y = df_scaled['Target']

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.2,     
    random_state=42,    
    stratify=y           
)


In [None]:
# #Model Selection
models = [
    ("log", LogisticRegression(solver="newton-cg", random_state=42)),
    ("DecisionTreeClassifier", DecisionTreeClassifier(random_state=42)),
    ("XGBClassifier", XGBClassifier(random_state=42, eval_metric="logloss", device='cpu'))
]

In [13]:
# Step 1: create evaluation strategy
strategy = CrossValidationEvaluation(n_splits=5, random_state=42)

# Step 2: create the evaluator with that strategy
evaluator = ModelEvaluator(strategy=strategy)

# Step 3: evaluate  models
results = evaluator.evaluate_models(models=models, X=X_train, y=y_train)

# Step 4: Return best model name and best model attribute
best_model_name,best_model = evaluator.get_best_model(models)


2025-10-11 23:29:11 [INFO] src.Model_Selector: Evaluating models using selected strategy
2025-10-11 23:29:11 [INFO] src.Model_Selector: Starting cross-validation with cost-sensitive scoring
2025-10-11 23:29:14 [INFO] src.Model_Selector: log: Mean cost ratio = 0.8079 (±0.0036)
2025-10-11 23:29:28 [INFO] src.Model_Selector: DecisionTreeClassifier: Mean cost ratio = 0.9403 (±0.0047)
2025-10-11 23:29:31 [INFO] src.Model_Selector: XGBClassifier: Mean cost ratio = 0.9736 (±0.0028)
2025-10-11 23:29:31 [INFO] src.Model_Selector: Best model: XGBClassifier (Score: 0.9736)


In [None]:
#Model Tunning
from src.Model_Tuner import OptunaTuning

#Step 1 Define Tuning configuration
tuner = OptunaTuning(config_dir="config", n_trials=50, cv_folds=5)

#Step 2 Run Tuning to get final model, parameter and best score
best_model, best_params, best_score = tuner.tune("xgbclassifier", XGBClassifier, X_train, y_train)


In [None]:
tuner.save_tuned_model("xgbclassifier", best_model)

In [14]:
from src.Model_Evaluator import ClassificationModelEvaluator

In [26]:
evaluator = ClassificationModelEvaluator()
model = evaluator.load_model("/Users/hanli/cost_ml_202509/Cost_Minimasation_ML/models/tuned/xgbclassifier_optuna_tuned.pkl")

2025-10-11 23:31:00 [INFO] src.Model_Evaluator: Loaded model from /Users/hanli/cost_ml_202509/Cost_Minimasation_ML/models/tuned/xgbclassifier_optuna_tuned.pkl


In [27]:
# Evaluate
metrics = evaluator.evaluate_model(model, X_val, y_val)

print("\nEvaluation Metrics:")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")


Evaluation Metrics:
Accuracy: 0.9956
Precision: 0.9956
Recall: 0.9956
F1 Score: 0.9956
Min_vs_Model_Cost: 0.9882
