In [1]:
#Overarching importation
import sys, os
PROJECT_ROOT = os.path.abspath("..")  # go up one directory from notebooks/
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

In [2]:
from src.Ingest_Data import DataIngestorFactory
from src.Missing_value_handling import MissingValueHandler,FillMissingValue,DropMissingValue

#-------#
# To help with reading and manipulating data
import pandas as pd
import numpy as np

# To help with data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# To be used for missing value imputation
from sklearn.impute import SimpleImputer

# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
from xgboost import XGBClassifier

# To get different metric scores, and split data
from sklearn import metrics
#---#
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score
)

# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# To use statistical functions
import scipy.stats as stats

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)

# To supress scientific notations for a dataframe
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# To supress warnings
import warnings

warnings.filterwarnings("ignore")

In [3]:
#Load Data
BASE_URL = "http://127.0.0.1:8000"
TRAIN_ENDPOINT = f"{BASE_URL}/train"
TEST_ENDPOINT = f"{BASE_URL}/test"

In [4]:
df_train = DataIngestorFactory.get_data_ingestor(TRAIN_ENDPOINT).ingest(TRAIN_ENDPOINT)

2025-10-05 16:58:28 [INFO] src.Ingest_Data: Fetching data from API endpoint: http://127.0.0.1:8000/train
2025-10-05 16:58:30 [INFO] src.Ingest_Data: Received 40000 records from API


In [5]:
#df_train['Target']

In [6]:
#df_train["Target"].value_counts().plot(kind ='barh')

In [7]:
import src.scaler as scaler

In [8]:
df_scaled = scaler.SMOTEENNSampler().impute(df_train,target_col='Target')

2025-10-05 16:58:30 [INFO] src.scaler: Applying SMOTEENN hybrid resampling. Input size: (40000, 41)
2025-10-05 16:58:34 [INFO] src.scaler: SMOTEENN complete. Output size: (73091, 41)


In [9]:
# df_scaled["Target"].value_counts().plot(kind ='barh')
# df_scaled['Target'].value_counts()

In [10]:
from src.Model_Selector import CrossValidationEvaluation,ModelEvaluator

In [11]:
# #Model Selection
# models = []  # Empty list to store all the models

# # Appending models into the list
# models.append(("Logistic Regression", LogisticRegression(solver="newton-cg", random_state=1)))
# models.append(("Decision Tree", DecisionTreeClassifier(random_state=1)))
# models.append(("XGBoost", XGBClassifier(random_state=1, eval_metric="logloss", device='cpu')))

# strategy = CrossValidationEvaluation()
# evaluator = ModelEvaluator(strategy)


# # Evaluate models
# results = evaluator.evaluate_models(models, df_scaled.drop('Target',axis = 1), df_scaled['Target'])

In [12]:
# 1️⃣ Define features and target
X = df_scaled.drop('Target', axis=1)
y = df_scaled['Target']

# 2️⃣ Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.33,      # 33% validation data
    random_state=42,     # ensures reproducibility
    stratify=y           # keeps class balance (important for imbalance)
)


In [13]:
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
config_path = os.path.join(project_root, "config")

In [14]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73091 entries, 0 to 73090
Data columns (total 40 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      73091 non-null  float64
 1   V2      73091 non-null  float64
 2   V3      73091 non-null  float64
 3   V4      73091 non-null  float64
 4   V5      73091 non-null  float64
 5   V6      73091 non-null  float64
 6   V7      73091 non-null  float64
 7   V8      73091 non-null  float64
 8   V9      73091 non-null  float64
 9   V10     73091 non-null  float64
 10  V11     73091 non-null  float64
 11  V12     73091 non-null  float64
 12  V13     73091 non-null  float64
 13  V14     73091 non-null  float64
 14  V15     73091 non-null  float64
 15  V16     73091 non-null  float64
 16  V17     73091 non-null  float64
 17  V18     73091 non-null  float64
 18  V19     73091 non-null  float64
 19  V20     73091 non-null  float64
 20  V21     73091 non-null  float64
 21  V22     73091 non-null  float64
 22

In [15]:
import os
import yaml
from xgboost import XGBClassifier
from src.Model_Tuner import OptunaTuning

# Absolute path to YAML file
yaml_file_path = r"C:\Users\laaro\OneDrive\桌面\Cost_Minimasation_ML\config\xgbclassifier.yaml"

# Load YAML manually (optional, just to check contents)
with open(yaml_file_path, "r", encoding="utf-8") as f:
    param_grid = yaml.safe_load(f)
print("✅ YAML loaded successfully")
print(param_grid)

# Initialize tuner — pass the **folder**, not the YAML file
config_dir = r"C:\Users\laaro\OneDrive\桌面\Cost_Minimasation_ML\config"

tuner = OptunaTuning(
    config_dir=config_dir,
    n_trials=100,
    cv_folds=5
)

# Tune model using training data
best_model, best_params, best_score = tuner.tune(
    model_name="xgbclassifier",
    model_class=XGBClassifier,
    X=X_train,
    y=y_train
)

print("✅ Best Params Found by Optuna:")
print(best_params)
print(f"Mean CV Score during tuning: {best_score:.4f}")

# Validate on validation set
y_val_pred = best_model.predict(X_val)

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_val, y_val_pred)
acc = accuracy_score(y_val, y_val_pred)

print("\n📊 Validation Results")
print("Confusion Matrix:\n", cm)
print(f"Accuracy: {acc:.4f}")


✅ YAML loaded successfully
{'max_depth': {'low': 3, 'high': 12, 'method': 'int'}, 'learning_rate': {'low': 0.01, 'high': 0.3, 'method': 'float'}, 'n_estimators': {'low': 100, 'high': 2000, 'method': 'int'}, 'subsample': {'low': 0.5, 'high': 1.0, 'method': 'float'}, 'colsample_bytree': {'low': 0.5, 'high': 1.0, 'method': 'float'}, 'gamma': {'low': 0, 'high': 5, 'method': 'float'}, 'min_child_weight': {'low': 1, 'high': 10, 'method': 'int'}, 'reg_alpha': {'low': 0, 'high': 1, 'method': 'float'}, 'reg_lambda': {'low': 0, 'high': 1, 'method': 'float'}}
2025-10-05 16:58:34 [INFO] src.Model_Tuner: Loaded tuning configuration for xgbclassifier from C:\Users\laaro\OneDrive\桌面\Cost_Minimasation_ML\config\xgbclassifier.yaml
2025-10-05 16:58:34 [INFO] src.Model_Tuner: Starting Optuna optimization for xgbclassifier (100 trials)...


[I 2025-10-05 16:58:34,930] A new study created in memory with name: xgbclassifier_optuna_tuning


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-10-05 16:58:43,864] Trial 0 finished with value: 2.5130632198415452 and parameters: {'max_depth': 10, 'learning_rate': 0.2015240378133949, 'n_estimators': 1997, 'subsample': 0.7622785905295169, 'colsample_bytree': 0.7480401366717921, 'gamma': 4.214679035113745, 'min_child_weight': 4, 'reg_alpha': 0.7665866098015518, 'reg_lambda': 0.631693045631458}. Best is trial 0 with value: 2.5130632198415452.
[I 2025-10-05 16:58:48,248] Trial 1 finished with value: 2.4973995378677207 and parameters: {'max_depth': 3, 'learning_rate': 0.14318128610605346, 'n_estimators': 566, 'subsample': 0.6372014050767707, 'colsample_bytree': 0.850726540712077, 'gamma': 1.173028313102325, 'min_child_weight': 8, 'reg_alpha': 0.7477026330866718, 'reg_lambda': 0.931295858125587}. Best is trial 0 with value: 2.5130632198415452.
[I 2025-10-05 16:58:56,298] Trial 2 finished with value: 2.5036955242827195 and parameters: {'max_depth': 5, 'learning_rate': 0.09268503556538826, 'n_estimators': 1653, 'subsample': 0.63

In [17]:
# Save tuned model
tuner.save_tuned_model("xgbclassifier", best_model)

2025-10-05 18:38:43 [INFO] src.Model_Tuner: Tuned model 'xgbclassifier' saved to: models/tuned\xgbclassifier_optuna_tuned.pkl


'models/tuned\\xgbclassifier_optuna_tuned.pkl'

In [18]:
os.getcwd()

'c:\\Users\\laaro\\OneDrive\\桌面\\Cost_Minimasation_ML\\notebooks'

In [19]:
from src.Model_Loader import ModelLoader

In [27]:
model = ModelLoader.load_model(r"C:\Users\laaro\OneDrive\桌面\Cost_Minimasation_ML\notebooks\models\tuned\xgbclassifier_optuna_tuned.pkl")

2025-10-05 18:47:28 [INFO] src.Model_Loader: Model loaded successfully from C:\Users\laaro\OneDrive\桌面\Cost_Minimasation_ML\notebooks\models\tuned\xgbclassifier_optuna_tuned.pkl


In [28]:
df_test = DataIngestorFactory.get_data_ingestor(TEST_ENDPOINT).ingest(TEST_ENDPOINT)

2025-10-05 18:47:28 [INFO] src.Ingest_Data: Fetching data from API endpoint: http://127.0.0.1:8000/test
2025-10-05 18:47:29 [INFO] src.Ingest_Data: Received 10000 records from API


In [31]:
loader = ModelLoader()
model = loader.load_model(r"C:\Users\laaro\OneDrive\桌面\Cost_Minimasation_ML\notebooks\models\tuned\xgbclassifier_optuna_tuned.pkl")

2025-10-05 18:47:59 [INFO] src.Model_Loader: Model loaded successfully from C:\Users\laaro\OneDrive\桌面\Cost_Minimasation_ML\notebooks\models\tuned\xgbclassifier_optuna_tuned.pkl


In [33]:
df_test_X = df_test.drop('Target',axis = 1)

In [34]:
y_pred = loader.predict(model, df_test_X)

2025-10-05 18:48:41 [INFO] src.Model_Loader: Predictions made on input of shape (10000, 40)
