In [14]:
# To help with reading and manipulating data
import pandas as pd
import numpy as np

# To help with data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# To be used for missing value imputation
from sklearn.impute import SimpleImputer

# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
from xgboost import XGBClassifier

# To get different metric scores, and split data
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    ConfusionMatrixDisplay,
)

# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# To use statistical functions
import scipy.stats as stats

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)

# To supress scientific notations for a dataframe
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# To supress warnings
import warnings

warnings.filterwarnings("ignore")

import os
#os.chdir('./cost-minimization_ML')

#File reader
from src.ingest_data import DataIngestorFactory
from src.missing_value_imputation import *
from src.data_splitter import *
from src.model_selection import *
from src.tune_model import ModelTunning  # assuming saved as model_tunning.py

In [15]:
file_path = '/Users/hanli/cost-minimization_ML/data/Training_raw/Train.csv'
file_extension = os.path.splitext(file_path)[1]
data_ingestor = DataIngestorFactory.get_data_ingestor(file_extension)
df = data_ingestor.ingest(file_path)  # This will work now

In [16]:
missing_value_handler = MissingValueHandler(DropMissingValue(axis=0, thresh=3))
missing_value_handler.set_strategy(FillMissingValue(method='median'))
df_filled = missing_value_handler.handle_missing_values(df)

2025-06-15 00:00:17,048 - INFO - Switching missing value handling strategy.
2025-06-15 00:00:17,050 - INFO - Executing missing value handling strategy.
2025-06-15 00:00:17,050 - INFO - Filling missing value with median strategy
2025-06-15 00:00:17,141 - INFO - Missing values filled.


In [17]:
data_splitter = DataSplitter(SimpleTrainTestSplitStrategy(test_size=0.2))
X_train, X_test, y_train, y_test = data_splitter.split(df_filled, target_column='Target')

2025-06-15 00:00:17,987 - INFO - Splitting data using the selected strategy.
2025-06-15 00:00:17,989 - INFO - Performing simple train-test split.
2025-06-15 00:00:18,015 - INFO - Train-test split completed.


In [18]:
models = []  # Empty list to store all the models

# Appending models into the list

models.append(
    ("Logistic Regression", LogisticRegression(solver="newton-cg", random_state=1))
)
models.append(("dtree", DecisionTreeClassifier(random_state=1)))
models.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")))

In [19]:
strategy = CrossValidationEvaluator(n_split=5, random_state=42)
evaluator = ModelEvaluator(strategy)

# Evaluate models
results = evaluator.evaluate_models(models, X_train, y_train)

# Get the best model
best_model_name, best_model = evaluator.get_best_model(models)

# Save the best model (correct static method call)
model_path = ModelEvaluator.save_best_model(best_model_name, best_model)


2025-06-15 00:00:24,229 - INFO - Evaluating models with maintenance cost optimization
2025-06-15 00:00:24,230 - INFO - Starting CV evaluation with maintenance cost scoring
2025-06-15 00:00:25,133 - INFO - Logistic Regression: Mean cost ratio = 1.4217 (±0.0090)
2025-06-15 00:00:36,676 - INFO - dtree: Mean cost ratio = 1.7411 (±0.0214)
2025-06-15 00:00:38,407 - INFO - Xgboost: Mean cost ratio = 2.0721 (±0.0401)
2025-06-15 00:00:38,409 - INFO - Best model 'Xgboost' saved to: models/Xgboost_base.pkl


In [20]:
tuner = ModelTunning(config_dir="config")

In [21]:
# Perform hyperparameter tuning
tuned_model = tuner.tune(best_model_name, best_model, X_train, y_train, cv=5, n_iter=30, random_state=42)

# Save the tuned model
model_path = tuner.save_tuned_model(best_model_name, tuned_model)
print(f"Tuned model saved at: {model_path}")

2025-06-15 00:00:47,260 - INFO - Loaded tuning config for Xgboost, from config/Xgboost.yaml
2025-06-15 00:00:47,260 - INFO - Starting hyperparameter tuning for Xgboost ...
2025-06-15 00:01:48,680 - INFO - Best parameters for Xgboost: {'subsample': 0.7, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.7}
2025-06-15 00:01:48,680 - INFO - Best CV score: 2.0894
2025-06-15 00:01:48,684 - INFO - Tuned model 'Xgboost' saved to: models/tuned/Xgboost_tuned.pkl


Tuned model saved at: models/tuned/Xgboost_tuned.pkl


In [28]:
X_train

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40
25205,3.925,4.826,4.900,2.909,0.890,-1.945,-0.062,-1.883,3.677,-1.911,-4.766,-2.885,0.764,-1.253,-0.458,-5.045,2.455,-1.503,-0.202,0.401,-4.052,0.059,-3.256,-0.718,2.774,5.970,0.206,-1.063,-3.801,-5.711,2.209,-2.191,-3.679,-1.801,0.039,-0.356,-1.472,-1.615,-0.367,0.096
5266,-2.914,-0.877,-1.360,0.377,2.659,-2.849,-1.244,4.461,-2.112,-0.786,-6.822,4.482,-0.606,0.726,-2.617,-0.137,5.695,4.419,-1.250,2.562,-1.403,3.295,7.674,7.427,-0.478,-0.937,-5.618,0.721,-3.800,0.094,1.433,8.096,-2.681,-2.210,0.290,3.395,1.993,-3.277,-0.242,1.092
6876,-3.845,3.017,1.263,1.195,2.292,1.802,-0.241,-3.967,-2.108,1.657,0.027,0.308,-3.491,-3.266,-0.480,-3.084,-0.621,1.548,3.137,-4.506,0.044,-0.628,-2.787,6.223,-2.347,3.406,-2.895,1.208,1.097,0.702,0.384,1.851,4.820,0.504,1.421,-0.271,-1.465,8.526,1.457,-1.504
18031,7.603,8.126,1.068,3.567,0.034,0.772,1.437,-6.074,5.965,-3.242,-1.027,-4.107,-0.431,-3.264,-0.294,-9.683,2.101,-1.475,1.206,-3.174,-5.735,3.017,-1.739,-0.207,1.943,6.360,1.213,0.519,-2.870,-4.719,-3.945,-5.002,-2.606,-2.593,0.103,-7.295,-2.266,5.251,-1.941,-0.697
21984,-2.888,-2.278,4.214,0.768,-1.087,-3.264,-1.296,1.335,-1.536,2.278,-3.472,4.072,3.250,0.344,-2.993,-1.217,-0.822,-0.091,4.097,3.021,-3.503,-0.010,-0.936,0.956,0.019,0.189,0.167,-0.532,-2.730,-0.521,1.246,1.227,-2.149,3.542,3.152,4.905,0.258,-2.519,2.670,-3.736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6175,1.831,-0.443,4.964,2.529,-0.597,-4.833,-1.610,3.151,0.685,-0.147,-7.072,1.655,2.131,0.619,-2.245,-3.006,3.433,-0.173,2.048,4.545,-6.018,1.827,0.953,2.360,2.254,1.370,-1.495,-0.807,-4.723,-2.134,3.287,3.575,-4.623,-0.401,4.311,3.053,-0.406,-7.068,-0.418,-1.567
9704,-0.199,-3.018,1.486,5.912,-3.574,-4.337,-3.388,3.113,1.742,-1.828,-4.377,1.941,1.647,-0.490,-5.085,-5.668,6.330,0.786,5.680,9.388,-8.058,3.678,2.864,2.963,0.323,-3.843,-6.087,1.830,-0.344,5.648,-5.528,3.465,-1.986,1.164,8.284,-3.484,0.874,-6.558,-2.285,-4.385
11190,-3.856,-4.439,0.345,-3.111,1.455,-1.397,-1.081,3.685,-2.426,1.720,-3.442,1.731,1.520,3.147,-0.645,4.608,-0.245,2.339,-3.782,2.570,2.967,-1.151,1.858,-0.386,0.606,-2.383,0.608,-1.182,-2.334,-1.751,4.572,2.934,-2.093,0.195,-2.910,7.861,1.980,-5.497,2.182,1.651
26569,-2.639,4.626,1.151,4.879,1.345,1.742,-0.922,-7.271,3.429,-0.558,-0.948,-4.580,-1.175,-3.112,-2.817,-7.644,1.022,0.268,2.409,-0.830,-1.396,-0.763,-5.134,1.261,-0.914,4.455,-2.302,1.808,-0.555,-2.046,-4.798,-3.089,1.769,1.232,-1.075,-3.943,-1.287,7.667,1.462,-3.628


In [27]:
from src.model_building import ModelBuilder, ModelBuildingStrategy, XGBClassifierStrategy


builder = ModelBuilder(XGBClassifierStrategy())
model = builder.build_model(X_train, y_train)

2025-06-15 00:04:52,534 - INFO - Loading tuned XGBClassifier from models/tuned/Xgboost_tuned.pkl
2025-06-15 00:04:52,540 - INFO - Training the loaded XGBClassifier on the provided training data.
2025-06-15 00:04:53,134 - INFO - Model training complete.
