In [None]:
# To help with reading and manipulating data
import pandas as pd
import numpy as np

# To help with data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# To be used for missing value imputation
from sklearn.impute import SimpleImputer

# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
from xgboost import XGBClassifier

# To get different metric scores, and split data
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    ConfusionMatrixDisplay,
)

# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# To use statistical functions
import scipy.stats as stats

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Display every column of a dataframe (None removes the column limit)
pd.set_option("display.max_columns", None)


def _three_decimal_format(value):
    """Render a float with exactly three digits after the decimal point."""
    return "%.3f" % value


# To suppress scientific notation when a dataframe is displayed
pd.set_option("display.float_format", _three_decimal_format)

# To suppress warnings
import warnings

warnings.filterwarnings("ignore")

import os
#os.chdir('./cost-minimization_ML')

#File reader
from src.ingest_data import DataIngestorFactory
from src.missing_value_imputation import *
from src.data_splitter import *
from src.model_selection import *
from src.tune_model import ModelTunning  # assuming saved as model_tunning.py

In [2]:
# Location of the raw training CSV. A project-relative default replaces the
# previous machine-specific absolute path so the notebook runs on any checkout;
# set the TRAIN_DATA_PATH environment variable to load a different file.
# NOTE(review): assumes the kernel's working directory is the project root
# (the same directory that contains `src/`) — confirm.
file_path = os.environ.get(
    "TRAIN_DATA_PATH",
    os.path.join("data", "Training_raw", "Train.csv"),
)

# The factory is given the file extension (".csv" for the default path) and
# returns the ingestor used to read the file.
file_extension = os.path.splitext(file_path)[1]
data_ingestor = DataIngestorFactory.get_data_ingestor(file_extension)
df = data_ingestor.ingest(file_path)

In [3]:
# NOTE(review): the handler is created with a row-drop strategy but is switched
# to median filling before anything is executed, so only the median fill is
# applied to `df` (the cell's log output shows just the fill step). Confirm the
# drop strategy is intentionally unused.
initial_drop_strategy = DropMissingValue(axis=0, thresh=3)
missing_value_handler = MissingValueHandler(initial_drop_strategy)

median_fill_strategy = FillMissingValue(method='median')
missing_value_handler.set_strategy(median_fill_strategy)

# Result is bound to a new name; `df` itself is not reassigned here.
df_filled = missing_value_handler.handle_missing_values(df)

2025-06-12 21:56:08,919 - INFO - Switching missing value handling strategy.
2025-06-12 21:56:08,921 - INFO - Executing missing value handling strategy.
2025-06-12 21:56:08,922 - INFO - Filling missing value with median strategy
2025-06-12 21:56:08,966 - INFO - Missing values filled.


In [4]:
# Split the imputed frame into train/test features and labels.
# test_size=0.2 is presumably a 20% hold-out fraction — TODO confirm against
# SimpleTrainTestSplitStrategy.
holdout_strategy = SimpleTrainTestSplitStrategy(test_size=0.2)
data_splitter = DataSplitter(holdout_strategy)

# 'Target' is the label column that is separated from the features.
X_train, X_test, y_train, y_test = data_splitter.split(
    df_filled,
    target_column='Target',
)

2025-06-12 21:56:09,987 - INFO - Splitting data using the selected strategy.
2025-06-12 21:56:09,988 - INFO - Performing simple train-test split.
2025-06-12 21:56:10,006 - INFO - Train-test split completed.


In [13]:
# Candidate classifiers as (display name, unfitted estimator) pairs.
# Every estimator is seeded with random_state=1. The display name is what
# shows up in the evaluation logs and in the saved-model file names
# (e.g. "Xgboost" -> models/Xgboost_base.pkl in the recorded output).
models = [
    ("Logistic Regression", LogisticRegression(solver="newton-cg", random_state=1)),
    ("dtree", DecisionTreeClassifier(random_state=1)),
    ("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")),
]

In [14]:
# Compare the candidate models with a seeded 5-split cross-validation strategy.
strategy = CrossValidationEvaluator(
    n_split=5,
    random_state=42,
)
evaluator = ModelEvaluator(strategy)

# Score every candidate on the training split only; the recorded log output
# reports a mean cost ratio (with spread) per model.
results = evaluator.evaluate_models(models, X_train, y_train)

# Pick the winning (name, estimator) pair from the evaluated candidates.
best_model_name, best_model = evaluator.get_best_model(models)

# Persist the untuned winner; save_best_model is invoked on the class itself,
# not on the `evaluator` instance.
model_path = ModelEvaluator.save_best_model(best_model_name, best_model)


2025-06-11 19:54:47,549 - INFO - Evaluating models with maintenance cost optimization
2025-06-11 19:54:47,550 - INFO - Starting CV evaluation with maintenance cost scoring
2025-06-11 19:54:48,476 - INFO - Logistic Regression: Mean cost ratio = 1.4217 (±0.0090)
2025-06-11 19:54:58,994 - INFO - dtree: Mean cost ratio = 1.7411 (±0.0214)
2025-06-11 19:55:10,457 - INFO - Xgboost: Mean cost ratio = 2.0820 (±0.0497)
2025-06-11 19:55:10,459 - INFO - Best model 'Xgboost' saved to: models/Xgboost_base.pkl


In [15]:
# Tuner configured from the relative "config" directory; the recorded run
# loaded config/Xgboost.yaml from it.
tuner = ModelTunning(
    config_dir="config",
)

In [16]:
# Tune the selected model on the training split.
# cv / n_iter / random_state are passed straight through to the tuner; the
# recorded run took roughly seven minutes with these settings.
tuned_model = tuner.tune(
    best_model_name,
    best_model,
    X_train,
    y_train,
    cv=5,
    n_iter=30,
    random_state=42,
)

# Persist the tuned estimator and report where it was written.
# Note: this rebinds `model_path`, replacing the base-model path set earlier.
model_path = tuner.save_tuned_model(best_model_name, tuned_model)
print(f"Tuned model saved at: {model_path}")

2025-06-11 19:55:18,122 - INFO - Loaded tuning config for Xgboost, from config/Xgboost.yaml
2025-06-11 19:55:18,122 - INFO - Starting hyperparameter tuning for Xgboost ...
2025-06-11 20:02:29,056 - INFO - Best parameters for Xgboost: {'subsample': 0.9, 'n_estimators': 250, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.9}
2025-06-11 20:02:29,057 - INFO - Best CV score: 2.0688
2025-06-11 20:02:29,061 - INFO - Tuned model 'Xgboost' saved to: models/tuned/Xgboost_tuned.pkl


Tuned model saved at: models/tuned/Xgboost_tuned.pkl
