In [4]:
import os

# Later cells import from the `src` package and read/write paths relative to the
# working directory, so the kernel must run from the directory that contains `src`.
# The guard keeps this cell idempotent: an unconditional os.chdir('..') climbs one
# more level every time the cell is re-executed in the same kernel.
# NOTE(review): assumes the notebook sits exactly one level below that directory — confirm.
if not os.path.isdir("src"):
    os.chdir("..")

In [6]:
# To help with reading and manipulating data
import pandas as pd
import numpy as np

# To help with data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# To be used for missing value imputation
from sklearn.impute import SimpleImputer

# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
from xgboost import XGBClassifier

# To get different metric scores, and split data
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    ConfusionMatrixDisplay,
)

# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# To use statistical functions
import scipy.stats as stats

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Show every column when a dataframe is displayed (no column truncation)
pd.options.display.max_columns = None

# Render floats with three decimals instead of scientific notation
pd.options.display.float_format = lambda value: "%.3f" % value

# Suppress all warnings
import warnings

warnings.filterwarnings(action="ignore")


# Project-local pipeline helpers (data ingestion, missing-value imputation, splitting, model selection)
from src.ingest_data import DataIngestorFactory
from src.missing_value_imputation import *
from src.data_splitter import *
from src.model_selection import *

In [7]:
# Location of the raw training CSV. The default is relative to the working
# directory set by the first cell, so the notebook is not tied to one user's
# home directory; set TRAIN_DATA_PATH to load a copy stored elsewhere.
# NOTE(review): assumes the working directory is the project root that holds `data/` — confirm.
file_path = os.environ.get(
    "TRAIN_DATA_PATH", os.path.join("data", "Training_raw", "Train.csv")
)

# The factory picks the ingestor from the file extension (".csv" for the default path).
file_extension = os.path.splitext(file_path)[1]
data_ingestor = DataIngestorFactory.get_data_ingestor(file_extension)
df = data_ingestor.ingest(file_path)

In [8]:
# The handler is created with a drop strategy and then switched to median
# filling before anything is executed.
# NOTE(review): per this cell's log output only the median fill runs; the drop
# strategy (axis=0, thresh=3) is replaced before it is ever applied — confirm
# that is intended.
drop_strategy = DropMissingValue(axis=0, thresh=3)
missing_value_handler = MissingValueHandler(drop_strategy)

median_fill_strategy = FillMissingValue(method='median')
missing_value_handler.set_strategy(median_fill_strategy)

df_filled = missing_value_handler.handle_missing_values(df)

2025-06-08 22:18:05,780 - INFO - Switching missing value handling strategy.
2025-06-08 22:18:05,781 - INFO - Executing missing value handling strategy.
2025-06-08 22:18:05,782 - INFO - Filling missing value with median strategy
2025-06-08 22:18:05,814 - INFO - Missing values filled.


In [9]:
# Split the imputed frame into train/test parts (test_size=0.2), using the
# "Target" column as the label.
split_strategy = SimpleTrainTestSplitStrategy(test_size=0.2)
data_splitter = DataSplitter(split_strategy)
X_train, X_test, y_train, y_test = data_splitter.split(
    df_filled, target_column='Target'
)

2025-06-08 22:18:06,207 - INFO - Splitting data using the selected strategy.
2025-06-08 22:18:06,208 - INFO - Performing simple train-test split.
2025-06-08 22:18:06,228 - INFO - Train-test split completed.


In [10]:
# Candidate classifiers as (display name, estimator) pairs.
# Every estimator is seeded with random_state=1.
models = [
    ("Logistic Regression", LogisticRegression(solver="newton-cg", random_state=1)),
    ("dtree", DecisionTreeClassifier(random_state=1)),
    ("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")),
]

In [11]:
# Cross-validation evaluation strategy configured with n_split=5 and random_state=42.
strategy = CrossValidationEvaluator(n_split=5, random_state=42)
evaluator = ModelEvaluator(strategy)

# Evaluate every candidate on the training split only; the log output reports a
# mean "cost ratio" (with spread) per model.
results = evaluator.evaluate_models(models, X_train, y_train)

# Get the best model among the candidates.
# NOTE(review): the logged winner (Xgboost) has the highest mean cost ratio, so
# presumably higher is better — confirm against ModelEvaluator.
best_model_name, best_model = evaluator.get_best_model(models)

# Save the best model. save_best_model is called on the class rather than on the
# evaluator instance; the log shows it writing to models/Xgboost_base.pkl.
model_path = ModelEvaluator.save_best_model(best_model_name, best_model)


2025-06-08 22:18:07,636 - INFO - Evaluating models with maintenance cost optimization
2025-06-08 22:18:07,637 - INFO - Starting CV evaluation with maintenance cost scoring
2025-06-08 22:18:08,545 - INFO - Logistic Regression: Mean cost ratio = 1.4217 (±0.0090)
2025-06-08 22:18:19,982 - INFO - dtree: Mean cost ratio = 1.7411 (±0.0214)
2025-06-08 22:18:21,763 - INFO - Xgboost: Mean cost ratio = 2.0721 (±0.0401)
2025-06-08 22:18:21,764 - INFO - Best model 'Xgboost' saved to: models/Xgboost_base.pkl
