In [4]:
#Overarching importation
import sys, os
PROJECT_ROOT = os.path.abspath("..")  # go up one directory from notebooks/
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
os.chdir(PROJECT_ROOT)

In [5]:
from src.Ingest_Data import DataIngestorFactory
from src.Missing_value_handling import MissingValueHandler,FillMissingValue,DropMissingValue

#-------#
# To help with reading and manipulating data
import pandas as pd
import numpy as np

# To help with data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# To be used for missing value imputation
from sklearn.impute import SimpleImputer

# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
from xgboost import XGBClassifier

# To get different metric scores, and split data
from sklearn import metrics
#---#
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score
)

# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from lightgbm import LGBMClassifier

# To use statistical functions
import scipy.stats as stats

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)

# To supress scientific notations for a dataframe
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# To supress warnings
import warnings

warnings.filterwarnings("ignore")

In [6]:
#Load Data
BASE_URL = "http://127.0.0.1:8000"
TRAIN_ENDPOINT = f"{BASE_URL}/train"
TEST_ENDPOINT = f"{BASE_URL}/test"

In [7]:
df_train = DataIngestorFactory.get_data_ingestor(TRAIN_ENDPOINT).ingest(TRAIN_ENDPOINT)
df_test = DataIngestorFactory.get_data_ingestor(TEST_ENDPOINT).ingest(TEST_ENDPOINT)

2025-10-09 21:35:36 [INFO] src.Ingest_Data: Fetching data from API endpoint: http://127.0.0.1:8000/train
2025-10-09 21:35:38 [INFO] src.Ingest_Data: Received 40000 records from API
2025-10-09 21:35:38 [INFO] src.Ingest_Data: Fetching data from API endpoint: http://127.0.0.1:8000/test
2025-10-09 21:35:39 [INFO] src.Ingest_Data: Received 10000 records from API


In [8]:
#If we want to scale up Trainig Set
from src.Data_Samplier import SMOTEENNSampler
df_scaled = SMOTEENNSampler().impute(df_train,target_col='Target')
df_scaled['Target'].value_counts()

2025-10-09 21:35:39 [INFO] src.Data_Samplier: Applying SMOTEENN hybrid resampling. Input size: (40000, 41)
2025-10-09 21:35:43 [INFO] src.Data_Samplier: SMOTEENN complete. Output size: (73091, 41)
2025-10-09 21:35:43 [INFO] src.Data_Samplier: y Class: Target
1.000    37613
0.000    35478
Name: count, dtype: int64


Target
1.000    37613
0.000    35478
Name: count, dtype: int64

In [9]:
from src.Model_Selector import CrossValidationEvaluation,ModelEvaluator

In [10]:
X = df_scaled.drop('Target', axis=1)
y = df_scaled['Target']

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.33,     
    random_state=42,    
    stratify=y           
)


In [8]:
# #Model Selection
models = [
    ("log", LogisticRegression(solver="newton-cg", random_state=42)),
    ("DecisionTreeClassifier", DecisionTreeClassifier(random_state=42)),
    ("XGBClassifier", XGBClassifier(random_state=42, eval_metric="logloss", device='cuda')),
    ("LGBMClassifier", LGBMClassifier(random_state=42, device='cpu'))
]

In [9]:
# Step 1: create evaluation strategy
strategy = CrossValidationEvaluation(n_splits=5, random_state=42)

# Step 2: create the evaluator with that strategy
evaluator = ModelEvaluator(strategy=strategy)

# Step 3: evaluate  models
results = evaluator.evaluate_models(models=models, X=X_train, y=y_train)

# Step 4: Return best model name and best model attribute
best_model_name,best_model = evaluator.get_best_model(models)


2025-10-09 21:23:57 [INFO] src.Model_Selector: Evaluating models using selected strategy
2025-10-09 21:23:57 [INFO] src.Model_Selector: Starting cross-validation with cost-sensitive scoring
2025-10-09 21:23:58 [INFO] src.Model_Selector: log: Mean cost ratio = 0.8082 (±0.0027)
2025-10-09 21:24:09 [INFO] src.Model_Selector: DecisionTreeClassifier: Mean cost ratio = 0.9366 (±0.0041)
2025-10-09 21:24:11 [INFO] src.Model_Selector: XGBClassifier: Mean cost ratio = 0.9702 (±0.0036)
[LightGBM] [Info] Number of positive: 20160, number of negative: 19016
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002333 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10200
[LightGBM] [Info] Number of data points in the train set: 39176, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.514601 -> initscore=0.058420
[LightGBM] [Info] Start training from score 0.058420
[LightGBM] [Info] Number of

In [10]:
best_model_name

'XGBClassifier'

In [11]:
best_model

In [12]:
#Model Tunning
from src.Model_Tuner import OptunaTuning

#Step 1 Define Tuning configuration
tuner = OptunaTuning(config_dir="config", n_trials=50, cv_folds=5)

#Step 2 Run Tuning to get final model, parameter and best score
best_model, best_params, best_score = tuner.tune("xgbclassifier", XGBClassifier, X_train, y_train)


2025-10-09 21:24:12 [INFO] src.Model_Tuner: Loaded tuning configuration for xgbclassifier from config\xgbclassifier.yaml
2025-10-09 21:24:12 [INFO] src.Model_Tuner: Starting Optuna optimization for xgbclassifier (50 trials)...


[I 2025-10-09 21:24:12,218] A new study created in memory with name: xgbclassifier_optuna_tuning


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-10-09 21:24:17,798] Trial 0 finished with value: 0.9417629739855418 and parameters: {'max_depth': 7, 'learning_rate': 0.14957291972635783, 'n_estimators': 960, 'subsample': 0.8783877048395901, 'colsample_bytree': 0.8516330910855489, 'gamma': 3.0764342382537087, 'min_child_weight': 7, 'reg_alpha': 0.8053334780744577, 'reg_lambda': 0.8962426773573334}. Best is trial 0 with value: 0.9417629739855418.
[I 2025-10-09 21:24:24,935] Trial 1 finished with value: 0.9575130830812177 and parameters: {'max_depth': 11, 'learning_rate': 0.12889880692264843, 'n_estimators': 1438, 'subsample': 0.8752173867122736, 'colsample_bytree': 0.7450147743331643, 'gamma': 2.230814506997451, 'min_child_weight': 7, 'reg_alpha': 0.37129732726147435, 'reg_lambda': 0.5020704618064415}. Best is trial 1 with value: 0.9575130830812177.
[I 2025-10-09 21:24:32,722] Trial 2 finished with value: 0.952499597487023 and parameters: {'max_depth': 10, 'learning_rate': 0.04123839224865234, 'n_estimators': 997, 'subsample':

In [14]:
tuner.save_tuned_model("xgbclassifier", best_model)

2025-10-09 21:31:19 [INFO] src.Model_Tuner: Tuned model 'xgbclassifier' saved to: models/tuned\xgbclassifier_optuna_tuned.pkl


'models/tuned\\xgbclassifier_optuna_tuned.pkl'

In [11]:
from src.Model_Evaluator import ClassificationModelEvaluator

In [12]:
evaluator = ClassificationModelEvaluator()
model = evaluator.load_model("models/tuned/xgbclassifier_optuna_tuned.pkl")

2025-10-09 21:35:59 [INFO] src.Model_Evaluator: Loaded model from models/tuned/xgbclassifier_optuna_tuned.pkl


In [None]:
# Evaluate
metrics = evaluator.evaluate_model(model, X_val, y_val)

print("\nEvaluation Metrics:")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")


Evaluation Metrics:
Accuracy: 0.9959
Precision: 0.9959
Recall: 0.9959
F1 Score: 0.9959
Min_vs_Model_Cost: 0.9892
