### 🧠 End-to-End Training Pipeline (Feature Engineering, Feature Selection & CatBoost)

In [None]:
# ============================================================
# 0. Imports
# ============================================================
from pathlib import Path
from catboost import CatBoostClassifier

from health_lifestyle_diabetes.infrastructure.logger.config import configure_logging
from health_lifestyle_diabetes.infrastructure.logger.loguru_logger import LoguruLogger

from health_lifestyle_diabetes.infrastructure.utils.config_loader import YamlConfigLoader
from health_lifestyle_diabetes.infrastructure.utils.paths import get_repository_root

from health_lifestyle_diabetes.infrastructure.data_sources.csv_dataset_repository import (
    CSVDatasetRepository
)
from health_lifestyle_diabetes.infrastructure.feature_engineering.pipeline_feature_engineering import (
    FeatureEngineeringPipeline
)
from health_lifestyle_diabetes.infrastructure.splitters.sklearn_splitter import (
    SklearnDatasetSplitter
)
from health_lifestyle_diabetes.infrastructure.model_trainers.catboost_trainer import (
    CatBoostTrainer
)
from health_lifestyle_diabetes.infrastructure.features_selections import (
    features_selection as fs
)

# ============================================================
# 1. Configuration & logging
# ============================================================
# Configure logging for the "dev" environment, then build the logger
# that is injected into every component below.
configure_logging(env="dev")
logger = LoguruLogger()

# All YAML configuration files live under <repository root>/configs.
root = get_repository_root()
config_loader = YamlConfigLoader()
configs_dir = root / "configs"

paths_cfg = config_loader.load_config(configs_dir / "paths.yaml")
splitter_cfg = config_loader.load_config(configs_dir / "splitter.yaml")

# ============================================================
# 2. Data loading
# ============================================================
# The training dataset location comes from paths.yaml and is joined
# onto the repository root.
train_dataset_entry = paths_cfg["data"]["input"]["train_dataset"]
train_data_path = root / Path(train_dataset_entry)

dataset_repo = CSVDatasetRepository(source_path=train_data_path, logger=logger)
raw_df = dataset_repo.load_dataset()

# Log the size of the loaded dataset (rows, columns).
n_rows, n_cols = raw_df.shape
logger.info(f"Dataset charg√© | lignes={n_rows} | colonnes={n_cols}")

# ============================================================
# 3. Feature engineering
# ============================================================
# Run the project's feature-engineering pipeline on the raw dataset.
fe_pipeline = FeatureEngineeringPipeline(logger=logger)
enriched_df = fe_pipeline.transform(raw_df)

# Report how many columns exist before and after the transformation.
columns_before = raw_df.shape[1]
columns_after = enriched_df.shape[1]
logger.info(
    f"Feature engineering termin√© | colonnes_avant={columns_before} | "
    f"colonnes_apr√®s={columns_after}"
)

# ============================================================
# 4. Train / validation split
# ============================================================
# Train size and random state are read from the "splitter" section of
# splitter.yaml; the target column comes from the feature-selection module.
split_settings = splitter_cfg["splitter"]
splitter = SklearnDatasetSplitter(
    train_size=split_settings["train_size"],
    target_column=fs.TARGET_COLUMN,
    random_state=split_settings["random_state"],
    logger=logger,
)

train_df, valid_df = splitter.split(enriched_df)

# Log the shape of each resulting partition.
for split_label, split_frame in (("Train", train_df), ("Valid", valid_df)):
    logger.info(f"{split_label} shape : {split_frame.shape}")

# ============================================================
# 4.b Feature selection (IMPORTANT)
# ============================================================
logger.info(f"Feature selection | {len(fs.SELECTED_FEATURES)} features retenues")

# Keep only the selected features plus the target column, in both partitions.
kept_columns = fs.SELECTED_FEATURES + [fs.TARGET_COLUMN]
train_df = train_df[kept_columns]
valid_df = valid_df[kept_columns]

logger.info("Feature selection appliqu√©e")

# ============================================================
# 5. X / y preparation
# ============================================================
# X is every remaining column except the target; y is the target column.
target_name = fs.TARGET_COLUMN

X_train = train_df.drop(columns=[target_name])
y_train = train_df[target_name]

X_valid = valid_df.drop(columns=[target_name])
y_valid = valid_df[target_name]

logger.info(f"Dimensions finales | X_train={X_train.shape} | X_valid={X_valid.shape}")

# ============================================================
# 6. CatBoost training
# ============================================================
# Hyperparameters handed to the project's CatBoostTrainer.
# NOTE(review): learning_rate=0.5 looks high for a 500-iteration run —
# confirm this value is intended.
catboost_params = dict(
    iterations=500,
    learning_rate=0.5,
    depth=6,
    loss_function="Logloss",
    eval_metric="Logloss",
    verbose=100,
)

trainer = CatBoostTrainer(params=catboost_params, logger=logger)

# The validation partition is passed alongside the training partition.
model = trainer.train(
    X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid
)

logger.info("Entra√Ænement CatBoost termin√©")

# ============================================================
# 7. Model persistence (overwrite)
# ============================================================
# Make sure <repository root>/models exists before writing the model file.
models_dir = root / "models"
models_dir.mkdir(parents=True, exist_ok=True)
MODEL_PATH = models_dir / "catboost_diabetes_classifier.cbm"

model.save_model(MODEL_PATH)
logger.info(f"Mod√®le sauvegard√© : {MODEL_PATH}")

# ============================================================
# 8. Model reload (sanity check) — currently disabled
# ============================================================
#loaded_model = CatBoostClassifier()
#loaded_model.load_model(MODEL_PATH)

#logger.info("Mod√®le recharg√© avec succ√®s")

[32m2026-01-03 18:01:33[0m | [1mINFO    [0m | [36mconfig.py:67[0m | [33mconfigure_logging()[0m | Loguru configur√© avec succ√®s (mode: dev) | {'env': 'dev'}
[32m2026-01-03 18:01:33[0m | [1mINFO    [0m | [36mcsv_dataset_repository.py:54[0m | [33mload_dataset()[0m | Chargement du dataset depuis : /Users/surelmanda/Downloads/ml-projects/Clean-Architecture-MLops/health_lifestyle_diabetes/data/input/train.csv | {}
[32m2026-01-03 18:01:34[0m | [1mINFO    [0m | [36mcsv_dataset_repository.py:61[0m | [33mload_dataset()[0m | Dataset charg√© avec succ√®s (82702 lignes, 31 colonnes). | {}
[32m2026-01-03 18:01:34[0m | [1mINFO    [0m | [36m1594397751.py:53[0m | [33m<module>()[0m | Dataset charg√© | lignes=82702 | colonnes=31 | {}
[32m2026-01-03 18:01:34[0m | [1mINFO    [0m | [36mpipeline_feature_engineering.py:82[0m | [33mtransform()[0m | D√©marrage du pipeline complet de Feature Engineering... | {}
[32m2026-01-03 18:01:34[0m | [1mINFO    [0m | [36mexclus

### 📥 Test Data Ingestion – CSV → DataFrame

In [20]:
from pathlib import Path
from health_lifestyle_diabetes.infrastructure.logger.config import configure_logging
from health_lifestyle_diabetes.infrastructure.logger.loguru_logger import LoguruLogger
from health_lifestyle_diabetes.infrastructure.utils.config_loader import YamlConfigLoader
from health_lifestyle_diabetes.infrastructure.utils.paths import get_repository_root
from health_lifestyle_diabetes.infrastructure.data_sources.csv_dataset_repository import (
    CSVDatasetRepository
)
from health_lifestyle_diabetes.application.use_cases.load_dataset_use_case import (
    LoadDatasetUseCase
)

# ============================================================
# Configuration
# ============================================================
configure_logging(env="dev")
logger = LoguruLogger()

root_path = get_repository_root()
config_loader = YamlConfigLoader()

# ============================================================
# Paths
# ============================================================
# The test dataset location comes from paths.yaml and is joined onto
# the repository root.
paths_cfg = config_loader.load_config(root_path / "configs" / "paths.yaml")
test_dataset_entry = paths_cfg["data"]["input"]["test_dataset"]
test_data_path = root_path / Path(test_dataset_entry)

# ============================================================
# Dataset loading
# ============================================================
# The CSV repository is wrapped in the application-level use case,
# which is what actually triggers the load.
dataset_repo = CSVDatasetRepository(source_path=test_data_path, logger=logger)
load_test_data = LoadDatasetUseCase(repository=dataset_repo, logger=logger)

test_df = load_test_data.execute()

# ============================================================
# Sanity check
# ============================================================
# Show the first rows in the notebook, then log the dataset dimensions.
display(test_df.head())

test_rows, test_cols = test_df.shape
logger.info(f"Dataset de test charg√© | lignes={test_rows} | colonnes={test_cols}")

[32m2026-01-03 18:01:48[0m | [1mINFO    [0m | [36mconfig.py:67[0m | [33mconfigure_logging()[0m | Loguru configur√© avec succ√®s (mode: dev) | {'env': 'dev'}
[32m2026-01-03 18:01:48[0m | [1mINFO    [0m | [36mload_dataset_use_case.py:18[0m | [33mexecute()[0m | D√©marrage du chargement du dataset... | {}
[32m2026-01-03 18:01:48[0m | [1mINFO    [0m | [36mcsv_dataset_repository.py:54[0m | [33mload_dataset()[0m | Chargement du dataset depuis : /Users/surelmanda/Downloads/ml-projects/Clean-Architecture-MLops/health_lifestyle_diabetes/data/input/test.csv | {}
[32m2026-01-03 18:01:48[0m | [1mINFO    [0m | [36mcsv_dataset_repository.py:61[0m | [33mload_dataset()[0m | Dataset charg√© avec succ√®s (14595 lignes, 31 colonnes). | {}
[32m2026-01-03 18:01:48[0m | [1mINFO    [0m | [36mload_dataset_use_case.py:20[0m | [33mexecute()[0m | Dataset charg√© : 14595 lignes, 31 colonnes. | {}


Unnamed: 0,Age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diabetes_stage,diagnosed_diabetes
0,61,Female,White,Postgraduate,Middle,Retired,Never,2,104,6.2,...,52,50,229,131,210,6.09,8.14,34.0,Type 2,1
1,44,Male,Asian,Postgraduate,Lower-Middle,Unemployed,Never,0,91,5.0,...,52,134,136,114,194,6.29,7.2,25.8,Type 2,1
2,66,Female,White,Postgraduate,Lower-Middle,Employed,Never,0,136,6.6,...,54,144,125,118,194,15.55,7.36,46.3,Type 2,1
3,44,Male,Black,Highschool,Middle,Retired,Never,2,193,4.3,...,52,138,48,97,184,9.27,6.62,23.4,Type 2,1
4,64,Male,White,Graduate,Upper-Middle,Employed,Current,2,95,6.5,...,59,141,179,92,138,14.97,5.62,34.4,No Diabetes,0


[32m2026-01-03 18:01:49[0m | [1mINFO    [0m | [36m355808870.py:47[0m | [33m<module>()[0m | Dataset de test charg√© | lignes=14595 | colonnes=31 | {}


In [23]:
from pandas import DataFrame
from catboost import CatBoostClassifier

from health_lifestyle_diabetes.infrastructure.utils.paths import get_repository_root
from health_lifestyle_diabetes.infrastructure.streaming.pandas_dataframe_streamer import (
    PandasDataFrameStreamer
)
from health_lifestyle_diabetes.application.services.dataframe_streamer_service import (
    DataFrameStreamerService
)
from health_lifestyle_diabetes.application.use_cases.apply_feature_engineering_uc import (
    ApplyFeatureEngineeringUseCase
)
from health_lifestyle_diabetes.infrastructure.feature_engineering.pipeline_feature_engineering import (
    FeatureEngineeringPipeline
)
from health_lifestyle_diabetes.infrastructure.logger.loguru_logger import LoguruLogger
from health_lifestyle_diabetes.infrastructure.features_selections import (
    features_selection as fs
)

# ============================================================
# Dependency initialisation
# ============================================================
logger = LoguruLogger()

# Feature engineering is applied through the application-level use case.
fe_pipeline = FeatureEngineeringPipeline(logger=logger)
apply_feature_engineering = ApplyFeatureEngineeringUseCase(
    pipeline=fe_pipeline,
    logger=logger,
)

streamer = PandasDataFrameStreamer()
streaming_service = DataFrameStreamerService(streamer)

# ============================================================
# Model & configuration
# ============================================================
# Ground-truth column: the same target the training cell fits on.
# (It was previously bound to fs.ID_COLUMN by mistake and never used.)
TARGET_COLUMN = fs.TARGET_COLUMN

# Probability at or above which a record is labelled positive.
DECISION_THRESHOLD = 0.5

# Notebook safety guard: stop after this many streamed records.
MAX_STREAMED_EVENTS = 10

root_path = get_repository_root()
model_path = (
    root_path
    / "models"
    / "catboost_diabetes_classifier.cbm"
)

model = CatBoostClassifier()
model.load_model(model_path)

logger.info("Mod√®le CatBoost charg√© pour l'inf√©rence")

logger.info(
    f"Feature selection active | "
    f"{len(fs.SELECTED_FEATURES)} variables utilis√©es"
)

# ============================================================
# Streaming & inference
# ============================================================
# `test_df` is expected to exist already (loaded by the test-ingestion cell).
record_stream = streaming_service.run(test_df, min_delay=1, max_delay=3)

for event_index, record in enumerate(record_stream):
    logger.info(f"Traitement du record #{event_index}")

    # --------------------------------------------------------
    # 1. Record -> single-row DataFrame
    # --------------------------------------------------------
    record_df = DataFrame([record])

    # --------------------------------------------------------
    # 2. Feature engineering
    # --------------------------------------------------------
    enriched_df = apply_feature_engineering.execute(record_df)

    # --------------------------------------------------------
    # 3. Feature selection for inference
    # --------------------------------------------------------
    # Only the selected features are passed to the model; the target
    # column carried by the record is not part of the input.
    model_input = enriched_df[fs.SELECTED_FEATURES]

    # --------------------------------------------------------
    # 4. Prediction
    # --------------------------------------------------------
    # Row 0, column 1: probability of the positive class for this record.
    prediction_proba = model.predict_proba(model_input)[0, 1]
    prediction_label = int(prediction_proba >= DECISION_THRESHOLD)

    # --------------------------------------------------------
    # 5. Output payload (business JSON)
    # --------------------------------------------------------
    prediction_event = {
        "user_id": record.get("user_id"),
        "true_diagnosed_diabetes": record.get(TARGET_COLUMN),
        "predicted_diagnosed_diabetes": prediction_label,
        "prediction_probability": round(float(prediction_proba), 4),
    }

    # Integration point for an API / Kafka / event bus.
    print(prediction_event)

    # event_index is zero-based, so +1 is the number of records handled so far.
    if event_index + 1 >= MAX_STREAMED_EVENTS:
        break

logger.info("Fin du streaming des donn√©es")

[32m2026-01-03 18:03:16[0m | [1mINFO    [0m | [36m1920831522.py:51[0m | [33m<module>()[0m | Mod√®le CatBoost charg√© pour l'inf√©rence | {}
[32m2026-01-03 18:03:16[0m | [1mINFO    [0m | [36m1920831522.py:53[0m | [33m<module>()[0m | Feature selection active | 5 variables utilis√©es | {}
[32m2026-01-03 18:03:16[0m | [1mINFO    [0m | [36mpandas_dataframe_streamer.py:35[0m | [33mstream()[0m | Envoi ligne user_id=0eac564c-b395-4e06-add0-ed07af75c9ca | {}
[32m2026-01-03 18:03:18[0m | [1mINFO    [0m | [36m1920831522.py:64[0m | [33m<module>()[0m | Traitement du record #0 | {}
[32m2026-01-03 18:03:18[0m | [1mINFO    [0m | [36mapply_feature_engineering_uc.py:39[0m | [33mexecute()[0m | D√©but du feature engineering | lignes=1 | colonnes_initiales=32 | {}
{'user_id': '0eac564c-b395-4e06-add0-ed07af75c9ca', 'true_diagnosed_diabetes': 1, 'predicted_diagnosed_diabetes': 1, 'prediction_probability': 1.0}
[32m2026-01-03 18:03:18[0m | [1mINFO    [0m | [36mpip

[32m2026-01-03 18:03:38[0m | [1mINFO    [0m | [36mdemographics_features.py:68[0m | [33m_create_age_group()[0m | Utilisation du d√©coupage d'√¢ge en 3 cat√©gories. | {}
[32m2026-01-03 18:03:38[0m | [1mINFO    [0m | [36mdemographics_features.py:111[0m | [33mtransform()[0m | Variables d√©mographiques compl√©t√©es. | {}
[32m2026-01-03 18:03:38[0m | [1mINFO    [0m | [36mmedical_features.py:86[0m | [33mtransform()[0m | Application des transformations m√©dicales... | {}
[32m2026-01-03 18:03:38[0m | [1mINFO    [0m | [36mmedical_features.py:94[0m | [33mtransform()[0m | Transformations m√©dicales compl√©t√©es. | {}
[32m2026-01-03 18:03:38[0m | [1mINFO    [0m | [36mclinical_features.py:32[0m | [33mtransform()[0m | Calcul des ratios et interactions cliniques... | {}
[32m2026-01-03 18:03:38[0m | [1mINFO    [0m | [36mclinical_features.py:41[0m | [33mtransform()[0m | Variables cliniques ajout√©es avec succ√®s. | {}
[32m2026-01-03 18:03:38[0m | [1mINF