In [1]:
from pathlib import Path
from health_lifestyle_diabetes.infrastructure.utils.config_loader import YamlConfigLoader
from health_lifestyle_diabetes.infrastructure.utils.paths import get_repository_root
from health_lifestyle_diabetes.infrastructure.logger.config import configure_logging
from health_lifestyle_diabetes.infrastructure.logger.loguru_logger import LoguruLogger
from health_lifestyle_diabetes.infrastructure.data_sources.csv_dataset_repository import CSVDatasetRepository 

# Configure logging for the development environment.
configure_logging(env="dev")

# Resolve the repository root once; the paths below are derived from it.
root = get_repository_root()

# Load the YAML file that maps logical names to project-relative paths.
config_loader = YamlConfigLoader()
paths = config_loader.load_config(f"{root}/configs/paths.yaml")

# Location of the training CSV. Reuses `root` rather than calling
# get_repository_root() a second time.
train_paths = root / Path(paths["data"]["input"]["train_dataset"])

# Initialize the logger handed to the repository.
logger = LoguruLogger()

# Repository that loads the dataset from the CSV file.
csv_repo = CSVDatasetRepository(logger=logger, source_path=train_paths)

# Read the CSV file and load it into memory (a pandas DataFrame, per the original note).
data = csv_repo.load_dataset()

# Show the first 5 rows as a sanity check, then report the dataset dimensions.
display(data.head())
print(f"le dataset chargé a {data.shape[0]} lignes et {data.shape[1]} colonnes")

[32m2025-12-25 08:14:00[0m | [1mINFO    [0m | [36mconfig.py:67[0m | [33mconfigure_logging()[0m | Loguru configuré avec succès (mode: dev) | {'env': 'dev'}
[32m2025-12-25 08:14:00[0m | [1mINFO    [0m | [36mcsv_dataset_repository.py:56[0m | [33mload_dataset()[0m | Chargement du dataset depuis : /Users/surelmanda/Downloads/ml-projects/Clean-Architecture-MLops/health_lifestyle_diabetes/data/input/train.csv | {}
[32m2025-12-25 08:14:00[0m | [1mINFO    [0m | [36mcsv_dataset_repository.py:63[0m | [33mload_dataset()[0m | Dataset chargé avec succès (77837 lignes, 31 colonnes). | {}


Unnamed: 0,Age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diabetes_stage,diagnosed_diabetes
0,40,Female,White,Graduate,Upper-Middle,Employed,Current,4,83,6.7,...,48,92,131,85,104,7.59,5.21,24.6,Type 2,1
1,49,Male,Asian,Postgraduate,Lower-Middle,Employed,Former,2,345,6.4,...,41,127,186,109,187,12.01,6.75,25.0,Type 2,1
2,22,Other,Other,Postgraduate,Middle,Retired,Never,3,206,5.6,...,41,95,124,112,151,9.69,6.36,16.8,Pre-Diabetes,0
3,90,Female,Asian,Highschool,Middle,Employed,Never,2,73,9.4,...,41,204,79,113,129,18.75,5.75,40.9,Pre-Diabetes,0
4,58,Female,White,Highschool,Lower-Middle,Unemployed,Never,3,152,4.2,...,46,61,92,101,162,9.11,6.52,28.8,Type 2,1


le dataset chargé a 77837 lignes et 31 colonnes


In [2]:
from health_lifestyle_diabetes.infrastructure.feature_engineering.pipeline_feature_engineering import FeatureEngineeringPipeline

# Run the feature-engineering pipeline on the loaded training data.
# `data` itself is not rebound, so this cell can be re-run safely.
pipeline = FeatureEngineeringPipeline(logger=LoguruLogger())
df_enriched = pipeline.transform(data)

# Preview the enriched frame and compare the column counts before and after.
display(df_enriched.head())
print(f"Le dataset d'entrainement initial avait {data.shape[1]} colonnes")
# Fixed the typo "fianla" -> "final" in the message below.
print(f"Le dataset d'entrainement final apres FE a desormais {df_enriched.shape[1]} colonnes")

[32m2025-12-25 08:14:00[0m | [1mINFO    [0m | [36mpipeline_feature_engineering.py:82[0m | [33mtransform()[0m | Démarrage du pipeline complet de Feature Engineering... | {}
[32m2025-12-25 08:14:00[0m | [1mINFO    [0m | [36mexclusion.py:36[0m | [33mdrop_leakage_columns()[0m | Vérification des colonnes à risque de data leakage... | {}
[32m2025-12-25 08:14:00[0m | [1mINFO    [0m | [36mexclusion.py:40[0m | [33mdrop_leakage_columns()[0m | Suppression des colonnes à risque de leakage : ['diabetes_stage', 'diabetes_risk_score'] | {}
[32m2025-12-25 08:14:00[0m | [1mINFO    [0m | [36mbase_preprocessing.py:25[0m | [33mclean_categorical_variables()[0m | Nettoyage des variables catégorielles... | {}
[32m2025-12-25 08:14:00[0m | [1mINFO    [0m | [36mbase_preprocessing.py:32[0m | [33mclean_categorical_variables()[0m | Libellés uniformisés avec succès. | {}
[32m2025-12-25 08:14:00[0m | [1mINFO    [0m | [36mdemographics_features.py:82[0m | [33mtransform()

Unnamed: 0,Age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,glucose_diff,glycemic_load,dyslipidemia_flag,cardiometabolic_burden_score,bp_ratio,activity_adequacy_ratio,screen_sleep_ratio,sedentary_risk_flag,lifestyle_score,sleep_efficiency
0,40,Female,White,Graduate,Upper-Middle,Employed,Current,4,83,6.7,...,19,1878.5,0,0,1.411765,0.553333,1.136986,1,4,0.784946
1,49,Male,Asian,Postgraduate,Lower-Middle,Employed,Ex-Smoker,2,345,6.4,...,78,4011.2,1,2,1.859375,2.3,0.369565,0,6,1.703704
2,22,Unknown,Other,Postgraduate,Middle,Inactive,Never,3,206,5.6,...,39,2844.8,0,1,1.384615,1.373333,0.714286,0,6,1.184615
3,90,Female,Asian,Highschool,Middle,Employed,Never,2,73,9.4,...,16,3412.6,0,3,2.157143,0.486667,0.913043,1,6,0.945205
4,58,Female,White,Highschool,Lower-Middle,Inactive,Never,3,152,4.2,...,61,2373.5,0,0,1.265957,1.013333,1.291667,0,6,0.699029


[32m2025-12-25 08:14:01[0m | [1mINFO    [0m | [36mpipeline_feature_engineering.py:120[0m | [33mtransform()[0m | Pipeline exécuté avec succès. Nombre total de colonnes : 52 | {}
Le dataset d'entrainement initial avait 31 colonnes
Le dataset d'entrainement fianla apres FE a desormais 52 colonnes


In [5]:
from loguru import logger

# Bind the pipeline step as structured context, then log the start message
# through the bound logger so the record carries {'step': 'feature_engineering'}.
step_logger = logger.bind(step="feature_engineering")
step_logger.info("Début du pipeline")

[32m2025-12-25 08:08:18[0m | [1mINFO[0m | [36m__main__[0m | Début du pipeline | {'step': 'feature_engineering'}


In [1]:
from health_lifestyle_diabetes.infrastructure.logger.config import configure_logging
from health_lifestyle_diabetes.infrastructure.logger.loguru_logger import LoguruLogger

# Smoke test for the project logging setup.
# Configure logging in "dev" mode; this must run before the wrapper is used.
configure_logging(env="dev")
# Rebinds the notebook-global `logger` to the project's LoguruLogger wrapper.
logger = LoguruLogger()
# Emit one INFO record to check that the configured format is applied.
logger.info("Début du pipeline")

[32m2025-12-25 08:10:43[0m | [1mINFO    [0m | [36mconfig.py:67[0m | [33mconfigure_logging()[0m | Loguru configuré avec succès (mode: dev) | {'env': 'dev'}
[32m2025-12-25 08:10:43[0m | [1mINFO    [0m | [36mloguru_logger.py:17[0m | [33minfo()[0m | Début du pipeline | {}
