In [1]:
from health_lifestyle_diabetes.infrastructure.utils.paths import get_repository_root
from health_lifestyle_diabetes.infrastructure.utils.config_loader import YamlConfigLoader
# Base location returned by get_repository_root(); used to locate the config files.
root = get_repository_root()

# Load the path configuration from configs/paths.yaml under that root.
config_loader = YamlConfigLoader()
paths = config_loader.load_config(f"{root}/configs/paths.yaml")

# Print the loaded mapping so the configured locations are visible in the notebook.
print(paths)

{'data': {'root': 'data/', 'input': {'raw_dataset': 'data/raw/diabetes_health_indicators.csv', 'train_dataset': 'data/input/train.csv', 'test_dataset': 'data/input/test.csv'}, 'processed': {'cleaned_dataset': 'data/processed/cleaned_diabetes.csv', 'features_dataset': 'data/processed/features_diabetes.csv'}, 'output': {'model': 'data/output/model.pkl', 'pipeline': 'data/output/pipeline.pkl'}}, 'reports': {'eda_report': 'reports/eda_reports', 'metrics_report': 'reports/metrics', 'curves_reports': 'reports/figures', 'cm_reports': 'reports/cm'}, 'logs': {'folder': 'logs', 'main_log': 'logs/app.log'}}


In [2]:
from health_lifestyle_diabetes.infrastructure.data_sources.csv_dataset_repository import CSVDatasetRepository 
from health_lifestyle_diabetes.infrastructure.logger.loguru_logger import LoguruLogger

# Logger instance handed to the infrastructure components below.
logger = LoguruLogger()

# Load the dataset from the CSV file
csv_repo = CSVDatasetRepository(logger=logger,) 

# Reads the CSV file and loads it into memory (pandas DataFrame).
data = csv_repo.load_dataset()

# Show the first 5 rows
display(data.head()) # Displays the first 5 rows as a sanity check.
print(f"le dataset chargé a {data.shape[0]} lignes et {data.shape[1]} colonnes")

[32m2025-12-25 07:13:06.314[0m | [1mINFO    [0m | [36mhealth_lifestyle_diabetes.infrastructure.logger.loguru_logger[0m:[36minfo[0m:[36m17[0m - [1mChargement du dataset depuis : /Users/surelmanda/Downloads/ml-projects/Clean-Architecture-MLops/health_lifestyle_diabetes/data/raw/diabetes_health_indicators.csv[0m
[32m2025-12-25 07:13:06.474[0m | [1mINFO    [0m | [36mhealth_lifestyle_diabetes.infrastructure.logger.loguru_logger[0m:[36minfo[0m:[36m17[0m - [1mDataset chargé avec succès (97297 lignes, 31 colonnes).[0m


Unnamed: 0,Age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diabetes_stage,diagnosed_diabetes
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,...,41,160,145,136,236,6.36,8.18,29.6,Type 2,1
1,52,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,...,55,50,30,93,150,2.0,5.63,23.0,No Diabetes,0
2,60,Male,Hispanic,Highschool,Middle,Unemployed,Never,1,57,6.4,...,66,99,36,118,195,5.07,7.51,44.7,Type 2,1
3,74,Female,Black,Highschool,Low,Retired,Never,0,49,3.4,...,50,79,140,139,253,5.28,9.03,38.2,Type 2,1
4,46,Male,White,Graduate,Middle,Retired,Never,1,109,7.2,...,52,125,160,137,184,12.74,7.2,23.5,Type 2,1


le dataset chargé a 97297 lignes et 31 colonnes


In [3]:
from health_lifestyle_diabetes.infrastructure.utils.config_loader import YamlConfigLoader
from health_lifestyle_diabetes.infrastructure.utils.paths import get_repository_root
from health_lifestyle_diabetes.infrastructure.logger.loguru_logger import LoguruLogger
from health_lifestyle_diabetes.infrastructure.splitters.sklearn_splitter import SklearnDatasetSplitter

# Base location used to find the YAML configuration files.
root = get_repository_root()

# Read the splitter settings and echo them so the reader can see what was used.
config_loader = YamlConfigLoader()
cfg = config_loader.load_config(f"{root}/configs/splitter.yaml")
print(cfg)

logger = LoguruLogger()

# Pick the three settings the splitter needs out of the "splitter" section once,
# instead of indexing into `cfg` separately for every keyword argument.
splitter_cfg = cfg["splitter"]
splitter_kwargs = {
    key: splitter_cfg[key]
    for key in ("train_size", "target_column", "random_state")
}
splitter = SklearnDatasetSplitter(**splitter_kwargs, logger=logger)

# Split the dataset loaded in the previous cell into train and test frames.
train_df, test_df = splitter.split(data)

[32m2025-12-25 07:13:07.424[0m | [34m[1mDEBUG   [0m | [36mhealth_lifestyle_diabetes.infrastructure.logger.loguru_logger[0m:[36mdebug[0m:[36m29[0m - [34m[1mSklearnDatasetSplitter initialized[0m
[32m2025-12-25 07:13:07.425[0m | [1mINFO    [0m | [36mhealth_lifestyle_diabetes.infrastructure.logger.loguru_logger[0m:[36minfo[0m:[36m17[0m - [1mStarting dataset split[0m
[32m2025-12-25 07:13:07.457[0m | [1mINFO    [0m | [36mhealth_lifestyle_diabetes.infrastructure.logger.loguru_logger[0m:[36minfo[0m:[36m17[0m - [1mSplit done | train=(77837, 31) | test=(19460, 31)[0m


{'splitter': {'train_size': 0.8, 'target_column': 'diagnosed_diabetes', 'random_state': 42}}


In [4]:
from pathlib import Path
from health_lifestyle_diabetes.infrastructure.logger.loguru_logger import LoguruLogger
from health_lifestyle_diabetes.infrastructure.data_sources.csv_dataset_repository import CSVDatasetRepository 

# Create the logger BEFORE the repository: the repository must be wired to this
# cell's logger rather than to a `logger` variable left over from an earlier cell.
logger = LoguruLogger()
csv_repo = CSVDatasetRepository(logger=logger)

# Resolve the repository root once; the configured dataset paths are joined onto it.
repo_root = get_repository_root()
input_paths = paths["data"]["input"]

# Destination file for each split, keyed by split name.
output_paths = {
    "train": repo_root / Path(input_paths["train_dataset"]),
    "test": repo_root / Path(input_paths["test_dataset"]),
}

# Frames produced by the split cell, keyed the same way as `output_paths`.
datasets = {
    "train": train_df,
    "test": test_df,
}

# Persist each split, logging its shape beforehand and its destination afterwards.
for split_name, dataset in datasets.items():
    output_path = output_paths[split_name]

    logger.info(
        f"Saving {split_name} dataset | "
        f"rows={dataset.shape[0]} | "
        f"columns={dataset.shape[1]}"
    )

    csv_repo.save_dataset(dataset, output_path)

    logger.info(
        f"{split_name.capitalize()} dataset successfully saved at '{output_path}'"
    )

[32m2025-12-25 07:13:07.467[0m | [1mINFO    [0m | [36mhealth_lifestyle_diabetes.infrastructure.logger.loguru_logger[0m:[36minfo[0m:[36m17[0m - [1mSaving train dataset | rows=77837 | columns=31[0m
[32m2025-12-25 07:13:07.468[0m | [1mINFO    [0m | [36mhealth_lifestyle_diabetes.infrastructure.logger.loguru_logger[0m:[36minfo[0m:[36m17[0m - [1mSauvegarde du dataset dans : /Users/surelmanda/Downloads/ml-projects/Clean-Architecture-MLops/health_lifestyle_diabetes/data/input/train.csv[0m
[32m2025-12-25 07:13:07.887[0m | [1mINFO    [0m | [36mhealth_lifestyle_diabetes.infrastructure.logger.loguru_logger[0m:[36minfo[0m:[36m17[0m - [1mDataset sauvegardé avec succès : /Users/surelmanda/Downloads/ml-projects/Clean-Architecture-MLops/health_lifestyle_diabetes/data/input/train.csv[0m
[32m2025-12-25 07:13:07.887[0m | [1mINFO    [0m | [36mhealth_lifestyle_diabetes.infrastructure.logger.loguru_logger[0m:[36minfo[0m:[36m17[0m - [1mTrain dataset successfully 