In [1]:
from health_lifestyle_diabetes.infrastructure.utils.paths import get_repository_root
from health_lifestyle_diabetes.infrastructure.utils.config_loader import YamlConfigLoader
root = get_repository_root()

# Load configuration
config_loader = YamlConfigLoader()
paths = config_loader.load_config(f"{root}/configs/paths.yaml")

# 
print(paths)

{'data': {'root': 'data/', 'input': {'raw_dataset': 'data/raw/diabetes_health_indicators.csv', 'train_dataset': 'data/input/train.csv', 'test_dataset': 'data/input/test.csv'}, 'processed': {'cleaned_dataset': 'data/processed/cleaned_diabetes.csv', 'features_dataset': 'data/processed/features_diabetes.csv'}, 'output': {'model': 'data/output/model.pkl', 'pipeline': 'data/output/pipeline.pkl'}}, 'reports': {'eda_report': 'reports/eda_reports', 'metrics_report': 'reports/metrics', 'curves_reports': 'reports/figures', 'cm_reports': 'reports/cm'}, 'logs': {'folder': 'logs', 'main_log': 'logs/app.log'}}


In [2]:
from health_lifestyle_diabetes.infrastructure.data_sources.csv_dataset_repository import CSVDatasetRepository 
from health_lifestyle_diabetes.infrastructure.logger.loguru_logger import LoguruLogger

logger = LoguruLogger()

# Load Dataset fron CSV file
csv_repo = CSVDatasetRepository(logger=logger,) 

# Lit le fichier CSV et le charge en mémoire (DataFrame Pandas).
data = csv_repo.load_dataset()

# Show the 5 first rows
display(data.head()) # Affiche les 5 premières lignes pour validation.
print(f"le dataset chargé a {data.shape[0]} lignes et {data.shape[1]} colonnes")

[32m2026-01-08 02:45:55.534[0m | [1mINFO    [0m | [36mhealth_lifestyle_diabetes.infrastructure.data_sources.csv_dataset_repository[0m:[36mload_dataset[0m:[36m54[0m - [1mChargement du dataset depuis : /Users/surelmanda/Downloads/ml-projects/Clean-Architecture-MLops/health_lifestyle_diabetes/data/raw/diabetes_health_indicators.csv[0m
[32m2026-01-08 02:45:55.683[0m | [1mINFO    [0m | [36mhealth_lifestyle_diabetes.infrastructure.data_sources.csv_dataset_repository[0m:[36mload_dataset[0m:[36m61[0m - [1mDataset chargé avec succès (97297 lignes, 31 colonnes).[0m


Unnamed: 0,Age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diabetes_stage,diagnosed_diabetes
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,...,41,160,145,136,236,6.36,8.18,29.6,Type 2,1
1,52,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,...,55,50,30,93,150,2.0,5.63,23.0,No Diabetes,0
2,60,Male,Hispanic,Highschool,Middle,Unemployed,Never,1,57,6.4,...,66,99,36,118,195,5.07,7.51,44.7,Type 2,1
3,74,Female,Black,Highschool,Low,Retired,Never,0,49,3.4,...,50,79,140,139,253,5.28,9.03,38.2,Type 2,1
4,46,Male,White,Graduate,Middle,Retired,Never,1,109,7.2,...,52,125,160,137,184,12.74,7.2,23.5,Type 2,1


le dataset chargé a 97297 lignes et 31 colonnes


In [3]:
from health_lifestyle_diabetes.infrastructure.utils.config_loader import YamlConfigLoader
from health_lifestyle_diabetes.infrastructure.utils.paths import get_repository_root
from health_lifestyle_diabetes.infrastructure.logger.loguru_logger import LoguruLogger
from health_lifestyle_diabetes.infrastructure.splitters.sklearn_splitter import SklearnDatasetSplitter

root = get_repository_root()

# Load configuration
config_loader = YamlConfigLoader()
cfg = config_loader.load_config(f"{root}/configs/splitter.yaml")
print(cfg)

logger = LoguruLogger()

splitter = SklearnDatasetSplitter(
    train_size=cfg["splitter"]["train_size"],
    target_column=cfg["splitter"]["target_column"],
    random_state=cfg["splitter"]["random_state"],
    logger=logger,
)
train_df, test_df = splitter.split(data)

[32m2026-01-08 02:45:56.576[0m | [34m[1mDEBUG   [0m | [36mhealth_lifestyle_diabetes.infrastructure.splitters.sklearn_splitter[0m:[36m__init__[0m:[36m37[0m - [34m[1mSklearnDatasetSplitter initialized[0m
[32m2026-01-08 02:45:56.577[0m | [1mINFO    [0m | [36mhealth_lifestyle_diabetes.infrastructure.splitters.sklearn_splitter[0m:[36msplit[0m:[36m42[0m - [1mStarting dataset split[0m
[32m2026-01-08 02:45:56.605[0m | [1mINFO    [0m | [36mhealth_lifestyle_diabetes.infrastructure.splitters.sklearn_splitter[0m:[36msplit[0m:[36m57[0m - [1mSplit done | train=(82702, 31) | test=(14595, 31)[0m


{'splitter': {'train_size': 0.85, 'target_column': 'diagnosed_diabetes', 'random_state': 42}}


In [4]:
from pathlib import Path
from health_lifestyle_diabetes.infrastructure.logger.loguru_logger import LoguruLogger
from health_lifestyle_diabetes.infrastructure.data_sources.csv_dataset_repository import CSVDatasetRepository 

csv_repo = CSVDatasetRepository(logger=logger,) 

logger = LoguruLogger()

output_paths = {
    "train": get_repository_root() / Path(paths["data"]["input"]["train_dataset"]),
    "test": get_repository_root() / Path(paths["data"]["input"]["test_dataset"]),
}

datasets = {
    "train": train_df,
    "test": test_df,
}

for split_name, dataset in datasets.items():
    output_path = output_paths[split_name]

    logger.info(
        f"Saving {split_name} dataset | "
        f"rows={dataset.shape[0]} | "
        f"columns={dataset.shape[1]}"
    )

    csv_repo.save_dataset(dataset, output_path)

    logger.info(
        f"{split_name.capitalize()} dataset successfully saved at '{output_path}'"
    )

[32m2026-01-08 02:45:56.612[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m22[0m - [1mSaving train dataset | rows=82702 | columns=31[0m
[32m2026-01-08 02:45:56.612[0m | [1mINFO    [0m | [36mhealth_lifestyle_diabetes.infrastructure.data_sources.csv_dataset_repository[0m:[36msave_dataset[0m:[36m74[0m - [1mSauvegarde du dataset dans : /Users/surelmanda/Downloads/ml-projects/Clean-Architecture-MLops/health_lifestyle_diabetes/data/input/train.csv[0m
[32m2026-01-08 02:45:57.060[0m | [1mINFO    [0m | [36mhealth_lifestyle_diabetes.infrastructure.data_sources.csv_dataset_repository[0m:[36msave_dataset[0m:[36m79[0m - [1mDataset sauvegardé avec succès : /Users/surelmanda/Downloads/ml-projects/Clean-Architecture-MLops/health_lifestyle_diabetes/data/input/train.csv[0m
[32m2026-01-08 02:45:57.060[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mTrain dataset successfully saved at '/Users/surelmanda/Downloads/ml-projects/Cl

In [None]:
from pandas import DataFrame


def summarize_dataset(df: DataFrame, max_examples: int = 5) -> DataFrame:
    """
    Fournit une synthèse détaillée du dataset :
    - Nom de colonne
    - Type de données
    - Nombre et pourcentage de valeurs manquantes
    - Cardinalité (nombre de valeurs uniques)
    - Exemples représentatifs

    Args:
        df (pd.DataFrame): Le DataFrame à analyser.
        max_examples (int): Nombre maximum d'exemples à afficher par colonne.

    Returns:
        pd.DataFrame: Tableau récapitulatif des colonnes et de leurs caractéristiques.
    """
    logger = LoguruLogger()

    if df is None or df.empty:
        logger.warning("Aucun dataset fourni ou dataset vide.")
        return DataFrame(columns=["Column", "Type", "Missing", "% Missing", "Cardinality", "Examples"])

    logger.info("Résumé détaillé du dataset en cours d'analyse...")
    total_rows = len(df)
    column_details = []

    for col in df.columns:
        col_type = df[col].dtype

        # Valeurs manquantes
        missing_count = df[col].isna().sum()
        missing_pct = round((missing_count / total_rows) * 100, 2)

        # Cardinalité
        cardinality = df[col].nunique(dropna=True)

        # Exemples représentatifs
        unique_values = df[col].dropna().unique()
        if df[col].dtype == "object" or df[col].dtype.name == "category":
            examples = unique_values[:max_examples]
        else:
            examples = sorted(unique_values[:max_examples])

        column_details.append([
            col,
            col_type,
            missing_count,
            missing_pct,
            cardinality,
            examples
        ])

    # Création du DataFrame récapitulatif
    summary_df = DataFrame(
        column_details,
        columns=["Column", "Type", "Missing", "% Missing", "Cardinality", "Examples"]
    ).sort_values(by="% Missing", ascending=False)

    # Logs synthétiques
    logger.info(f"Résumé terminé : {len(summary_df)} colonnes, {total_rows} lignes.")

    return summary_df




print("Affichage des 5 premières lignes du dataset d'entraînement enrichi :")
# display(train_df_enriched.head())
display(train_df.describe())

# En supposant que 'data' est votre DataFrame
display(train_df.dtypes)
summarize_data = summarize_dataset(train_df, 3)

display(summarize_data)

Affichage des 5 premières lignes du dataset d'entraînement enrichi :


Unnamed: 0,Age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,family_history_diabetes,hypertension_history,cardiovascular_history,bmi,...,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diagnosed_diabetes
count,82702.0,82702.0,82702.0,82702.0,82702.0,82702.0,82702.0,82702.0,82702.0,82702.0,...,82702.0,82702.0,82702.0,82702.0,82702.0,82702.0,82702.0,82702.0,82702.0,82702.0
mean,50.163841,2.008718,118.877572,5.99652,6.996503,6.000937,0.220019,0.25084,0.079176,25.608891,...,185.93028,54.068414,102.939385,121.384501,111.140517,160.01844,9.07158,6.520899,30.215115,0.600034
std,15.48883,1.420623,84.286669,1.782218,1.094331,2.46972,0.414262,0.433499,0.270015,3.592292,...,32.002744,10.274665,33.36953,43.349357,13.58803,30.962219,4.971388,0.814468,9.067717,0.489894
min,19.0,0.0,0.0,0.0,3.0,0.5,0.0,0.0,0.0,15.0,...,100.0,20.0,50.0,30.0,60.0,70.0,2.0,4.0,2.7,0.0
25%,38.0,1.0,57.0,4.8,6.3,4.3,0.0,0.0,0.0,23.2,...,164.0,47.0,78.0,91.0,102.0,139.0,5.09,5.97,23.8,0.0
50%,51.0,2.0,100.0,6.0,7.0,6.0,0.0,0.0,0.0,25.6,...,186.0,54.0,102.0,121.0,111.0,160.0,8.8,6.52,28.9,1.0
75%,61.0,3.0,160.0,7.2,7.7,7.7,0.0,1.0,0.0,28.0,...,207.0,61.0,126.0,151.0,120.0,181.0,12.46,7.07,35.6,1.0
max,90.0,10.0,833.0,10.0,10.0,16.8,1.0,1.0,1.0,39.2,...,318.0,96.0,263.0,344.0,172.0,287.0,32.22,9.8,67.2,1.0


Age                                     int64
gender                                 object
ethnicity                              object
education_level                        object
income_level                           object
employment_status                      object
smoking_status                         object
alcohol_consumption_per_week            int64
physical_activity_minutes_per_week      int64
diet_score                            float64
sleep_hours_per_day                   float64
screen_time_hours_per_day             float64
family_history_diabetes                 int64
hypertension_history                    int64
cardiovascular_history                  int64
bmi                                   float64
waist_to_hip_ratio                    float64
systolic_bp                             int64
diastolic_bp                            int64
heart_rate                              int64
cholesterol_total                       int64
hdl_cholesterol                   

[32m2026-01-08 02:46:23.303[0m | [1mINFO    [0m | [36m__main__[0m:[36msummarize_dataset[0m:[36m26[0m - [1mRésumé détaillé du dataset en cours d'analyse...[0m
[32m2026-01-08 02:46:23.391[0m | [1mINFO    [0m | [36m__main__[0m:[36msummarize_dataset[0m:[36m63[0m - [1mRésumé terminé : 31 colonnes, 82702 lignes.[0m


Unnamed: 0,Column,Type,Missing,% Missing,Cardinality,Examples
0,Age,int64,0,0.0,70,"[46, 65, 67]"
16,waist_to_hip_ratio,float64,0,0.0,40,"[0.85, 0.86, 0.87]"
29,diabetes_stage,object,0,0.0,5,"[Pre-Diabetes, Type 2, No Diabetes]"
28,diabetes_risk_score,float64,0,0.0,562,"[23.3, 35.7, 43.3]"
27,hba1c,float64,0,0.0,545,"[6.01, 7.42, 7.81]"
26,insulin_level,float64,0,0.0,2329,"[5.51, 9.5, 10.56]"
25,glucose_postprandial,int64,0,0.0,210,"[146, 191, 205]"
24,glucose_fasting,int64,0,0.0,108,"[104, 124, 133]"
23,triglycerides,int64,0,0.0,261,"[81, 131, 197]"
22,ldl_cholesterol,int64,0,0.0,188,"[68, 78, 148]"
