In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Using cached ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Collecting pandas>=1.0.0 (from ucimlrepo)
  Using cached pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl.metadata (91 kB)
Collecting numpy>=1.26.0 (from pandas>=1.0.0->ucimlrepo)
  Using cached numpy-2.3.4-cp313-cp313-macosx_14_0_x86_64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas>=1.0.0->ucimlrepo)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas>=1.0.0->ucimlrepo)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Using cached pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl (11.5 MB)
Using cached numpy-2.3.4-cp313-cp313-macosx_14_0_x86_64.whl (6.6 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, numpy, pandas, ucimlrepo
[2K   [90m━━━━━━━━━━━━━━━━━

In [None]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296) 
  
# data (as pandas dataframes) 
X = diabetes_130_us_hospitals_for_years_1999_2008.data.features 
y = diabetes_130_us_hospitals_for_years_1999_2008.data.targets 
  
# metadata 
print(diabetes_130_us_hospitals_for_years_1999_2008.metadata) 
  
# variable information 
print(diabetes_130_us_hospitals_for_years_1999_2008.variables) 


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# 1) Drop colonne quasi vide
if "weight" in X.columns:
    X = X.drop(columns=["weight"])

# 2) Nettoyages légers
# Harmoniser certains codes "unknown"/"invalid" en NaN
for col in X.select_dtypes(include="object").columns:
    X[col] = X[col].replace(["Unknown/Invalid", "UNKNOWN", "unknown"], np.nan)

In [None]:
# 3) Réduction de cardinalité (option conseillée)
def trim_icd(df):
    for c in ["diag_1", "diag_2", "diag_3"]:
        if c in df.columns:
            df[c] = df[c].astype(str).str.slice(0, 3)   # 3 premiers caractères
    return df

X = trim_icd(X)

    
    
    

In [None]:
# 4) Détection des types
num_cols  = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols  = X.select_dtypes(include=["object"]).columns.tolist()

# A1Cresult / max_glu_serum sont catégorielles cliniques ; on garde en cat
for c in ["A1Cresult", "max_glu_serum"]:
    if c in X.columns and c not in cat_cols:
        cat_cols.append(c)
        if c in num_cols:
            num_cols.remove(c)

In [None]:
# 5) Transformers
numeric_tf = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler())
])

categorical_tf = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),  # ou fill_value="Missing"
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_tf, num_cols),
        ("cat", categorical_tf, cat_cols)
    ],
    remainder="drop"
)

In [None]:
# 6) Fit-transform X (sans modèle pour l’instant)
X_proc = preprocess.fit_transform(X)

print("X shape raw:", X.shape)
print("X_proc shape:", X_proc.shape)

# 7) Récupérer les noms de features encodées (utile pour le rapport)
oh = preprocess.named_transformers_["cat"].named_steps["onehot"]
cat_feature_names = oh.get_feature_names_out(cat_cols)
feature_names = list(num_cols) + list(cat_feature_names)

X shape raw: (101766, 46)
X_proc shape: (101766, 2252)
