In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# importation de la base depuis UCI 
from ucimlrepo import fetch_ucirepo 

diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296) 

X = diabetes_130_us_hospitals_for_years_1999_2008.data.features 
Y = diabetes_130_us_hospitals_for_years_1999_2008.data.targets 

# variable information 
print(diabetes_130_us_hospitals_for_years_1999_2008.variables) 


  df = pd.read_csv(data_url)


                        name     role         type demographic  \
0               encounter_id       ID                     None   
1                patient_nbr       ID                     None   
2                       race  Feature  Categorical        Race   
3                     gender  Feature  Categorical      Gender   
4                        age  Feature  Categorical         Age   
5                     weight  Feature  Categorical        None   
6          admission_type_id  Feature  Categorical        None   
7   discharge_disposition_id  Feature  Categorical        None   
8        admission_source_id  Feature  Categorical        None   
9           time_in_hospital  Feature      Integer        None   
10                payer_code  Feature  Categorical        None   
11         medical_specialty  Feature  Categorical        None   
12        num_lab_procedures  Feature      Integer        None   
13            num_procedures  Feature      Integer        None   
14        

In [3]:
# Fusion X + Y
df = pd.concat([X, Y], axis=1)
print("Fusion terminée : ", df.shape)

Fusion terminée :  (101766, 48)


In [13]:
# Affichage des premières lignes
df.head()

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),,6,25,1,1,,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),,1,1,7,3,,,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),,1,1,7,2,,,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),,1,1,7,2,,,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),,1,1,7,1,,,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [14]:
num_cols  = df.select_dtypes(include=["number"]).columns.tolist()
cat_cols  = df.select_dtypes(include=["object"]).columns.tolist()

print(num_cols)
print(cat_cols)          

['admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']
['race', 'gender', 'age', 'weight', 'payer_code', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted']


In [15]:
# A1Cresult / max_glu_serum sont catégorielles cliniques ; on garde en cat
for c in ["A1Cresult", "max_glu_serum"]:
    if c in df.columns and c not in cat_cols:
        cat_cols.append(c)
        if c in num_cols:
            num_cols.remove(c)

In [16]:
# Colonnes à supprimer (avec au moins 40% de valeurs manquantes)
cols_to_remove = ["payer_code", "medical_specialty", "max_glu_serum", "A1Cresult", "weight"]

X = df.drop(columns=cols_to_remove)

# Vérification
print(f"Colonnes supprimées : {cols_to_remove}")

Colonnes supprimées : ['payer_code', 'medical_specialty', 'max_glu_serum', 'A1Cresult', 'weight']


In [17]:
# Remplacer les mentions textuelles par de vrais NaN
X = X.replace(["missing value", "Missing value", "Missing Value"], np.nan)

initial_rows = X.shape[0]

# Supprimer toutes les lignes contenant au moins un NaN
X = X.dropna()

# Nombre de lignes après suppression
final_rows = X.shape[0]

# Pourcentage de lignes supprimées
pct_removed = 100 * (initial_rows - final_rows) / initial_rows

# Vérification
print(f"Lignes restantes après suppression : {final_rows}")
print(f"Pourcentage de lignes supprimées : {pct_removed:.2f}%")

Lignes restantes après suppression : 98053
Pourcentage de lignes supprimées : 3.65%


In [18]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# 1) Encodage ordinal des colonnes de médicaments

drug_cols = [
    "metformin", "repaglinide", "nateglinide", "chlorpropamide",
    "glimepiride", "acetohexamide", "glipizide", "glyburide", "tolbutamide",
    "pioglitazone", "rosiglitazone", "acarbose", "miglitol", "troglitazone",
    "tolazamide", "examide", "citoglipton", "insulin", "glyburide-metformin",
    "glipizide-metformin", "glimepiride-pioglitazone", "metformin-rosiglitazone",
    "metformin-pioglitazone", "change", "diabetesMed"
]

# Classification ordinale des médicaments
drug_map = {"No": 0, "Steady": 1, "Up": 2, "Down": -1}


for col in drug_cols:
    if col in X.columns:
        X[col] = X[col].map(drug_map).fillna(0).astype(int)

print("Médicaments encodés (mapping ordinal appliqué).")

Médicaments encodés (mapping ordinal appliqué).


In [19]:
# 2) Définir les groupes de variables

num_cols = [
    "admission_type_id","discharge_disposition_id","admission_source_id",
    "time_in_hospital","num_lab_procedures","num_procedures",
    "num_medications","number_outpatient","number_emergency",
    "number_inpatient","number_diagnoses"
]
num_cols = [c for c in num_cols if c in X.columns]  # sécurité

ohe_cols = ["race","gender","age"]
ohe_cols = [c for c in ohe_cols if c in X.columns]  # sécurité

# Les colonnes restantes (autres features déjà numériques)
other_cols = [c for c in X.columns if c not in num_cols + ohe_cols]

print(f"Numériques standardisées : {num_cols}")
print(f"Catégorielles à One-Hot : {ohe_cols}")
print(f"Déjà numériques (médicaments encodés) : {len(other_cols)} colonnes")

Numériques standardisées : ['admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']
Catégorielles à One-Hot : ['race', 'gender', 'age']
Déjà numériques (médicaments encodés) : 29 colonnes


In [20]:
# 3) Pipelines

numeric_tf = Pipeline(steps=[
    ("scale", StandardScaler())
])

categorical_tf = Pipeline(steps=[
    ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False))
])


# ColumnTransformer
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_tf, num_cols),
        ("ohe", categorical_tf, ohe_cols),
        ("pass", "passthrough", other_cols)  # on garde les autres telles quelles
    ]
)

In [21]:

# 5) Fit-transform & reconstruction DataFrame

X_mat = preprocess.fit_transform(X)

# Récupération des noms des colonnes encodées
ohe = preprocess.named_transformers_["ohe"].named_steps["onehot"]
ohe_feature_names = ohe.get_feature_names_out(ohe_cols)
feature_names = list(num_cols) + list(ohe_feature_names) + other_cols

X_final = pd.DataFrame(X_mat, columns=feature_names, index=X.index)

print("Shape finale :", X_final.shape)
X_final.head()

Shape finale : (98053, 55)


Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
1,-0.707403,-0.518587,0.300448,-0.475091,0.804179,-0.790607,0.231901,-0.293277,-0.214725,-0.508936,...,0,2,0,0,0,0,0,0,0,>30
2,-0.707403,-0.518587,0.300448,-0.809197,-1.630894,2.135942,-0.384741,1.265141,-0.214725,0.277838,...,0,0,0,0,0,0,0,0,0,NO
3,-0.707403,-0.518587,0.300448,-0.809197,0.043219,-0.205297,-0.014756,-0.293277,-0.214725,-0.508936,...,0,2,0,0,0,0,0,0,0,NO
4,-0.707403,-0.518587,0.300448,-1.143304,0.398334,-0.790607,-1.001383,-0.293277,-0.214725,-0.508936,...,0,1,0,0,0,0,0,0,0,NO
5,-0.0178,-0.518587,-0.927565,-0.475091,-0.61628,2.721252,-0.014756,-0.293277,-0.214725,-0.508936,...,0,1,0,0,0,0,0,0,0,>30


In [22]:
# Liste simple
print(X_final.columns.tolist())

['admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'race_Asian', 'race_Caucasian', 'race_Hispanic', 'race_Other', 'gender_Male', 'gender_Unknown/Invalid', 'age_[10-20)', 'age_[20-30)', 'age_[30-40)', 'age_[40-50)', 'age_[50-60)', 'age_[60-70)', 'age_[70-80)', 'age_[80-90)', 'age_[90-100)', 'diag_1', 'diag_2', 'diag_3', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted']


In [23]:
# Définir la cible
y = X_final["readmitted"].copy()

# Transformer en binaire : 1 si réadmission <30 jours, 0 sinon
y = y.apply(lambda x: 1 if x == "<30" else 0)

# Supprimer la colonne cible du dataset
X_final= X_final.drop(columns=["readmitted"])

print(f"X shape: {X_final.shape}")
print(f"Y shape: {y.shape}")
print(f"Valeurs uniques de y : {y.unique()}")


X shape: (98053, 54)
Y shape: (98053,)
Valeurs uniques de y : [0 1]


In [24]:
# Crée le dossier data
os.makedirs("data", exist_ok=True)

# Sauvegarde X_final et y
X_final.to_csv("data/X_final.csv", index=False)
y.to_csv("data/y.csv", index=False)

