# **Préparation des données**

In [1]:
import pandas as pd

In [2]:
#Chargement de données
df = pd.read_csv("balanced_data.csv")
print(df.shape)
df.head()

(1629, 50)


Unnamed: 0.1,Unnamed: 0,patient_id,month_visit,day_visit,year_visit,is_arrival_ambulance,heart_rate_signal,ambulance_time,patient_age,patient_sexe,...,medicament2,medicament3,medicament4,is_med1_given_emergency,is_med2_given_emergency,is_med3_given_emergency,is_med4_given_emergency,number_medicaments_given_emergency,number_medicaments_given_discharge,polarity_health_status
0,0,2.0,April,Monday,2020.0,Yes,106.0,28.0,64.0,Male,...,VANTIN,NO ENTRY MADE,NO ENTRY MADE,RX at discharge,Both given and RX marked,"Not applicable, no medication listed","Not applicable, no medication listed",1.0,2.0,2
1,1,10.0,April,Thursday,2020.0,Yes,72.0,24.0,23.0,Male,...,NO ENTRY MADE,NO ENTRY MADE,NO ENTRY MADE,"Not applicable, no medication listed","Not applicable, no medication listed","Not applicable, no medication listed","Not applicable, no medication listed",None listed/listed drug Rx at discharge only o...,None listed/listed drug given at ED only or un...,2
2,2,12.0,April,Thursday,2020.0,Yes,101.0,38.0,50.0,Male,...,VERSED,NO ENTRY MADE,NO ENTRY MADE,Given in ED,Given in ED,"Not applicable, no medication listed","Not applicable, no medication listed",2.0,None listed/listed drug given at ED only or un...,2
3,3,16.0,April,Saturday,2020.0,Yes,110.0,74.0,61.0,Male,...,NO ENTRY MADE,NO ENTRY MADE,NO ENTRY MADE,Given in ED,"Not applicable, no medication listed","Not applicable, no medication listed","Not applicable, no medication listed",1.0,None listed/listed drug given at ED only or un...,2
4,4,20.0,April,Sunday,2020.0,Yes,112.0,47.0,32.0,Male,...,OXYGEN,SODIUM CHLORIDE,TYLENOL,Given in ED,Given in ED,Given in ED,Both given and RX marked,4.0,1.0,2


In [3]:
#Supprimer les colonnes dangereuses Ces colonnes donnent déjà la réponse au modèle → interdites.
leak_cols = [
    "polarity_health_status",
    "is_med1_given_emergency",
    "is_med2_given_emergency",
    "is_med3_given_emergency",
    "is_med4_given_emergency",
    "number_medicaments_given_emergency"
]

df = df.drop(columns=[c for c in leak_cols if c in df.columns])


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,patient_id,month_visit,day_visit,year_visit,is_arrival_ambulance,heart_rate_signal,ambulance_time,patient_age,patient_sexe,...,is_patient_suffer_high_cholesterol,is_patient_suffer_hyper_tension,is_patient_suffer_obesity,is_patient_suffer_apnea,is_patient_suffer_osteoporosis,medicament1,medicament2,medicament3,medicament4,number_medicaments_given_discharge
0,0,2.0,April,Monday,2020.0,Yes,106.0,28.0,64.0,Male,...,No,Yes,No,No,No,FLOMAX,VANTIN,NO ENTRY MADE,NO ENTRY MADE,2.0
1,1,10.0,April,Thursday,2020.0,Yes,72.0,24.0,23.0,Male,...,No,Yes,No,No,No,NO ENTRY MADE,NO ENTRY MADE,NO ENTRY MADE,NO ENTRY MADE,None listed/listed drug given at ED only or un...
2,2,12.0,April,Thursday,2020.0,Yes,101.0,38.0,50.0,Male,...,No,No,No,No,No,OXYGEN,VERSED,NO ENTRY MADE,NO ENTRY MADE,None listed/listed drug given at ED only or un...
3,3,16.0,April,Saturday,2020.0,Yes,110.0,74.0,61.0,Male,...,No,Yes,No,No,No,ASPIRIN,NO ENTRY MADE,NO ENTRY MADE,NO ENTRY MADE,None listed/listed drug given at ED only or un...
4,4,20.0,April,Sunday,2020.0,Yes,112.0,47.0,32.0,Male,...,Yes,Yes,Yes,No,No,KEPPRA,OXYGEN,SODIUM CHLORIDE,TYLENOL,1.0


In [5]:
#Transformer le statut patient en chiffres
df["patient_health_status"] = df["patient_health_status"].map({
    "Emergent": 0,
    "Semi-urgent": 1,
    "Urgent": 2
})


In [6]:
#Supprimer les lignes vide
df = df.dropna(subset=["patient_health_status"])

In [7]:
#Vérifier le nombre de patients par classe
print(df["patient_health_status"].value_counts())

patient_health_status
2    543
0    543
1    543
Name: count, dtype: int64


In [8]:
#Séparer X (entrées) et y (sortie)
X = df.drop(columns=["patient_health_status"])
y = df["patient_health_status"]

In [9]:
#Séparer colonnes numériques et texte
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]

print("Colonnes texte :", cat_cols)
print("Colonnes numériques :", num_cols)


Colonnes texte : ['month_visit', 'day_visit', 'is_arrival_ambulance', 'ambulance_time', 'patient_sexe', 'patient_race', 'patient_pain_category', 'is_patient_seen_before_72h', 'patient_alchol_level', 'is_patient_suffer_alzheimar', 'is_patient_suffer_cancer', 'is_patient_suffer_cerebrovascular', 'is_patient_suffer_chronic_kidney', 'is_patient_suffer_chronic_obstructive_pulmonary', 'is_patient_suffer_congestive_heart_failure', 'is_patient_suffer_coronary_artery', 'is_patient_suffer_depression', 'is_patient_suffer_diabet_L1', 'is_patient_suffer_diabet_L2', 'is_patient_suffer_diabet_L0', 'is_patient_suffer_renal_insufficiency', 'is_patient_suffer_pulmonary_embolism', 'is_patient_suffer_HIV_infection', 'is_patient_suffer_high_cholesterol', 'is_patient_suffer_hyper_tension', 'is_patient_suffer_obesity', 'is_patient_suffer_apnea', 'is_patient_suffer_osteoporosis', 'medicament1', 'medicament2', 'medicament3', 'medicament4', 'number_medicaments_given_discharge']
Colonnes numériques : ['Unnamed: 

In [10]:
#Créer le prétraitement automatique
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), num_cols),

        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]), cat_cols)
    ]
)

In [11]:
#Séparer les données en entraînement et test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


# **Entraîner le modèle RandomForest**

In [None]:
# Construction de modèle + pipeline (prétraitement + modèle)
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
    class_weight="balanced",
    n_jobs=-1
)

rf_pipe = Pipeline(steps=[
    ("preprocess", preprocess),     # imputation + scaling + onehot
    ("smote", SMOTE(random_state=42)),  # équilibre des classes
    ("model", rf_model)
])


In [13]:
# Entraîner le modèle
rf_pipe.fit(X_train, y_train)


In [14]:
# Prédire sur le test
y_pred = rf_pipe.predict(X_test)


In [15]:
# Évaluer (Accuracy + F1 + matrice de confusion)
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1-macro:", f1_score(y_test, y_pred, average="macro"))

print("\nClassification report:\n", classification_report(y_test, y_pred))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.7423312883435583
F1-macro: 0.7466023255496941

Classification report:
               precision    recall  f1-score   support

           0       0.70      0.64      0.67       108
           1       0.97      0.79      0.87       109
           2       0.63      0.80      0.70       109

    accuracy                           0.74       326
   macro avg       0.76      0.74      0.75       326
weighted avg       0.76      0.74      0.75       326


Confusion matrix:
 [[69  2 37]
 [ 9 86 14]
 [21  1 87]]


In [16]:
# Sauvegarder le pipeline entraîné
import joblib

joblib.dump(rf_pipe, "rf_triage_pipeline.joblib")



['rf_triage_pipeline.joblib']