In [1]:
%pip install pandas scikit-learn fastapi uvicorn joblib


Collecting fastapi
  Downloading fastapi-0.120.0-py3-none-any.whl.metadata (28 kB)
Collecting uvicorn
  Downloading uvicorn-0.38.0-py3-none-any.whl.metadata (6.8 kB)
Collecting starlette<0.49.0,>=0.40.0 (from fastapi)
  Using cached starlette-0.48.0-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4 (from fastapi)
  Downloading pydantic-2.12.3-py3-none-any.whl.metadata (87 kB)
     ---------------------------------------- 0.0/87.7 kB ? eta -:--:--
     ---- ----------------------------------- 10.2/87.7 kB ? eta -:--:--
     -------------------------- ----------- 61.4/87.7 kB 656.4 kB/s eta 0:00:01
     -------------------------------------- 87.7/87.7 kB 823.7 kB/s eta 0:00:00
Collecting annotated-doc>=0.0.2 (from fastapi)
  Downloading annotated_doc-0.0.2-py3-none-any.whl.metadata (5.4 kB)
Collecting click>=7.0 (from uvicorn)
  Using cached click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting h11>=0.8 (from uvicorn)
  Using c


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


### Étape 1 : Préparation du dataset

In [4]:
import pandas as pd

# Charger le dataset fusionné
df = pd.read_csv("data/medical_diagnostic_dataset.csv")

# Vérifier les colonnes et valeurs manquantes
print(df.head())
print("les valeurs manquantes:", df.isnull().sum())


    age    bp  glucose   bmi disease_type  cholesterol  creatinine  \
0  50.0  72.0    148.0  33.6     diabetes          NaN         NaN   
1  31.0  66.0     85.0  26.6     diabetes          NaN         NaN   
2  32.0  64.0    183.0  23.3     diabetes          NaN         NaN   
3  21.0  66.0     89.0  28.1     diabetes          NaN         NaN   
4  33.0  40.0    137.0  43.1     diabetes          NaN         NaN   

   hemoglobin  hypertension diabetes_history  anemia  heart_rate  
0         NaN           NaN              NaN     NaN         NaN  
1         NaN           NaN              NaN     NaN         NaN  
2         NaN           NaN              NaN     NaN         NaN  
3         NaN           NaN              NaN     NaN         NaN  
4         NaN           NaN              NaN     NaN         NaN  
les valeurs manquantes: age                    9
bp                  1019
glucose             1051
bmi                 1425
disease_type           0
cholesterol         2175
cre

In [6]:
#  Gestion des valeurs manquantes
# Colonnes numériques
num_cols = ['age', 'bp', 'glucose', 'bmi', 'cholesterol', 'creatinine', 'hemoglobin', 'heart_rate']
for col in num_cols:
    df[col] = df[col].fillna(df[col].mean())

In [12]:
bin_cols = ['hypertension', 'diabetes_history', 'anemia']

for col in bin_cols:
    # Tout en float
    df[col] = pd.to_numeric(df[col], errors='coerce')  # 'yes', 'no', '0.0', '1.0' → float
    # Remplacer les NaN par 0
    df[col] = df[col].fillna(0)
    # Convertir en int
    df[col] = df[col].astype(int)


In [13]:
for col in bin_cols:
    print(col, df[col].unique())


hypertension [0 1]
diabetes_history [0 1]
anemia [0 1]


In [14]:
df.isnull().sum()

age                 0
bp                  0
glucose             0
bmi                 0
disease_type        0
cholesterol         0
creatinine          0
hemoglobin          0
hypertension        0
diabetes_history    0
anemia              0
heart_rate          0
dtype: int64

### Étape 2 : Encodage des variables catégoriques

In [15]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['target'] = le.fit_transform(df['disease_type'])
# Maintenant target est un entier 0,1,2,3


In [19]:
df['target'].unique()

array([0, 1, 3, 2])

### Étape 3 : Sélection des features et target

In [20]:
#  Sélection des features et target
features = ['age', 'bp', 'glucose', 'bmi', 'cholesterol', 'creatinine',
            'hemoglobin', 'heart_rate', 'hypertension', 'diabetes_history', 'anemia']
X = df[features]
y = df['target']


In [23]:
X.head(20)

Unnamed: 0,age,bp,glucose,bmi,cholesterol,creatinine,hemoglobin,heart_rate,hypertension,diabetes_history,anemia
0,50.0,72.0,148.0,33.6,246.0,3.072454,12.526437,76.056604,0,0,0
1,31.0,66.0,85.0,26.6,246.0,3.072454,12.526437,76.056604,0,0,0
2,32.0,64.0,183.0,23.3,246.0,3.072454,12.526437,76.056604,0,0,0
3,21.0,66.0,89.0,28.1,246.0,3.072454,12.526437,76.056604,0,0,0
4,33.0,40.0,137.0,43.1,246.0,3.072454,12.526437,76.056604,0,0,0
5,30.0,74.0,116.0,25.6,246.0,3.072454,12.526437,76.056604,0,0,0
6,26.0,50.0,78.0,31.0,246.0,3.072454,12.526437,76.056604,0,0,0
7,29.0,0.0,115.0,35.3,246.0,3.072454,12.526437,76.056604,0,0,0
8,53.0,70.0,197.0,30.5,246.0,3.072454,12.526437,76.056604,0,0,0
9,54.0,96.0,125.0,0.0,246.0,3.072454,12.526437,76.056604,0,0,0


In [25]:
X.tail(20)

Unnamed: 0,age,bp,glucose,bmi,cholesterol,creatinine,hemoglobin,heart_rate,hypertension,diabetes_history,anemia
3180,66.0,99.79138,76.271754,31.42,246.0,3.072454,12.526437,90.0,1,0,0
3181,66.0,99.79138,76.271754,20.49,246.0,3.072454,12.526437,90.0,1,0,0
3182,66.0,99.79138,76.271754,29.84,246.0,3.072454,12.526437,95.0,1,0,0
3183,66.0,99.79138,76.271754,23.63,246.0,3.072454,12.526437,80.0,1,0,0
3184,66.0,99.79138,76.271754,25.27,246.0,3.072454,12.526437,80.0,1,0,0
3185,67.0,99.79138,76.271754,25.95,246.0,3.072454,12.526437,65.0,0,0,0
3186,67.0,99.79138,76.271754,25.81,246.0,3.072454,12.526437,70.0,0,0,0
3187,67.0,99.79138,76.271754,15.54,246.0,3.072454,12.526437,96.0,0,0,0
3188,67.0,99.79138,76.271754,22.11,246.0,3.072454,12.526437,69.0,0,0,0
3189,67.0,99.79138,76.271754,24.53,246.0,3.072454,12.526437,65.0,0,0,0


### Étape 4 : Séparation train/test

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2560, 11) (640, 11) (2560,) (640,)


### Étape 5 : Entraînement d’un modèle de classification

In [28]:
pip install pandas scikit-learn xgboost lightgbm catboost joblib


Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting lightgbm
  Using cached lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-win_amd64.whl.metadata (1.5 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Collecting matplotlib (from catboost)
  Downloading matplotlib-3.10.7-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting plotly (from catboost)
  Downloading plotly-6.3.1-py3-none-any.whl.metadata (8.5 kB)
Collecting contourpy>=1.0.1 (from matplotlib->catboost)
  Downloading contourpy-1.3.3-cp311-cp311-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib->catboost)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib->catboost)
  Downloading fonttools-4.60.1-cp311-cp311-win_amd64.whl.metadata (114 kB)
     ---------------------------------------


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [29]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

models = {
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0)
}


In [30]:
from sklearn.metrics import accuracy_score, classification_report

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"\n=== {name} ===")
    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred))
    results[name] = (model, acc)

# Trouver le meilleur modèle
best_model_name = max(results, key=lambda x: results[x][1])
best_model = results[best_model_name][0]
print(f"\n Meilleur modèle : {best_model_name} avec accuracy {results[best_model_name][1]}")



=== RandomForest ===
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       183
           1       1.00      1.00      1.00       198
           2       1.00      1.00      1.00       173
           3       1.00      1.00      1.00        86

    accuracy                           1.00       640
   macro avg       1.00      1.00      1.00       640
weighted avg       1.00      1.00      1.00       640


=== GradientBoosting ===
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       183
           1       1.00      1.00      1.00       198
           2       1.00      1.00      1.00       173
           3       1.00      1.00      1.00        86

    accuracy                           1.00       640
   macro avg       1.00      1.00      1.00       640
weighted avg       1.00      1.00      1.00       640



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== XGBoost ===
Accuracy: 0.9984375
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       183
           1       1.00      1.00      1.00       198
           2       1.00      1.00      1.00       173
           3       1.00      0.99      0.99        86

    accuracy                           1.00       640
   macro avg       1.00      1.00      1.00       640
weighted avg       1.00      1.00      1.00       640

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000973 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 835
[LightGBM] [Info] Number of data points in the train set: 2560, number of used features: 11
[LightGBM] [Info] Start training from score -1.476151
[LightGBM] [Info] Start training from score -1.129958
[LightGBM] [Info] Start training from score -1.121529
[LightGBM] [Inf

In [32]:
import joblib

joblib.dump(best_model, "models/best_medical_diagnosis_model.pkl")
joblib.dump(le, "models/label_encoder.pkl")
print(" Meilleur modèle et label encoder sauvegardés !")


 Meilleur modèle et label encoder sauvegardés !
