In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler # Iris features are numeric, scaling can be good
from sklearn.datasets import load_iris

import joblib

# Notebook-wide plotting defaults: seaborn's 'whitegrid' style and a
# 10 x 6 inch default figure size for every matplotlib figure.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

In [6]:
# Warnings are not shown
import warnings
warnings.filterwarnings('ignore')


# Étape 1 : Chargement et Prétraitement des Données (Iris)

In [7]:
print("--- Chargement et Prétraitement du Dataset Iris ---")

# Load the Iris dataset bundled with scikit-learn.
iris = load_iris()
X_raw = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target, name='species')
# .tolist() converts the NumPy string scalars to plain Python str, so the
# class names print as 'setosa' instead of np.str_('setosa').
class_names = iris.target_names.tolist()
feature_names = list(iris.feature_names)

print("--- Aperçu des données initiales (Iris) ---")
print(X_raw.head())
print(f"\nClasses cibles: {class_names}")
print(f"Noms des caractéristiques: {feature_names}")

print("\n--- Informations sur le dataset ---")
X_raw.info()
print("\n--- Valeurs manquantes par colonne ---")
print(X_raw.isnull().sum())  # Expected to be zero for Iris

# --- Train/test split (done on the raw features, BEFORE scaling) ---
# Splitting first lets the scaler be fitted on training rows only; fitting it
# on the full dataset would leak test-set statistics into the training data.
# The split parameters (test_size, random_state, stratify) are unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42, stratify=y
)

# --- Preprocessing ---
# All Iris features are numeric, so standard scaling is the only step.
# The mean/variance are learned from the training split and then applied to
# every row, so the saved CSV and the train/test frames share one scaling.
scaler = StandardScaler()
scaler.fit(X_train_raw)
X_processed_array = scaler.transform(X_raw)
X_processed_df = pd.DataFrame(
    X_processed_array, columns=feature_names, index=X_raw.index
)

print("\n--- Dimensions de X après prétraitement ---")
print(X_processed_df.shape)
print("\n--- Aperçu de X après prétraitement ---")
print(X_processed_df.head())

# --- Save the preprocessed data (optional for Iris, kept for consistency) ---
# Note: the feature values are scaled with training-split statistics.
X_processed_df.to_csv('X_iris_preprocessed.csv', index=False)
y.to_csv('y_iris_preprocessed.csv', index=False, header=['species'])
print("\n--- Données prétraitées X_iris_preprocessed.csv et y_iris_preprocessed.csv enregistrées. ---")

# --- Scaled train/test feature frames ---
# Select the scaled rows that belong to each side of the split made above.
X_train = X_processed_df.loc[X_train_raw.index]
X_test = X_processed_df.loc[X_test_raw.index]

print(f"Dimensions de X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Dimensions de X_test: {X_test.shape}, y_test: {y_test.shape}")

--- Chargement et Prétraitement du Dataset Iris ---
--- Aperçu des données initiales (Iris) ---
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2

Classes cibles: [np.str_('setosa'), np.str_('versicolor'), np.str_('virginica')]
Noms des caractéristiques: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

--- Informations sur le dataset ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (

# Étape 2 : Entraînement et Évaluation du Modèle Black-Box

In [8]:
print("\n--- Entraînement du Modèle Black-Box (Random Forest) sur Iris ---")

# Build a 100-tree random forest (fixed seed, balanced class weights) and fit
# it on the training split in one expression; fit() returns the estimator.
blackbox_model_iris = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
).fit(X_train, y_train)

# Predict the held-out split and compute its accuracy.
y_pred_blackbox_iris = blackbox_model_iris.predict(X_test)
accuracy_blackbox_iris = accuracy_score(y_test, y_pred_blackbox_iris)

# Report: overall accuracy, per-class metrics, then the confusion matrix.
print(f"Accuracy du modèle Black-Box (Random Forest) sur Iris: {accuracy_blackbox_iris:.4f}")
print(
    "\nClassification Report du Black-Box (Iris):",
    classification_report(y_test, y_pred_blackbox_iris, target_names=class_names),
    sep="\n",
)
print(
    "\nConfusion Matrix du Black-Box (Iris):",
    confusion_matrix(y_test, y_pred_blackbox_iris),
    sep="\n",
)

# Persist the fitted black-box model to disk with joblib.
joblib.dump(blackbox_model_iris, 'blackbox_model_iris.JOBLIB')
print("\n--- Modèle blackbox_model_iris.JOBLIB enregistré. ---")


--- Entraînement du Modèle Black-Box (Random Forest) sur Iris ---
Accuracy du modèle Black-Box (Random Forest) sur Iris: 0.9333

Classification Report du Black-Box (Iris):
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.90      0.90      0.90        10
   virginica       0.90      0.90      0.90        10

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30


Confusion Matrix du Black-Box (Iris):
[[10  0  0]
 [ 0  9  1]
 [ 0  1  9]]

--- Modèle blackbox_model_iris.JOBLIB enregistré. ---
