In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split



In [6]:
# Column names for the data file, in file order: the target first, then the
# 22 descriptive features grouped by the part of the mushroom they describe.
columns = [
    # target
    'class',
    # cap
    'cap-shape', 'cap-surface', 'cap-color',
    # bruising and smell
    'bruises', 'odor',
    # gills
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    # stalk
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    # veil
    'veil-type', 'veil-color',
    # ring
    'ring-number', 'ring-type',
    # spores, population and habitat
    'spore-print-color', 'population', 'habitat',
]

# No row of the file is used as a header; the names above label the columns.
df = pd.read_csv("agaricus-lepiota.data", header=None, names=columns)
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [8]:
# Lookup tables translating each single-letter code into a readable label,
# one table per column. Tables shared by the "above ring" / "below ring"
# column pairs are written once and reused for both columns.
_stalk_surface_labels = {'f': 'fibrous', 'y': 'scaly', 'k': 'silky', 's': 'smooth'}
_stalk_color_labels = {
    'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'o': 'orange',
    'p': 'pink', 'e': 'red', 'w': 'white', 'y': 'yellow',
}

mapping = {
    'class': {'e': 'edible', 'p': 'poisonous'},
    'cap-shape': {
        'b': 'bell', 'c': 'conical', 'x': 'convex',
        'f': 'flat', 'k': 'knobbed', 's': 'sunken',
    },
    'cap-surface': {'f': 'fibrous', 'g': 'grooves', 'y': 'scaly', 's': 'smooth'},
    'cap-color': {
        'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'r': 'green',
        'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow',
    },
    'bruises': {'t': 'bruises', 'f': 'no'},
    'odor': {
        'a': 'almond', 'l': 'anise', 'c': 'creosote', 'y': 'fishy', 'f': 'foul',
        'm': 'musty', 'n': 'none', 'p': 'pungent', 's': 'spicy',
    },
    'gill-attachment': {'a': 'attached', 'd': 'descending', 'f': 'free', 'n': 'notched'},
    'gill-spacing': {'c': 'close', 'w': 'crowded', 'd': 'distant'},
    'gill-size': {'b': 'broad', 'n': 'narrow'},
    'gill-color': {
        'k': 'black', 'n': 'brown', 'b': 'buff', 'h': 'chocolate', 'g': 'gray',
        'r': 'green', 'o': 'orange', 'p': 'pink', 'u': 'purple', 'e': 'red',
        'w': 'white', 'y': 'yellow',
    },
    'stalk-shape': {'e': 'enlarging', 't': 'tapering'},
    # '?' is the code for a missing stalk-root value; it is kept as an
    # explicit 'missing' category.
    'stalk-root': {
        'b': 'bulbous', 'c': 'club', 'u': 'cup', 'e': 'equal',
        'z': 'rhizomorphs', 'r': 'rooted', '?': 'missing',
    },
    'stalk-surface-above-ring': _stalk_surface_labels,
    'stalk-surface-below-ring': _stalk_surface_labels,
    'stalk-color-above-ring': _stalk_color_labels,
    'stalk-color-below-ring': _stalk_color_labels,
    'veil-type': {'p': 'partial', 'u': 'universal'},
    'veil-color': {'n': 'brown', 'o': 'orange', 'w': 'white', 'y': 'yellow'},
    'ring-number': {'n': 'none', 'o': 'one', 't': 'two'},
    'ring-type': {
        'c': 'cobwebby', 'e': 'evanescent', 'f': 'flaring', 'l': 'large',
        'n': 'none', 'p': 'pendant', 's': 'sheathing', 'z': 'zone',
    },
    'spore-print-color': {
        'k': 'black', 'n': 'brown', 'b': 'buff', 'h': 'chocolate', 'r': 'green',
        'o': 'orange', 'u': 'purple', 'w': 'white', 'y': 'yellow',
    },
    'population': {
        'a': 'abundant', 'c': 'clustered', 'n': 'numerous',
        's': 'scattered', 'v': 'several', 'y': 'solitary',
    },
    'habitat': {
        'g': 'grasses', 'l': 'leaves', 'm': 'meadows', 'p': 'paths',
        'u': 'urban', 'w': 'waste', 'd': 'woods',
    },
}

# Swap the letter codes for their labels, column by column. `replace` returns
# a new DataFrame, so the raw `df` is left untouched.
df_mush = df.replace(to_replace=mapping)

# Preview the decoded table.
df_mush

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,poisonous,convex,smooth,brown,bruises,pungent,free,close,narrow,black,...,smooth,white,white,partial,white,one,pendant,black,scattered,urban
1,edible,convex,smooth,yellow,bruises,almond,free,close,broad,black,...,smooth,white,white,partial,white,one,pendant,brown,numerous,grasses
2,edible,bell,smooth,white,bruises,anise,free,close,broad,brown,...,smooth,white,white,partial,white,one,pendant,brown,numerous,meadows
3,poisonous,convex,scaly,white,bruises,pungent,free,close,narrow,brown,...,smooth,white,white,partial,white,one,pendant,black,scattered,urban
4,edible,convex,smooth,gray,no,none,free,crowded,broad,black,...,smooth,white,white,partial,white,one,evanescent,brown,abundant,grasses
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,edible,knobbed,smooth,brown,no,none,attached,close,broad,yellow,...,smooth,orange,orange,partial,orange,one,pendant,buff,clustered,leaves
8120,edible,convex,smooth,brown,no,none,attached,close,broad,yellow,...,smooth,orange,orange,partial,brown,one,pendant,buff,several,leaves
8121,edible,flat,smooth,brown,no,none,attached,close,broad,brown,...,smooth,orange,orange,partial,orange,one,pendant,buff,clustered,leaves
8122,poisonous,knobbed,scaly,brown,no,fishy,free,close,narrow,buff,...,silky,white,white,partial,white,one,evanescent,white,several,leaves


In [10]:
# Class balance: number of rows per target label.
class_counts = df_mush["class"].value_counts()
class_counts

class
edible       4208
poisonous    3916
Name: count, dtype: int64

On voit que les proportions de edible et de poisonous sont à peu près égales ; on peut donc diviser notre dataset en deux (entraînement et test) de façon aléatoire, et il y aura environ 50 % de labels edible dans train comme dans test.

In [14]:
# Features: every column except the target. Labels: the target column alone.
X = df_mush.drop(columns='class')
y = df_mush['class']

# 80% / 20% train / test split.
#   test_size=0.2   -> 20% of the rows go to the test set
#   random_state=42 -> the same split is reproduced on every run
#   shuffle=True    -> rows are shuffled before splitting (the default)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

print("Train :", X_train.shape)
print("Test  :", X_test.shape)


Train : (6499, 22)
Test  : (1625, 22)


# Random Forest

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

In [None]:
# One-hot encode the features: each distinct value of each column becomes its
# own binary (0/1) column.

# All feature columns are passed to the encoder.
categorical_features = X_train.columns.tolist()

# ColumnTransformer applies the one-hot encoder to all listed columns in one go.
#   handle_unknown='ignore' -> a category seen only at transform time (e.g. one
#                              that appears only in the test split) does not
#                              raise an error.
#   sparse_output=False     -> return a dense NumPy array. This is the current
#                              name of the former `sparse` argument, which was
#                              deprecated in scikit-learn 1.2 and removed in
#                              1.4; it requires scikit-learn >= 1.2.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ]
)

# Learn the categories on the training split and encode it.
X_train_encoded = preprocessor.fit_transform(X_train)

# Encode the test split with the categories learned on the training split.
X_test_encoded = preprocessor.transform(X_test)

# Sanity check: both splits must end up with the same number of columns.
print("Shape X_train_encoded :", X_train_encoded.shape)
print("Shape X_test_encoded  :", X_test_encoded.shape)

X_train_encoded

Shape X_train_encoded : (6499, 117)
Shape X_test_encoded  : (1625, 117)




array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:


# Hyperparameter search space: 3 x 3 x 2 = 18 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5],
)

# Base model; the fixed seed keeps the forests reproducible.
rf = RandomForestClassifier(random_state=42)

# Exhaustive grid search with cross-validation.
search_options = dict(
    cv=3,                # 3-fold cross-validation
    scoring='accuracy',  # candidates are ranked by accuracy
    n_jobs=-1,           # use every available CPU
    verbose=1,           # print progress
)
grid = GridSearchCV(estimator=rf, param_grid=param_grid, **search_options)

# Run the search on the encoded training data (cross-validation included).
grid.fit(X_train_encoded, y_train)


Fitting 3 folds for each of 18 candidates, totalling 54 fits


In [21]:
# Report the best hyperparameter combination found by the grid search and
# its best_score_ (the cross-validated score of that combination).
best_params = grid.best_params_
best_cv_score = grid.best_score_
print("Meilleurs hyperparamètres :", best_params)
print("Score moyen validation :", best_cv_score)

Meilleurs hyperparamètres : {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 50}
Score moyen validation : 1.0


In [24]:
# Evaluate the best estimator found by the grid search on the held-out test set.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test_encoded)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy sur la base de test :", test_accuracy)

Accuracy sur la base de test : 1.0
