In [84]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

## Caricamento dataset, stampe e prima pulizia dei dati

In [85]:
# Read the mushroom dataset from a CSV file located next to the notebook.
file_path = 'mushroom_data_all.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [86]:
# Display the loaded DataFrame as the cell output.
data

Unnamed: 0,class_edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [87]:
# List the column labels of the dataset.
data.columns

Index(['class_edible', 'cap-shape', 'cap-surface', 'cap-color', 'bruises',
       'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [88]:
# Count the missing (null) entries in every column.
data.isna().sum()

class_edible                0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [89]:
# Print the per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class_edible              8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [90]:
# Count how many distinct values each column holds, then print the result.
unique_counts = data.nunique()
print("Unique values :  \n", unique_counts)

Unique values :  
 class_edible                 2
cap-shape                    6
cap-surface                  4
cap-color                   10
bruises                      2
odor                         9
gill-attachment              2
gill-spacing                 2
gill-size                    2
gill-color                  12
stalk-shape                  2
stalk-root                   5
stalk-surface-above-ring     4
stalk-surface-below-ring     4
stalk-color-above-ring       9
stalk-color-below-ring       9
veil-type                    1
veil-color                   4
ring-number                  3
ring-type                    5
spore-print-color            9
population                   6
habitat                      7
dtype: int64


In [91]:
# Show the first group of columns (the target plus the cap, bruises, odor,
# gill and stalk-shape features).
first_columns = [
    'class_edible',
    'cap-shape',
    'cap-surface',
    'cap-color',
    'bruises',
    'odor',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color',
    'stalk-shape',
]
data[first_columns]

Unnamed: 0,class_edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape
0,p,x,s,n,t,p,f,c,n,k,e
1,e,x,s,y,t,a,f,c,b,k,e
2,e,b,s,w,t,l,f,c,b,n,e
3,p,x,y,w,t,p,f,c,n,n,e
4,e,x,s,g,f,n,f,w,b,k,t
...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,e
8120,e,x,s,n,f,n,a,c,b,y,e
8121,e,f,s,n,f,n,a,c,b,n,e
8122,p,k,y,n,f,y,f,c,n,b,t


In [92]:
# Show the second group of columns (stalk, veil, ring, spore-print,
# population and habitat features).
second_columns = [
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'stalk-color-below-ring',
    'veil-type',
    'veil-color',
    'ring-number',
    'ring-type',
    'spore-print-color',
    'population',
    'habitat',
]
data[second_columns]

Unnamed: 0,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,e,s,s,w,w,p,w,o,p,k,s,u
1,c,s,s,w,w,p,w,o,p,n,n,g
2,c,s,s,w,w,p,w,o,p,n,n,m
3,e,s,s,w,w,p,w,o,p,k,s,u
4,e,s,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...
8119,?,s,s,o,o,p,o,o,p,b,c,l
8120,?,s,s,o,o,p,n,o,p,b,v,l
8121,?,s,s,o,o,p,o,o,p,b,c,l
8122,?,s,k,w,w,p,w,o,e,w,v,l


In [93]:
# The 'stalk-root' column contains question marks,
# so inspect how often each of its values occurs.
data['stalk-root'].value_counts()

b    3776
?    2480
e    1120
c     556
r     192
Name: stalk-root, dtype: int64

In [94]:
# Turn the "?" placeholders of 'stalk-root' into NaN (they stand for missing values).
stalk_root = data['stalk-root']
data['stalk-root'] = stalk_root.replace(to_replace="?", value=np.nan)

In [95]:
# Remove the 'stalk-root' column from the dataset: 2480 of its 8124 entries
# are the "?" placeholder, and the notebook drops the column instead of imputing it.
# errors='ignore' keeps this cell idempotent: because `data` is reassigned from
# itself, re-running the cell would otherwise raise a KeyError for the column
# that the first run already removed.
data = data.drop(columns=['stalk-root'], errors='ignore')

In [96]:
# Check the (rows, columns) shape after removing 'stalk-root'.
data.shape

(8124, 22)

## Label Encoding

In [97]:
# Collect the names of the columns whose dtype is 'object'.
s = data.dtypes == 'object'  # boolean mask indexed by column name
object_cols = s.index[s.values].tolist()
print(object_cols)

['class_edible', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']


In [98]:
# Copy the dataset and label-encode every 'object' column into integer codes.
# A separate LabelEncoder is fitted per column and stored in `label_encoders`,
# so each column's mapping stays available afterwards (label_encoders[col].classes_
# or .inverse_transform) instead of being overwritten by the next column's fit.
# NOTE: the codes follow the sorted order of each column's values, so in
# 'class_edible' the value 'e' becomes 0 and 'p' becomes 1 -- a 1 in that
# column marks the 'p' rows, not the 'e' rows.
label_data = data.copy()
label_encoders = {}
for col in object_cols:
    # `label_encoder` is rebound on every iteration; after the loop it refers
    # to the encoder fitted on the last column, exactly as it did before.
    label_encoder = LabelEncoder()
    label_data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

In [99]:
# Display the label-encoded copy of the dataset.
label_data

Unnamed: 0,class_edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,5,5,0,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,5,5,0,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,1,7,7,0,2,1,0,7,4,2


## Individuazione target e split dei dati

In [100]:
# Pick the target column.
y = label_data['class_edible']

In [101]:
# Display the encoded target values.
y

0       1
1       0
2       0
3       1
4       0
       ..
8119    0
8120    0
8121    0
8122    1
8123    0
Name: class_edible, Length: 8124, dtype: int32

In [102]:
# Build the feature matrix: every column except the target.
X = label_data.drop(columns=['class_edible'])

In [103]:
# Check the (rows, columns) shape of the feature matrix.
X.shape

(8124, 21)

In [104]:
# Split features and target into training and validation parts:
# 30% of the rows go to validation, and random_state=1 fixes the shuffle.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [105]:
# Display the training features.
train_X

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
6848,2,2,2,0,2,1,0,1,0,1,...,1,6,7,0,2,1,0,7,4,2
2246,5,3,3,1,5,1,0,0,7,1,...,2,3,6,0,2,1,4,3,4,0
2095,5,3,3,1,5,1,0,0,7,1,...,2,3,7,0,2,1,4,2,5,0
4535,2,3,9,0,2,1,0,0,3,0,...,1,0,6,0,2,1,2,1,5,0
6865,2,2,4,0,8,1,0,1,0,1,...,2,7,7,0,2,1,0,7,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7935,3,3,2,0,8,1,0,1,0,1,...,2,7,7,0,2,1,0,7,4,0
5192,5,3,4,0,7,1,0,1,0,1,...,2,7,7,0,2,1,0,7,4,0
3980,2,3,3,0,2,1,0,0,7,0,...,1,0,4,0,2,1,2,1,4,1
235,5,0,8,1,3,1,1,1,10,1,...,2,7,7,0,2,1,4,3,4,0


In [106]:
# Display the validation features.
val_X

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
1392,2,2,8,0,5,1,1,0,3,1,...,0,7,7,0,2,1,0,2,0,1
4051,5,0,5,0,1,1,1,1,9,0,...,2,7,7,0,2,1,4,3,4,0
3725,5,2,8,0,1,1,1,1,2,0,...,2,7,7,0,2,1,4,3,4,0
7177,3,2,4,0,2,1,0,1,0,1,...,1,6,7,0,2,1,0,7,4,2
103,5,3,9,1,0,1,0,0,5,0,...,3,7,7,0,2,1,4,2,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6017,2,3,4,0,2,1,0,1,0,1,...,2,6,7,0,2,1,0,7,4,4
2505,5,3,2,1,5,1,0,0,9,1,...,2,6,6,0,2,1,4,3,5,0
5333,2,2,8,1,2,1,0,0,10,1,...,0,7,7,0,2,1,4,1,4,5
2917,2,0,4,1,5,1,0,0,7,1,...,2,6,6,0,2,1,4,2,5,0


In [107]:
# Display the training target values.
train_y

6848    1
2246    0
2095    0
4535    1
6865    1
       ..
7935    1
5192    1
3980    1
235     0
5157    1
Name: class_edible, Length: 5686, dtype: int32

In [108]:
# Display the validation target values.
val_y

1392    0
4051    1
3725    1
7177    1
103     0
       ..
6017    1
2505    0
5333    1
2917    0
6022    1
Name: class_edible, Length: 2438, dtype: int32

## Scelta del modello e allenamento

In [109]:
# Create a decision tree classifier with a fixed random_state and fit it on
# the training split. DecisionTreeClassifier comes from the imports cell at
# the top of the notebook, so all dependencies are declared in one place.
model = DecisionTreeClassifier(random_state=0)
model.fit(train_X, train_y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [110]:
# Validation: predict the class of every row in the validation split.
preds = model.predict(val_X)

## Accuracy

In [111]:
# Mean absolute error between the validation labels and the predictions
# (with 0/1 labels this equals the fraction of misclassified rows).
mae = mean_absolute_error(val_y, preds)
# Accuracy on the training split, as a percentage.
train_accuracy = 100 * model.score(train_X, train_y)
# Accuracy on the held-out split, as a percentage.
test_accuracy = 100 * model.score(val_X, val_y)
# Print each metric as a label line followed by its value line.
for label, value in (
    ('MAE: ', mae),
    ('Accuracy training set: ', train_accuracy),
    ('Accuracy test set: ', test_accuracy),
):
    print(label)
    print(value)

MAE: 
0.0
Accuracy training set: 
100.0
Accuracy test set: 
100.0
