Dataset: Mushroom Data Set
Jeff Schlimmer
https://archive.ics.uci.edu/ml/datasets/Mushroom

In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('agaricus-lepiota.data', header=None)

In [2]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


perform 1-hot encoding on all the columns

In [3]:
for i in range(1,23):
    data = pd.get_dummies(data, columns=[i], prefix = [i])

In [4]:
data.head()

Unnamed: 0,0,1_b,1_c,1_f,1_k,1_s,1_x,2_f,2_g,2_s,...,21_s,21_v,21_y,22_d,22_g,22_l,22_m,22_p,22_u,22_w
0,p,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,1,0
1,e,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,e,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,p,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,e,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0


The first row represents if the mushroom is (p)poisonous or (e)edible

Seperate our X and y sets

In [5]:
y = data[0]
X = data.drop(columns=0)

Split our testing and training sets

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [7]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [8]:
from sklearn.metrics import accuracy_score

y_pred = dt.predict(X_test)
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))

Accuracy: 1.0


In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy score: ", accuracy_score(y_test, y_pred))
print("Precision score: ", precision_score(y_test, y_pred, average=None))
print("Recall score: ", recall_score(y_test, y_pred, average=None))

Accuracy score:  1.0
Precision score:  [1. 1.]
Recall score:  [1. 1.]


In [10]:
from sklearn.linear_model import Perceptron

ppn = Perceptron(max_iter=40, eta0=0.1, random_state=1)
ppn.fit(X_train,y_train)

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=0.1,
           fit_intercept=True, max_iter=40, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=1, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

In [14]:
y_pred = ppn.predict(X_test)

print("Accuracy score: ", accuracy_score(y_test, y_pred))
print("Precision score: ", precision_score(y_test, y_pred, average=None))
print("Recall score: ", recall_score(y_test, y_pred, average=None))

Accuracy score:  1.0
Precision score:  [1. 1.]
Recall score:  [1. 1.]


Check feature importances

In [11]:
X.shape

(8124, 117)

In [12]:
print(dt.feature_importances_)

[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 2.09464469e-03
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.39935486e-03
 1.74330243e-01 2.06477365e-03 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 6.21681161e-01
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 4.57193659e-05
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 2.11316678e-03 6.12423150e-02 0.00000000e+00 5.21255870e-02
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 6.92603097e-04
 0.00000000e+00 0.000000

In [13]:
import matplotlib.pyplot as plt

#https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e
feat_importances = pd.Series(dt.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

<Figure size 640x480 with 1 Axes>

5_n = No odor
4_f = No bruising
11_c = Club stalk-root
11_r = Rooted stalk-root
22_d = Woodland habitat
20_r = Green spore-print-color
15_y = Yellow stalk-color-below-ring