In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

### Data Ingestion

In [2]:
cols = ['class','cap_shape','cap_surface','cap_color','bruises_?','odour','gill_attachment',
        'gill_spacing','gill_size','gill_color','stalk_shape','stalk_root','stalk_surface_above_ring',
        'stalk_surface_below_ring','stalk_color_above_ring','stalk_color_below_ring','veil_type', 
        'veil_color','ring_number','ring_type','spore_print_color','population','habitat']

In [3]:
filepath = "data/agaricus-lepiota.data"

In [4]:
df = pd.read_csv(filepath, names=cols)
#inspecting first 3 rows
df.iloc[0:3]

Unnamed: 0,class,cap_shape,cap_surface,cap_color,bruises_?,odour,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m


### Data Inspection


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
class                       8124 non-null object
cap_shape                   8124 non-null object
cap_surface                 8124 non-null object
cap_color                   8124 non-null object
bruises_?                   8124 non-null object
odour                       8124 non-null object
gill_attachment             8124 non-null object
gill_spacing                8124 non-null object
gill_size                   8124 non-null object
gill_color                  8124 non-null object
stalk_shape                 8124 non-null object
stalk_root                  8124 non-null object
stalk_surface_above_ring    8124 non-null object
stalk_surface_below_ring    8124 non-null object
stalk_color_above_ring      8124 non-null object
stalk_color_below_ring      8124 non-null object
veil_type                   8124 non-null object
veil_color                  8124 non-null object
ring_number

In [7]:
from sklearn.preprocessing import LabelEncoder 

le =  LabelEncoder()

# Iterate over all the values of each column and extract their dtypes
for col in df:
    if df[col].dtype=='object':
    # Use LabelEncoder to do the numeric transformation
        df[col]=le.fit_transform(df[col])

In [8]:
df.dtypes

class                       int64
cap_shape                   int64
cap_surface                 int64
cap_color                   int64
bruises_?                   int64
odour                       int64
gill_attachment             int64
gill_spacing                int64
gill_size                   int64
gill_color                  int64
stalk_shape                 int64
stalk_root                  int64
stalk_surface_above_ring    int64
stalk_surface_below_ring    int64
stalk_color_above_ring      int64
stalk_color_below_ring      int64
veil_type                   int64
veil_color                  int64
ring_number                 int64
ring_type                   int64
spore_print_color           int64
population                  int64
habitat                     int64
dtype: object

In [9]:
# Print unique classes values
print(set(df['class']))

{0, 1}


In [10]:
# Calculate the ratio size of each classes group
df.groupby(['class']).size() / df.shape[0] * 100

class
0    51.797144
1    48.202856
dtype: float64

In [11]:
# sns.pairplot(df,hue="classes")

### Train Test Split 

In [12]:
# define target as y
y = df['class']
# define X as anything BUT target variable
X_cols = [col for col in df.columns if col not in 'class']
X = df[X_cols]

In [13]:
#split data into 70% train, 30% test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Decision Trees

In [14]:
from sklearn.tree import DecisionTreeClassifier

In [15]:
dtree = DecisionTreeClassifier()

In [16]:
dtree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

### Prediction & Evaluation

In [17]:
predictions = dtree.predict(X_test)

In [18]:
from sklearn.metrics import classification_report,confusion_matrix

In [19]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1251
           1       1.00      1.00      1.00      1187

    accuracy                           1.00      2438
   macro avg       1.00      1.00      1.00      2438
weighted avg       1.00      1.00      1.00      2438



In [20]:
print(confusion_matrix(y_test,predictions))

[[1251    0]
 [   0 1187]]


The 0's indicate that nothing has been mislabeled.

### Random Forest Classifier

In [21]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [22]:
rfc_pred = rfc.predict(X_test)

In [23]:
print(confusion_matrix(y_test,rfc_pred))

[[1251    0]
 [   0 1187]]
