# Assignment 6 : Mushroom Classifier 

# Name: Matthew Bilodeau 



# Background:

The purpose of this assignment is to properly classify mushrooms based on various attributes, using a KNN model and PCA for dimensionality reduction.

### Data: Mushroom Database
### Sources:
        (a) Mushroom records drawn from The Audubon Society Field Guide to North
            American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred
            A. Knopf
        (b) Donor: Jeff Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)
        (c) Date: 27 April 1987
#### Attribute Information: (classes: edible=e, poisonous=p)
     1. cap-shape:                bell=b,conical=c,convex=x,flat=f,
                                  knobbed=k,sunken=s
     2. cap-surface:              fibrous=f,grooves=g,scaly=y,smooth=s
     3. cap-color:                brown=n,buff=b,cinnamon=c,gray=g,green=r,
                                  pink=p,purple=u,red=e,white=w,yellow=y
     4. bruises?:                 bruises=t,no=f
     5. odor:                     almond=a,anise=l,creosote=c,fishy=y,foul=f,
                                  musty=m,none=n,pungent=p,spicy=s
     6. gill-attachment:          attached=a,descending=d,free=f,notched=n
     7. gill-spacing:             close=c,crowded=w,distant=d
     8. gill-size:                broad=b,narrow=n
     9. gill-color:               black=k,brown=n,buff=b,chocolate=h,gray=g,
                                  green=r,orange=o,pink=p,purple=u,red=e,
                                  white=w,yellow=y
    10. stalk-shape:              enlarging=e,tapering=t
    11. stalk-root:               bulbous=b,club=c,cup=u,equal=e,
                                  rhizomorphs=z,rooted=r,missing=?
    12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
    13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
    14. stalk-color-above-ring:   brown=n,buff=b,cinnamon=c,gray=g,orange=o,
                                  pink=p,red=e,white=w,yellow=y
    15. stalk-color-below-ring:   brown=n,buff=b,cinnamon=c,gray=g,orange=o,
                                  pink=p,red=e,white=w,yellow=y
    16. veil-type:                partial=p,universal=u
    17. veil-color:               brown=n,orange=o,white=w,yellow=y
    18. ring-number:              none=n,one=o,two=t
    19. ring-type:                cobwebby=c,evanescent=e,flaring=f,large=l,
                                  none=n,pendant=p,sheathing=s,zone=z
    20. spore-print-color:        black=k,brown=n,buff=b,chocolate=h,green=r,
                                  orange=o,purple=u,white=w,yellow=y
    21. population:               abundant=a,clustered=c,numerous=n,
                                  scattered=s,several=v,solitary=y
    22. habitat:                  grasses=g,leaves=l,meadows=m,paths=p,
                                  urban=u,waste=w,woods=d

It is also noted that there are 2480 missing attribute values (denoted by "?"), all for attribute #11 (stalk-root).

In [334]:
# imports
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import cm
import numpy as np
import pandas as pd



# Default font sizes for axis labels and tick labels on every figure.
for rc_group, font_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=font_size)
import os

# Figures are written to <project root>/figures; the folder is created on first run.
PROJECT_ROOT_DIR, FOLDER = ".", "figures"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, FOLDER)
os.makedirs(IMAGES_PATH, exist_ok=True)  # no error if the folder already exists

In [335]:
# Load the comma-separated mushroom records into a DataFrame and display it.
# NOTE(review): no `header=` argument is passed, so the first line of the file is
# consumed as column names (the displayed columns are 'p', 'x', 's', ... rather than
# attribute names) — confirm whether that first record should be kept as data.
# NOTE(review): this is an absolute, machine-specific path.
file_path = 'C:/Users/matth/OneDrive/Desktop/Eastern/680/Module_4/mushrooms/agaricus-lepiota.data'
data = pd.read_csv(file_path, sep=',')
data

Unnamed: 0,p,x,s,n,t,p.1,f,c,n.1,k,...,s.2,w,w.1,p.2,w.2,o,p.3,k.1,s.3,u
0,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
1,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
2,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
3,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
4,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8118,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8119,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8120,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8121,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


## 1. Imputing Missing Data 

In [336]:
# Preview the first rows and count the "?" placeholders found in each column.
missing_values = data.eq("?").sum()

preview = data.head(5)
# `data.size` is the total number of cells (rows x columns).
summary = f' , Size: {data.size},\n missing values:\n{missing_values}'
print(preview, summary)


   p  x  s  n  t p.1  f  c n.1  k  ... s.2  w w.1 p.2 w.2  o p.3 k.1 s.3  u
0  e  x  s  y  t   a  f  c   b  k  ...   s  w   w   p   w  o   p   n   n  g
1  e  b  s  w  t   l  f  c   b  n  ...   s  w   w   p   w  o   p   n   n  m
2  p  x  y  w  t   p  f  c   n  n  ...   s  w   w   p   w  o   p   k   s  u
3  e  x  s  g  f   n  f  w   b  k  ...   s  w   w   p   w  o   e   n   a  g
4  e  x  y  y  t   a  f  c   b  n  ...   s  w   w   p   w  o   p   k   n  g

[5 rows x 23 columns]  , Size: 186829,
 missing values:
p         0
x         0
s         0
n         0
t         0
p.1       0
f         0
c         0
n.1       0
k         0
e         0
e.1    2480
s.1       0
s.2       0
w         0
w.1       0
p.2       0
w.2       0
o         0
p.3       0
k.1       0
s.3       0
u         0
dtype: int64


All of the missing values are in the column "e.1", which corresponds to attribute #11 (stalk-root).

In [337]:
# Row labels whose stalk-root column ('e.1') holds the "?" placeholder.
is_unknown_root = data['e.1'].eq('?')
missing_value_rows = data.index[is_unknown_root.to_numpy()]

In [338]:
###imputing missing values###
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split


# Encode on a copy so the raw frame is not modified by this step.
data_copy = data.copy()

# LabelEncoder instance; it stays a notebook-level name because a later cell reuses `labeler`.
labeler = LabelEncoder()

# Integer codes of the stalk-root column ('e.1'). They are not needed to build the
# encoded frame below (previously they were set as its index and dropped immediately).
labels = labeler.fit_transform(data_copy['e.1'])

# One-hot encode every column into a dense array; 'e.1' keeps a separate indicator
# for its "?" placeholder.
encoder = OneHotEncoder(sparse_output=False)
data_encoded = encoder.fit_transform(data_copy)

# Build the encoded frame directly with the encoder's feature names as columns.
encoded_data = pd.DataFrame(data_encoded, columns=encoder.get_feature_names_out())

# All one-hot columns produced from 'e.1', including the "?" indicator.
stalk_root_columns = [idx for idx in encoded_data.columns if 'e.1' in idx]
print(stalk_root_columns)

# Imputation targets: the real stalk-root categories only. Derived from the encoder
# output instead of a hard-coded list, so it cannot drift from the data.
target_columns = [col for col in stalk_root_columns if col != 'e.1_?']

['e.1_?', 'e.1_b', 'e.1_c', 'e.1_e', 'e.1_r']


In [339]:
# Split the encoded rows by whether their stalk-root value was missing.
in_missing = encoded_data.index.isin(missing_value_rows)

test_df = encoded_data.loc[in_missing]    # rows whose stalk-root must be imputed
train_df = encoded_data.loc[~in_missing]  # rows with a known stalk-root

# Sanity check: the two shapes together should account for every row.
print((test_df.shape, train_df.shape))

((2480, 119), (5643, 119))


In [340]:
# Features: every encoded column except the stalk-root targets.
# Targets: the one-hot stalk-root columns listed in `target_columns`.
X_train, y_train = train_df.drop(columns=target_columns), train_df[target_columns]
X_test, y_test = test_df.drop(columns=target_columns), test_df[target_columns]



In [341]:
#predicting missing values using a KNN classifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler


# 5-nearest-neighbour classifier with Euclidean distance (Minkowski, p=2).
knn = KNeighborsClassifier(n_neighbors=5,
                           p=2,
                           metric='minkowski',
                           )

# Fit on ONE class index per row rather than on the one-hot matrix. With a 2-D
# one-hot target the classifier votes on each column independently, so a row whose
# neighbours have no majority class comes back as all zeros, and a later
# argmax over that row would silently report the first class.
y_train_idx = y_train.values.argmax(axis=1)
knn.fit(X_train.values, y_train_idx)

# Re-expand the predicted class indices to one-hot rows so `y_preds` keeps its
# (n_rows, n_classes) shape and every row contains exactly one 1.
y_preds = np.eye(y_train.shape[1])[knn.predict(X_test.values)]

In [342]:
# Assign the stalk-root letter code to each predicted one-hot row.

class_names = ['b', 'c', 'e', 'r']

# Position of the largest entry in each prediction row selects the class letter.
predicted_letters = [class_names[pos] for pos in np.argmax(y_preds, axis=1)]
missing_values = pd.Series(predicted_letters, index=missing_value_rows)
print(f'{missing_values},\n\nUnique Values: {missing_values.unique()}\n\nValue Count:\n{missing_values.value_counts()}')

3983    b
4022    e
4075    e
4099    b
4103    b
       ..
8118    e
8119    e
8120    e
8121    b
8122    e
Length: 2480, dtype: object,

Unique Values: ['b' 'e' 'c']

Value Count:
b    1611
e     764
c     105
dtype: int64


In [343]:
# Turn the "?" placeholder into a real missing value (NaN).
data = data.replace(to_replace='?', value=np.nan)

# Fill each NaN in the stalk-root column ('e.1') with the prediction stored
# under the same row label.
imputed_by_row = missing_values.to_dict()
data['e.1'] = data['e.1'].fillna(imputed_by_row)



Graded Concept Question #1 :
Why don’t we one-hot encode the response data to train the KNN model instead?



## 3. Random Forest and Logistic Regression Models: Is It Poisonous?

In [355]:

# Encode the imputed frame for the edible/poisonous classifiers.
data_copy = data.copy()

# Integer-code the class column ('p') with the existing LabelEncoder instance.
labels = labeler.fit_transform(data_copy['p'])

# One-hot encode every column, dropping the first category of each so that a
# two-valued column becomes a single indicator.
encoder = OneHotEncoder(drop='first', sparse_output=False)
data_encoded = encoder.fit_transform(data_copy)

# Encoded frame with the encoder's feature names as columns.
encoded_data = pd.DataFrame(data_encoded, columns=encoder.get_feature_names_out())

# 'p_p' is the indicator left for the class column (p = poisonous); everything
# else is a feature.
target_columns = encoded_data['p_p']
feature_columns = encoded_data.drop(columns='p_p')
(X_train, X_test,
 y_train, y_test) = train_test_split(feature_columns, target_columns, random_state=42)
X_train

Unnamed: 0,x_c,x_f,x_k,x_s,x_x,s_g,s_s,s_y,n_c,n_e,...,s.3_n,s.3_s,s.3_v,s.3_y,u_g,u_l,u_m,u_p,u_u,u_w
4633,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6574,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1600,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4987,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
5829,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5390,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
860,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7603,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [356]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

%time rdf =RandomForestClassifier( random_state = 42).fit(X_train, y_train) 

y_preds = rdf.predict(X_test)


print(accuracy_score(y_test,y_preds),precision_score(y_test,y_preds, average= 'micro'),recall_score(y_test,y_preds, average= 'micro'))

CPU times: total: 219 ms
Wall time: 406 ms
1.0 1.0 1.0


In [368]:
from sklearn.linear_model import LogisticRegression


%time reg = LogisticRegression(random_state=42).fit(X_train, y_train)

y_preds = reg.predict(X_test)

print('accuracy_score',  accuracy_score(y_test,y_preds),
      'precision_score', precision_score(y_test,y_preds, average= 'micro'),
      'recall_score', recall_score(y_test,y_preds, average= 'micro'))

CPU times: total: 0 ns
Wall time: 21.1 ms
accuracy_score 0.9896602658788775 precision_score 0.9896602658788775 recall_score 0.9896602658788775


## Dimensionality Reduction:
Here we perform PCA, keeping enough components to retain 95% of the variance.

In [358]:
from sklearn.decomposition import PCA
# Project the one-hot features onto principal components; n_components=.95 is
# intended to keep 95% of the variance (as stated in the section text).
# NOTE(review): the PCA is fitted on every row of `feature_columns`, i.e. before
# the train/test split performed in the next cell — confirm this is acceptable
# for the evaluation.
pca = PCA(n_components=.95)
pca_data = pca.fit_transform(feature_columns)
# Shape of the reduced data: (rows, retained components).
shape = pca_data.shape
print(shape)

(8123, 38)


In [370]:
# Re-split using the PCA-reduced features, with the same random_state as the earlier split.
(X_train, X_test,
 y_train, y_test) = train_test_split(pca_data, target_columns, random_state=42)

In [372]:
%time rdf =RandomForestClassifier( random_state = 42).fit(X_train, y_train) 

y_preds = rdf.predict(X_test)


print(accuracy_score(y_test,y_preds),precision_score(y_test,y_preds),recall_score(y_test,y_preds))

CPU times: total: 1.2 s
Wall time: 2.57 s
1.0 1.0 1.0


In [371]:
from sklearn.linear_model import LogisticRegression

%time reg = LogisticRegression(random_state=42).fit(X_train, y_train)

y_preds = reg.predict(X_test)

print('accuracy_score',  accuracy_score(y_test,y_preds),
      'precision_score', precision_score(y_test,y_preds),
      'recall_score', recall_score(y_test,y_preds))

CPU times: total: 62.5 ms
Wall time: 36.7 ms
accuracy_score 0.9896602658788775 precision_score 0.988659793814433 recall_score 0.9896800825593395
