# Mushroom Stew

## Develop a mushroom stew that is visually appealing, pleasant-smelling, and preferably non-toxic.

* Explore the fields: which ones could affect the taste or smell, and which can be ignored?
* Which fields may affect whether the stew is visually appealing?
* Use graphics to support your choices

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import plotly.express as px
from pandas_profiling import ProfileReport
from urllib.request import urlopen
import json
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.eval_measures import rmse
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics

## Load Dataset, Explore and Display Features

In [None]:
# Column headers in file order: the target `class` first, followed by the
# 22 descriptive attributes.
col_names = [
    'class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
    'stalk-surface-below-ring', 'stalk-color-above-ring',
    'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
    'ring-type', 'spore-print-color', 'population', 'habitat',
]

# header=None: no line of the file is used as a header, so the names above
# are applied to the columns instead.
mushroom_df = pd.read_csv('expanded.csv', header=None, names=col_names)

In [None]:
# Lift the column display limit so every attribute is visible, then preview
# the first five rows.
pd.options.display.max_columns = None
mushroom_df.head()

In [None]:
# Show the dtype pandas assigned to each column.
mushroom_df.dtypes

In [None]:
# Per-column summary statistics for the raw frame.
mushroom_df.describe()

In [None]:
# Build a profiling report for the raw frame; leaving `profile` as the last
# expression of the cell lets the notebook render it.
# NOTE(review): pandas_profiling has been renamed ydata-profiling upstream —
# confirm which package the environment provides.
profile = ProfileReport(mushroom_df)
profile

## Feature Engineering


### The field `veil-type` doesn't contribute any information (all are the same value) and can be dropped from the dataset

In [None]:
# Work from a copy of the data without the uninformative `veil-type` column,
# and confirm the resulting shape.
mushroom_df_adj = mushroom_df.drop(columns=['veil-type'])
mushroom_df_adj.shape

## Dataframe Expanded into Dummy Variables

### First version: encode all variables with `get_dummies`

In [None]:
# One-hot encode every column. `col_names` lists all columns, so this also
# expands the `class` target and the `veil-type` column.
df_dummy = pd.get_dummies(mushroom_df, columns=col_names)

df_dummy.head()

### Second version: Label encode the "class" column and encode the input columns using `get_dummies`

In [None]:
# Keep "class" as a single binary column instead of expanding it into dummies.
# LabelEncoder assigns codes in sorted label order: 0 is Edible, 1 is Poisonous.
label_encoder = LabelEncoder()
mushroom_df_adj['class'] = label_encoder.fit_transform(mushroom_df_adj['class'])

# Every remaining column is a categorical input to one-hot encode. Deriving the
# list from the frame (rather than retyping it) keeps it in sync with whatever
# columns the frame actually holds.
col_names_dummies = [col for col in mushroom_df_adj.columns if col != 'class']

In [None]:
# One-hot encode the input columns only; `class` is not in col_names_dummies,
# so it is left as it is.
# drop_first=True drops the first level of each feature, which helps avoid
# multicollinearity among the indicator columns.
df_dummy2 = pd.get_dummies(mushroom_df_adj, columns=col_names_dummies, drop_first=True)
df_dummy2.head()

## Pleasant Smell

In [None]:
good_odor = ['ALMOND','ANISE','SPICY'] # Up to interpretation I suppose

# Count edible / poisonous mushrooms per odor in a single pass. Reindexing
# keeps the bars in first-seen order and guarantees both class columns exist
# (filled with 0) even if one class never occurs.
uniq_vals = mushroom_df['odor'].unique()
odor_counts = pd.crosstab(mushroom_df['odor'], mushroom_df['class']).reindex(
    index=uniq_vals, columns=['EDIBLE', 'POISONOUS'], fill_value=0)
count_edible = odor_counts['EDIBLE'].tolist()
count_poison = odor_counts['POISONOUS'].tolist()

fig, ax = plt.subplots(figsize=(9,6))

# Stacked bars: poisonous counts sit on top of the edible counts.
ax.bar(uniq_vals, count_edible, label='EDIBLE',color='b')
ax.bar(uniq_vals, count_poison, label='POISONOUS', bottom=count_edible,color='r')

ax.set_ylabel('Count')
ax.set_title('Mushroom Edibility by Feature: ODOR')
ax.legend()
plt.show()

We can see from the above plot that nearly all mushrooms with a smell that is not Almond, Anise, or None are Poisonous.

## Correlation Heatmaps

Let's look at the data another way, and observe what kind of effect different combinations of the variables have.

In [None]:
# Cross-tabulate class against groups of related attributes. Each table has
# one row per class and one column per observed combination of the grouped
# attribute values.
cap_cols = ['cap-shape', 'cap-surface', 'cap-color', 'bruises?']
gill_cols = ['odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color']
stalk_cols = ['stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
              'stalk-surface-below-ring', 'stalk-color-above-ring',
              'stalk-color-below-ring']
# NOTE(review): this group includes `veil-type` and leaves out `ring-type` —
# confirm that is intended.
other_cols = ['veil-type', 'veil-color', 'ring-number', 'spore-print-color',
              'population', 'habitat']

cap_xtab = pd.crosstab(mushroom_df['class'],
                       columns=[mushroom_df[name] for name in cap_cols])

gill_xtab = pd.crosstab(mushroom_df['class'],
                        columns=[mushroom_df[name] for name in gill_cols])

stalk_xtab = pd.crosstab(mushroom_df['class'],
                         columns=[mushroom_df[name] for name in stalk_cols])

other_xtab = pd.crosstab(mushroom_df['class'],
                         columns=[mushroom_df[name] for name in other_cols])


In [None]:
# Display one of the cross tables (class vs. the odor/gill attribute group)
# as an example of their layout.
gill_xtab

In [None]:
# Heatmap of the counts in the cap-attribute cross table.
sns.heatmap(cap_xtab)

In [None]:
# Heatmap of the counts in the odor/gill-attribute cross table.
sns.heatmap(gill_xtab)

In [None]:
# Heatmap of the counts in the stalk-attribute cross table.
sns.heatmap(stalk_xtab)

In [None]:
# Heatmap of the counts in the veil/ring/spore/population/habitat cross table.
sns.heatmap(other_xtab)

## Violin Plot 

The previous heatmaps are interesting, but are dense and a little confusing; a violin plot is a good way to take in the variables at a glance. 

In [None]:
# Integer-encode every column (class included) so the categories can be drawn
# on a numeric axis. The encoder is refit for each column in turn.
labelencoder = LabelEncoder()
mushroom_df_encoded = mushroom_df.copy().apply(labelencoder.fit_transform)

# `veil-type` is left out of the plot; the remaining attribute names double as
# the x tick labels.
encoded_without_veil = mushroom_df_encoded.drop("veil-type", axis=1)
df_no_class = mushroom_df_encoded.drop(['class', 'veil-type'], axis=1)

# Long format: one row per (mushroom, attribute) pair, keyed by class.
df_div = pd.melt(encoded_without_veil, 'class', var_name='Characteristics')

fig, ax = plt.subplots(figsize=(22, 10))
p = sns.violinplot(
    data=df_div,
    x='Characteristics',
    y='value',
    hue='class',
    split=True,
    inner='quartile',
    palette='Set1',
    ax=ax,
)
p.set_xticklabels(labels=list(df_no_class.columns), rotation=90)

plt.show()

As we can see from the above violin plot, gill color, spore print color, and habitat seem to have strong indications of edibility. Let's see if our intuition is correct by looking at the edibility numbers for each variable.

## Edibility Histograms

In [None]:
# One stacked bar chart per attribute: edible (blue) and poisonous (red)
# counts for every value the attribute takes.
for column in mushroom_df.drop(["class"], axis=1).columns:
    uniq_vals = mushroom_df[column].unique()

    # Count both classes in a single pass. Reindexing keeps the bars in
    # first-seen order and guarantees both class columns exist (filled with 0).
    value_counts = pd.crosstab(mushroom_df[column], mushroom_df['class']).reindex(
        index=uniq_vals, columns=['EDIBLE', 'POISONOUS'], fill_value=0)
    count_edible = value_counts['EDIBLE'].tolist()
    count_poison = value_counts['POISONOUS'].tolist()

    fig, ax = plt.subplots()
    ax.bar(uniq_vals, count_edible, label='EDIBLE',color='b')
    ax.bar(uniq_vals, count_poison, label='POISONOUS', bottom=count_edible,color='r')

    ax.set_ylabel('Count')
    ax.set_title('Mushroom Edibility by Feature: '+column.upper())
    ax.legend()
    fig.tight_layout()
    plt.show()

From the above histograms, we can conclude that `bruises` might also be a good indicator of edibility.

## Run 8 Classifier Models

In [None]:
# Plain NumPy views of the target and the encoded inputs.
X = df_dummy2.drop(columns="class").to_numpy()
y = df_dummy2["class"].to_numpy()

# Hold out 20% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
# Notebook-wide plotting defaults: 12x8 figures, seaborn dark grid, and fonts
# scaled up by 1.2.
matplotlib.rcParams['figure.figsize'] = [12, 8]
sns.set_style('darkgrid')
sns.set(font_scale=1.2)
# IPython magic: draw matplotlib figures inline in the notebook.
# NOTE(review): depending on the IPython/ipykernel version, enabling the inline
# backend may overwrite the figure size set above — confirm, or move this line
# to the top of the cell.
%matplotlib inline

In [None]:
def classification(method, x_dat, y_dat, **params): # I took classification type out but we might want it)
    """Fit one classifier on an 80/20 split and report its held-out performance.

    Parameters
    ----------
    method : estimator class (e.g. ``SVC``); instantiated as ``method(**params)``.
    x_dat : feature matrix (DataFrame or array-like).
    y_dat : target labels aligned with ``x_dat``.
    **params : keyword arguments forwarded to the estimator constructor.

    Prints the classification report and test accuracy and draws the confusion
    matrix. For decision trees it also draws the feature importances and the
    fitted tree. Returns None.
    """
    # Split data for train and test (fixed seed, so each call gets the same split)
    X_train, X_test, y_train, y_test = train_test_split(x_dat, y_dat, random_state=42, test_size=0.2)

    # Fit model
    mod = method(**params)
    mod.fit(X_train, y_train)
    y_pred = mod.predict(X_test)

    # Print results. classification_report takes (y_true, y_pred) in that
    # order; swapping them exchanges precision with recall and reports support
    # for the predictions instead of the true labels.
    print("Results for {}:".format(str(method)))
    print(classification_report(y_test, y_pred))
    print("Test Accuracy: {}%".format(round(mod.score(X_test,y_test)*100,2)))

    # Confusion matrix: rows are true labels, columns are predicted labels
    cm = confusion_matrix(y_test, y_pred)
    f, ax = plt.subplots(figsize =(5,5))
    sns.heatmap(cm,annot = True,linewidths=0.5,linecolor="red",fmt = ".0f",ax=ax)
    plt.xlabel("y_pred_rf")
    plt.ylabel("y_true_rf")
    plt.title('Confusion Matrix')
    plt.show()

    if isinstance(mod, DecisionTreeClassifier):
        print('Feature Importance Plot')
        # Use the DataFrame's column names when available; fall back to
        # positional names so plain arrays work too.
        feature_labels = getattr(x_dat, "columns", None)
        if feature_labels is None:
            feature_labels = ["feature_{}".format(i) for i in range(len(mod.feature_importances_))]
        sns.barplot(y=list(feature_labels), x=mod.feature_importances_)
        plt.xlabel('Mean Decrease Gini')
        plt.show()

        # Plot the model that was just evaluated. Refitting here could grow a
        # different tree when the estimator is not seeded.
        plot_tree(mod)
        plt.show()

In [None]:
# Inputs and target kept as pandas objects so the column names stay available.
y_mushroom = df_dummy2["class"]
x_mushroom = df_dummy2.drop(columns=["class"])

In [None]:
# Split data for train and test (80/20, fixed seed)
X_train_DT, X_test_DT, y_train_DT, y_test_DT = train_test_split(x_mushroom,y_mushroom,random_state=42,test_size=0.2)

# Fit a stand-alone decision tree for inspection. It is seeded like the other
# seeded models in this notebook, so the importances and the plotted tree come
# out the same on every run.
DT_mod = DecisionTreeClassifier(random_state=42)
DT_mod.fit(X_train_DT, y_train_DT)
y_pred = DT_mod.predict(X_test_DT)

In [None]:
# Horizontal bars: one per encoded feature, sized by the tree's importance score.
print('Feature Importance Plot')
importance_ax = sns.barplot(x=DT_mod.feature_importances_, y=x_mushroom.columns)
importance_ax.set_xlabel('Mean Decrease Gini')
plt.show()

In [None]:
# Print the fitted tree as text rules. Passing the column names makes each
# split read as the encoded feature it tests instead of "feature_N".
text_representation = tree.export_text(DT_mod, feature_names=list(x_mushroom.columns))
print(text_representation)

In [None]:
# Names of the encoded input columns (everything except the target).
feature_names = df_dummy2.drop(columns=["class"]).columns
feature_names

In [None]:
# Human-readable names for the encoded target, in code order (0 is Edible,
# 1 is Poisonous). The target column itself holds one 0/1 value per row and
# is not a list of class names.
class_names = ['EDIBLE', 'POISONOUS']
class_names

In [None]:
# Draw the fitted tree on a large canvas. Feature and class names are passed
# so the nodes show the encoded feature tested and the predicted class rather
# than positional indices. Class names are in code order: 0 is Edible,
# 1 is Poisonous.
fig = plt.figure(figsize=(30,40))
plot_tree(DT_mod,
          feature_names=list(x_mushroom.columns),
          class_names=['EDIBLE', 'POISONOUS'],
          filled=True);

In [None]:
# Decision tree on all encoded features; seeded so repeated runs give the same
# tree and metrics.
classification(DecisionTreeClassifier, x_mushroom, y_mushroom, random_state=42)

In [None]:
# Support-vector classifier on all encoded features, with gamma="auto".
# NOTE(review): the original note says the default parameters scored better —
# confirm which setting should be kept.
classification(SVC, x_mushroom, y_mushroom, random_state=42, gamma="auto") # check on parameters, did better with default

In [None]:

# k-nearest-neighbours classifier (k=5) on all encoded features.
classification(KNeighborsClassifier, x_mushroom, y_mushroom, n_neighbors=5)

In [None]:
# Logistic regression on all encoded features, using the estimator's defaults.
classification(LogisticRegression, x_mushroom, y_mushroom) # TODO: check on parameters

In [None]:
# Gaussian naive Bayes on all encoded features, using the estimator's defaults.
classification(GaussianNB, x_mushroom, y_mushroom)

In [None]:
# Random forest (100 trees, seeded) on all encoded features.
classification(RandomForestClassifier, x_mushroom, y_mushroom, n_estimators=100, random_state=42) # TODO: check on parameters

In [None]:
# Linear discriminant analysis on all encoded features, using the estimator's defaults.
classification(LinearDiscriminantAnalysis, x_mushroom, y_mushroom) # TODO: check parameters

In [None]:
# Neural Network Multi-layer Perceptron Classifier on all encoded features.
# Seeded so repeated runs report the same metrics.

classification(MLPClassifier, x_mushroom, y_mushroom, random_state=42)

## COVID version: what if we lose our sense of smell?

Odor is obviously the most powerful predictive attribute. What happens to our models if we drop that variable and we can only identify mushrooms visually?

In [None]:
# create the dataframes and appropriate variables 
odorless = mushroom_df.drop(["odor","veil-type"], axis=1)
odorless.head()

odorless['class']= label_encoder.fit_transform(odorless['class']) # 0 is Edible, 1 is Poisonous
odorless_col_names_dummies=['cap-shape', 'cap-surface', 'cap-color', 'bruises?','gill-attachment',\
           'gill-spacing','gill-size','gill-color','stalk-shape','stalk-root','stalk-surface-above-ring',\
           'stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring',\
           'veil-color','ring-number','ring-type','spore-print-color','population','habitat'] 

# drop_first = True helps avoid multicollinearity
odorless_dummy = pd.get_dummies(odorless, columns=odorless_col_names_dummies, drop_first=True)
odorless_dummy.head()

In [None]:
# Pandas versions of the odor-free inputs and target (column names retained).
y_visual = odorless_dummy["class"]
x_visual = odorless_dummy.drop(columns=["class"])

# NumPy versions of the same data, plus an 80/20 split with a fixed seed.
no_odor_X = odorless_dummy.drop(columns=["class"]).to_numpy()
no_odor_y = odorless_dummy["class"].to_numpy()
X_train_vis, X_test_vis, y_train_vis, y_test_vis = train_test_split(
    no_odor_X, no_odor_y, test_size=0.2, random_state=42)

In [None]:
# Decision tree without the odor feature; seeded so repeated runs give the
# same tree and metrics.
classification(DecisionTreeClassifier, x_visual, y_visual, random_state=42)

In [None]:
# Support-vector classifier without the odor feature, with gamma="auto".
# NOTE(review): the original note says the default parameters scored better —
# confirm which setting should be kept.
classification(SVC, x_visual, y_visual, random_state=42, gamma="auto") # check on parameters, did better with default

In [None]:
# k-nearest-neighbours classifier (k=5) without the odor feature.
classification(KNeighborsClassifier, x_visual, y_visual, n_neighbors=5)

In [None]:
# Logistic regression without the odor feature, using the estimator's defaults.
classification(LogisticRegression, x_visual, y_visual) # TODO: check on parameters

In [None]:
# Linear discriminant analysis without the odor feature, using the estimator's defaults.
classification(LinearDiscriminantAnalysis, x_visual, y_visual) # TODO: check parameters

In [None]:
# Gaussian naive Bayes without the odor feature, using the estimator's defaults.
classification(GaussianNB, x_visual, y_visual)

In [None]:
# Random forest (100 trees, seeded) without the odor feature.
classification(RandomForestClassifier, x_visual, y_visual, n_estimators=100, random_state=42) # TODO: check on parameters

In [None]:
# Multi-layer perceptron without the odor feature; seeded so repeated runs
# report the same metrics.
classification(MLPClassifier, x_visual, y_visual, random_state=42)