# Mushroom's Dataset

### Using machine learning to create a model that predicts if a mushroom is edible or poisonus


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Loading Data

In [2]:
df = pd.read_csv("data/agaricus_lepiota_small_c.csv")

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     1000 non-null   object
 1   cap-shape                 1000 non-null   object
 2   cap-surface               1000 non-null   object
 3   cap-color                 1000 non-null   object
 4   bruises                   1000 non-null   object
 5   odor                      1000 non-null   object
 6   gill-attachment           1000 non-null   object
 7   gill-spacing              1000 non-null   object
 8   gill-size                 1000 non-null   object
 9   gill-color                1000 non-null   object
 10  stalk-shape               1000 non-null   object
 11  stalk-root                690 non-null    object
 12  stalk-surface-above-ring  1000 non-null   object
 13  stalk-surface-below-ring  1000 non-null   object
 14  stalk-color-above-ring   

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,e,x,s,y,t,a,f,w,b,g,...,s,w,w,p,w,o,p,n,v,d
1,e,f,s,y,f,n,f,c,b,p,...,s,w,w,p,w,o,f,n,y,g
2,e,k,s,w,f,c,f,w,b,g,...,s,w,n,p,w,t,e,w,n,g
3,e,f,f,n,t,n,f,c,b,w,...,s,g,w,p,w,o,p,k,v,d
4,p,x,s,w,t,p,f,c,n,w,...,s,w,w,p,w,o,p,n,s,u


## Preprocessing Data

- Tranforming and extracting the labes from df ('class')
- Imputation of a default value for the missing values in 'stalk-root' attribute 
- I decided to use one-hot-encoder in some attributes where the category type is determinant
- For the others, i used ordinal-encoder. Specifying some attributes where the order is most relevant

In [3]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

df['class'].replace('e', 0, inplace=True)
df['class'].replace('p', 1, inplace=True)

y = df['class'].to_numpy()

df.drop('class', axis=1, inplace=True)

df['stalk-root'].fillna('-', inplace=True)


ohe_labels = ['cap-shape','odor','gill-attachment','stalk-root','veil-type','ring-type','habitat']
oe_special_labels = ['ring-number','stalk-surface-above-ring','stalk-surface-below-ring','cap-surface']
oe_labels = [label for label in df.columns if label not in ohe_labels + oe_special_labels]

transformers = [
    ('ohe', OneHotEncoder(), ohe_labels),
    ('oe', OrdinalEncoder(), oe_labels),
    ('oe_ring_number', OrdinalEncoder(categories=[['n','o','t']]), ['ring-number']),
    ('oe_stalk_surface_above_ring', OrdinalEncoder(categories=[['f','y','k','s']]),['stalk-surface-above-ring']),
    ('oe_stalk_surface_below_ring', OrdinalEncoder(categories=[['f','y','k','s']]),['stalk-surface-below-ring']),
    ('oe_cap_surface', OrdinalEncoder(categories=[['f','g','y','s']]),['cap-surface']),
]

ct = ColumnTransformer(
    transformers, remainder='passthrough'
)

X = ct.fit_transform(df)

In [4]:
results = {
    "knn": {
        "kfold": {
            "scores": [],
            "predicts": []
        },
        "grid": {
            "scores": [],
            "predicts": []
        }
    },
    "svm": {
        "kfold": {
            "scores": [],
            "predicts": []
        },
        "grid": {
            "scores": [],
            "predicts": [],
            "params": [],
        }
    }
}

def show_scores_stats(scores_list):
    print("Accuracies scores statistics: min: %.2f, max: %.2f, avg +- std: %.2f+-%.2f" % (min(scores_list), max(scores_list), np.mean(scores_list), np.std(scores_list)))

## KNN

- First level - StratifiedKFold (k-vias = 10)

In [5]:
from utils import do_cv_knn

ks = range(1,30,2)
k_vias = 10
knn_kfold_cv_scores, knn_kfold_cv_predicts = do_cv_knn(X, y, k_vias, ks, score_type='f1')
results['knn']['kfold']['scores'], results['knn']['kfold']['predicts'] = knn_kfold_cv_scores, knn_kfold_cv_predicts

show_scores_stats(knn_kfold_cv_scores)
print("Best K in validation: %d (acc=%.2f)" % (ks[np.argmax(knn_kfold_cv_scores)], max(knn_kfold_cv_scores)))
knn_kfold_cv_scores

HBox(children=(FloatProgress(value=0.0, description='Processed folds', max=10.0, style=ProgressStyle(descripti…


Accuracies scores statistics: min: 0.90, max: 0.98, avg +- std: 0.93+-0.02
Best K in validation: 9 (acc=0.98)


[0.945054945054945,
 0.8958333333333334,
 0.8979591836734694,
 0.9148936170212766,
 0.9795918367346939,
 0.9484536082474228,
 0.9361702127659574,
 0.9361702127659574,
 0.9052631578947369,
 0.9361702127659575]

- Second level - GridSearchCV (k-vias = 5)

In [6]:
from utils import do_grid_cv_knn

ks = range(1,30,2)
k_vias = 5
knn_hp_cv_scores, knn_hp_cv_predicts = do_grid_cv_knn(X, y, k_vias, ks, score_type='f1')
results['knn']['grid']['scores'], results['knn']['grid']['predicts'] = knn_hp_cv_scores, knn_hp_cv_predicts

show_scores_stats(knn_hp_cv_scores)
print("Best K in validation: %d (acc=%.2f)" % (ks[np.argmax(knn_hp_cv_scores)], max(knn_hp_cv_scores)))
knn_hp_cv_scores

HBox(children=(FloatProgress(value=0.0, description='Processed folds', max=5.0, style=ProgressStyle(descriptio…


Accuracies scores statistics: min: 0.90, max: 0.95, avg +- std: 0.93+-0.02
Best K in validation: 5 (acc=0.95)


[0.9368421052631578,
 0.8958333333333334,
 0.9533678756476685,
 0.9263157894736843,
 0.9214659685863875]

## SVM

- First level - StratifiedKFold (k-vias = 10)

In [7]:
from utils import do_cv_svm

k_vias = 10
svm_kfold_cv_scores, svm_kfold_cv_predicts = do_cv_svm(X, y, k_vias, Cs=[1, 10, 100, 1000], gammas=['scale', 'auto', 2e-2, 2e-3, 2e-4], score_type='f1')
results['svm']['kfold']['scores'], results['svm']['kfold']['predicts'] = svm_kfold_cv_scores, svm_kfold_cv_predicts

show_scores_stats(svm_kfold_cv_scores)
svm_kfold_cv_scores

HBox(children=(FloatProgress(value=0.0, description='Processed folds', max=10.0, style=ProgressStyle(descripti…


Accuracies scores statistics: min: 0.91, max: 0.96, avg +- std: 0.94+-0.02


[0.9484536082474228,
 0.9072164948453607,
 0.9387755102040817,
 0.9574468085106383,
 0.9494949494949494,
 0.9583333333333334,
 0.9166666666666666,
 0.9347826086956522,
 0.9263157894736843,
 0.9484536082474228]

- Second level - GridSearchCV (k-vias = 5)

In [8]:
from utils import do_grid_cv_svm

k_vias = 5
svm_hp_cv_scores, svm_hp_cv_predicts, svm_hp_cv_params = do_grid_cv_svm(X, y, k_vias, Cs=[1, 10, 100, 1000], gammas=['scale', 'auto', 2e-2, 2e-3, 2e-4], score_type='f1')
results['svm']['grid']['scores'], results['svm']['grid']['predicts'], results['svm']['grid']['params'] = svm_hp_cv_scores, svm_hp_cv_predicts, svm_hp_cv_params

show_scores_stats(svm_hp_cv_scores)
print(f"Best C: {svm_hp_cv_params[np.argmax(svm_hp_cv_scores)]['C']} and  Gamma:{svm_hp_cv_params[np.argmax(svm_hp_cv_scores)]['gamma']} in validation:  (acc={ max(svm_hp_cv_scores)})")
svm_hp_cv_scores

HBox(children=(FloatProgress(value=0.0, description='Processed folds', max=5.0, style=ProgressStyle(descriptio…


Accuracies scores statistics: min: 0.90, max: 0.96, avg +- std: 0.93+-0.02
Best C: 100 and  Gamma:0.002 in validation:  (acc=0.9591836734693877)


[0.9032258064516129,
 0.9319371727748692,
 0.9591836734693877,
 0.9297297297297297,
 0.9479166666666666]

## Null Hypothesis Test by Student's T Test

In [9]:
from scipy.stats import ttest_ind_from_stats

minimum_confidence_level = 0.05

knn_scores = results['knn']['kfold']['scores'] + results['knn']['grid']['scores']

svm_scores = results['svm']['kfold']['scores'] + results['svm']['grid']['scores']

_, pvalue = ttest_ind_from_stats(np.mean(knn_scores), np.std(knn_scores), len(knn_scores), np.mean(svm_scores), np.std(svm_scores), len(svm_scores))

pvalue

0.26066579821095054

- Selecting the best classifier, KNN or SVM

In [11]:
if not pvalue <= minimum_confidence_level:
    print("The difference between the averages of the results of the classifiers it's not enough to determine if one is better than the other")
else:
    if np.mean(knn_scores) > np.mean(svm_scores):
        print(f"KNN achieved better results, with a hit average of {np.mean(knn_scores)}")
    else:
        print(f"SVM achieved better results, with a hit average of {np.mean(svm_scores)}")

The difference between the averages of the results of the classifiers it's not enough to determine if one is better than the other


## The final question:

#### Would you use some classifier that you created to decide whether or not to eat a mushroom ranked by it? Justify using the performance obtained and the result of the hypothesis test.

#### R: Yes, the hit rate in both was good, arround 93%. I think this performance and the ammount of data is good enough to trust in the classifier prediction. 