## Mushroom Classification
A project that aims to classify mushrooms as poisonous or edible based on their recorded characteristics.

In [2]:
# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data cleaning
from sklearn.preprocessing import LabelEncoder

# Display plots in the notebook
%matplotlib inline

In [95]:
# Read the training and test CSV files and use their `id` column as the row index.
# NOTE(review): both paths are absolute and specific to one machine; anyone else
# running this notebook has to edit them before this cell will work.
train_csv = 'C:/Users/liaba/Documents/mushroom_classification/mushroom_data/train.csv'
test_csv = 'C:/Users/liaba/Documents/mushroom_classification/mushroom_data/test.csv'

data = pd.read_csv(train_csv).set_index('id')
test_data = pd.read_csv(test_csv).set_index('id')

In [96]:
# Make sure the three continuous measurements are stored with a numeric dtype.
# pd.to_numeric raises if a value cannot be parsed, so bad input fails loudly here.
for measurement in ('cap-diameter', 'stem-height', 'stem-width'):
    data[measurement] = pd.to_numeric(data[measurement])

In [97]:
# Integer-encode every categorical column of `data` in place.
#
# For each column, any value outside its allowed set (NaN included) is first
# replaced by the placeholder '0', then every value is looked up in the
# matching *_enc dictionary. The target column `class` is mapped directly.

# Allowed category codes per feature
class_val = {'e', 'p'}
cap_val = {'b', 'c', 'x', 'f', 's', 'p', 'o'}
surf_val = {'i', 'g', 'y', 's', 'h', 'l', 'k', 't', 'w', 'e'}
color_val = {'n', 'b', 'g', 'r', 'p', 'u', 'e', 'w', 'y', 'l', 'o', 'k'}
bruise_val = {'t', 'f'}
gill_val = {'a', 'x', 'd', 'e', 's', 'p', 'f'}
spacing_val = {'c', 'd', 'f'}
gill_col_val = {'n', 'b', 'g', 'r', 'p', 'u', 'e', 'w', 'y', 'l', 'o', 'k', 'f'}
root_val = {'b', 's', 'c', 'u', 'e', 'z', 'r'}
veil_val = {'p', 'u'}
ring_val = {'t', 'f'}
ing_val = {'c', 'e', 'r', 'g', 'l', 'p', 's', 'z', 'y', 'm', 'f'}  # used for 'ring-type'
habitat_val = {'g', 'l', 'm', 'p', 'h', 'u', 'w', 'd'}
season_val = {'s', 'u', 'a', 'w'}

# Integer code for every allowed value; the placeholder '0' always maps to 0.
# NOTE(review): in bruise_enc and ring_enc both 'f' and the placeholder '0'
# map to 0, so a missing/invalid value cannot be told apart from 'f' there.
class_enc = {'e': 1, 'p': 0}
cap_enc = {'b': 1, 'c': 2, 'x': 3, 'f': 4, 's': 5, 'p': 6, 'o': 7, '0': 0}
surf_enc = {'i': 1, 'g': 2, 'y': 3, 's': 4, 'h': 5, 'l': 6, 'k': 7, 't': 8, 'w': 9, 'e': 10, '0': 0}
color_enc = {'n': 1, 'b': 2, 'g': 3, 'r': 4, 'p': 5, 'u': 6, 'e': 7, 'w': 8, 'y': 9, 'l': 10, 'o': 11, 'k': 12, '0': 0}
bruise_enc = {'t': 1, 'f': 0, '0':0}
gill_enc = {'a': 1, 'x': 2, 'd': 3, 'e': 4, 's': 5, 'p': 6, 'f': 7, '0': 0}
spacing_enc = {'c': 1, 'd': 2, 'f': 3, '0': 0}
gill_col_enc = {'n': 1, 'b': 2, 'g': 3, 'r': 4, 'p': 5, 'u': 6, 'e': 7, 'w': 8, 'y': 9, 'l': 10, 'o': 11, 'k': 12, 'f':13, '0': 0}
root_enc = {'b': 1, 's': 2, 'c': 3, 'u': 4, 'e': 5, 'z': 6, 'r': 7, '0': 0}
veil_enc = {'p': 1, 'u': 2, '0': 0}
ring_enc = {'t': 1, 'f': 0, '0': 0}
ing_enc = {'c': 1, 'e': 2, 'r': 3, 'g': 4, 'l': 5, 'p': 6, 's': 7, 'z': 8, 'y': 9, 'm': 10, 'f': 11, '0': 0}
habitat_enc = {'g': 1, 'l': 2, 'm': 3, 'p': 4, 'h': 5, 'u': 6, 'w': 7, 'd': 8, '0': 0}
season_enc = {'s': 1, 'u': 2, 'a': 3, 'w': 4, '0': 0}


def encode_categorical(column, allowed, encoding):
    """Integer-encode one categorical column.

    Parameters
    ----------
    column : pd.Series
        Raw category codes.
    allowed : set
        Codes accepted as valid for this column.
    encoding : dict
        Maps every allowed code, plus the placeholder '0', to an integer.

    Returns
    -------
    pd.Series
        A new series in which values outside `allowed` (including NaN) were
        replaced by '0' and every value was then looked up in `encoding`.
    """
    cleaned = column.where(column.isin(allowed), '0')
    return cleaned.map(encoding)


# Which allowed-set / encoding pair applies to which feature column.
# stem-color and veil-color deliberately reuse the gill-color tables, and
# stem-surface reuses the cap-surface tables.
CATEGORICAL_SPECS = {
    'cap-shape': (cap_val, cap_enc),
    'cap-surface': (surf_val, surf_enc),
    'cap-color': (color_val, color_enc),
    'does-bruise-or-bleed': (bruise_val, bruise_enc),
    'gill-attachment': (gill_val, gill_enc),
    'gill-spacing': (spacing_val, spacing_enc),
    'gill-color': (gill_col_val, gill_col_enc),
    'stem-root': (root_val, root_enc),
    'stem-surface': (surf_val, surf_enc),
    'stem-color': (gill_col_val, gill_col_enc),
    'veil-type': (veil_val, veil_enc),
    'veil-color': (gill_col_val, gill_col_enc),
    'has-ring': (ring_val, ring_enc),
    'ring-type': (ing_val, ing_enc),
    'spore-print-color': (color_val, color_enc),
    'habitat': (habitat_val, habitat_enc),
    'season': (season_val, season_enc),
}

for feature, (allowed, encoding) in CATEGORICAL_SPECS.items():
    # A column that already holds integers was encoded by an earlier run of
    # this cell; encoding it again would turn every value into 0.
    if pd.api.types.is_integer_dtype(data[feature]):
        continue
    data[feature] = encode_categorical(data[feature], allowed, encoding)

# Target: 'e' -> 1, 'p' -> 0. Any other label becomes NaN.
# Same re-run guard as above: mapping already-encoded integers would yield NaN.
if not pd.api.types.is_integer_dtype(data['class']):
    data['class'] = data['class'].map(class_enc)


In [98]:
# Show the first 50 rows to eyeball the result of the encoding step
data.head(n=50)

Unnamed: 0_level_0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,8.8,4,4,6,0,1,1,8,4.51,...,0,0,8,0,0,0,11,0,8,3
1,0,4.51,3,5,11,0,1,1,1,4.79,...,0,3,11,0,0,1,8,0,8,4
2,1,6.94,4,4,2,0,2,1,8,6.85,...,0,4,1,0,0,0,11,0,2,4
3,1,3.88,4,3,3,0,5,0,3,4.16,...,0,0,8,0,0,0,11,0,8,2
4,1,5.85,3,6,8,0,3,0,8,3.37,...,0,0,8,0,0,0,11,0,1,3
5,0,4.3,3,8,1,0,5,1,1,5.91,...,0,0,8,0,1,1,8,0,8,3
6,1,9.65,6,3,8,0,4,1,12,19.07,...,0,4,8,0,0,1,2,0,1,4
7,0,4.55,3,10,7,0,1,0,9,8.31,...,0,0,9,0,8,1,8,0,8,3
8,0,7.36,4,5,7,0,2,2,8,5.77,...,1,0,8,0,0,0,11,0,8,3
9,1,6.45,3,8,1,0,1,2,8,7.13,...,0,0,7,0,0,0,11,0,8,3
