# Data Preprocessing
Data preprocessing is a critical step in machine learning that often determines the success of a model. We are seeking to enhance our data preprocessing in our machine learning project.

## Objective
Give the numerical representation of the categorical data such that it can be used for classification of whether a mushroom is 'poisonous' or 'edible'.

## Tasks
- Improve the data preprocessing workflow.
- Data Cleaning & Transformation.
- Feature Engineering.
- Encoding of categorical data, and also provide reason behind the use of any particular encoding technique.

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [18]:
df = pd.read_csv('mushroom.csv')

print(df.head())

   Unnamed: 0 cap-shape cap-surface cap-color  bruises     odor  \
0           0    convex      smooth     brown  bruises  pungent   
1           1    convex      smooth    yellow  bruises   almond   
2           2      bell      smooth     white  bruises    anise   
3           3    convex       scaly     white  bruises  pungent   
4           4    convex      smooth      gray       no     none   

  gill-attachment gill-spacing gill-size gill-color  ...  \
0            free        close    narrow      black  ...   
1            free        close     broad      black  ...   
2            free        close     broad      brown  ...   
3            free        close    narrow      brown  ...   
4            free      crowded     broad      black  ...   

  stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
0                  white                  white   partial      white   
1                  white                  white   partial      white   
2                  w

In [19]:
# Checking whether there are any missing values
print(df.isnull().sum())

Unnamed: 0                     0
cap-shape                      0
cap-surface                    0
cap-color                      0
bruises                        0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
poisonous                      0
dtype: int64


In [20]:
# Imputing null values in stalk-root with the most frequent value (mode)
most_frequent = df['stalk-root'].mode()[0]

# Filling the missing values with the most frequent
df['stalk-root'].fillna(most_frequent, inplace=True)

# Verify that the missing values are filled
print(df['stalk-root'].isnull().sum())

0


In [21]:
# Confirming the missing entires once again
print(df.isnull().sum())

Unnamed: 0                  0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
poisonous                   0
dtype: int64


In [22]:
# Combining cap features, stalk features, gill features, veil features and ring features

df['cap_combined'] = df['cap-shape'] + '_' + df['cap-color'] + '_' + df['cap-surface']
df = df.drop(['cap-shape', 'cap-color', 'cap-surface'], axis=1)

df['gill_combined'] = df['gill-attachment'] + '_' + df['gill-spacing'] + '_' + df['gill-size'] + '_' + df['gill-color']
df = df.drop(['gill-attachment', 'gill-spacing', 'gill-size', 'gill-color'], axis=1)

df['stalk_combined'] = df['stalk-color-above-ring'] + '_' + df['stalk-color-below-ring'] + '_' + df['stalk-root'] + '_' + df['stalk-shape'] + '_' + df['stalk-surface-above-ring'] + '_' + df['stalk-surface-below-ring']
df = df.drop(['stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-below-ring', 'stalk-color-above-ring', 'stalk-shape', 'stalk-root'], axis=1)

df['veil_combined'] = df['veil-color'] + '_' + df['veil-type']
df = df.drop(['veil-color', 'veil-type'], axis=1)

df['ring_combined'] = df['ring-number'] + '_' + df['ring-type']
df = df.drop(['ring-number', 'ring-type'], axis=1)

In [23]:
print(df.head())

   Unnamed: 0  bruises     odor spore-print-color population  habitat  \
0           0  bruises  pungent             black  scattered    urban   
1           1  bruises   almond             brown   numerous  grasses   
2           2  bruises    anise             brown   numerous  meadows   
3           3  bruises  pungent             black  scattered    urban   
4           4       no     none             brown   abundant  grasses   

  poisonous          cap_combined             gill_combined  \
0         p   convex_brown_smooth   free_close_narrow_black   
1         e  convex_yellow_smooth    free_close_broad_black   
2         e     bell_white_smooth    free_close_broad_brown   
3         p    convex_white_scaly   free_close_narrow_brown   
4         e    convex_gray_smooth  free_crowded_broad_black   

                              stalk_combined  veil_combined   ring_combined  
0  white_white_equal_enlarging_smooth_smooth  white_partial     one_pendant  
1   white_white_club_enlar

In [24]:
# Encoding
# Bruises: Label encoding no to 0 and bruises to 1
df['bruises'] = df['bruises'].map({'no': 0, 'bruises': 1})
# Odor: No inherent order, so one-hot encoding
df = pd.get_dummies(df, columns=['odor'])
# Spore Print Color: No order, so one-hot encoding
df = pd.get_dummies(df, columns=['spore-print-color'])
# Population, Habitat, and combined categories: similarly, we can use one-hot encoding as no inherent order of the categorical data
df = pd.get_dummies(df, columns=['population', 'habitat', 'cap_combined', 'gill_combined', 'stalk_combined', 'veil_combined', 'ring_combined'])

In [25]:
# Label encoding the target column
df['poisonous'] = df['poisonous'].map({'e': 0, 'p': 1})

# Moving target column to the end
target_column = df.pop('poisonous')
df['poisonous'] = target_column

print(df.head())

   Unnamed: 0  bruises  odor_almond  odor_anise  odor_creosote  odor_fishy  \
0           0        1        False       False          False       False   
1           1        1         True       False          False       False   
2           2        1        False        True          False       False   
3           3        1        False       False          False       False   
4           4        0        False       False          False       False   

   odor_foul  odor_musty  odor_none  odor_pungent  ...  \
0      False       False      False          True  ...   
1      False       False      False         False  ...   
2      False       False      False         False  ...   
3      False       False      False          True  ...   
4      False       False       True         False  ...   

   veil_combined_white_partial  veil_combined_yellow_partial  \
0                         True                         False   
1                         True                        

In [27]:
# Converting dummy variables to integers
df = df.astype(int)

# Converting it into csv
df.to_csv('mushroom_preprocessed.csv', index=False)