# DAV 5400 Project 4

In this project we will be using the UCI Mushroom Data Set. The source of the dataset can be found [here](https://archive.ics.uci.edu/ml/datasets/mushroom)

In [27]:
# Import pandas
import pandas as pd

# Import numpy
import numpy as np

# Import Series and DataFrame
from pandas import Series, DataFrame

# load the pyplot function from the matplotlib library
import matplotlib.pyplot as plt

# Import seaborn 
import seaborn as sns

In [28]:
# Location of the raw mushroom data file on GitHub
DATA_URL = 'https://raw.githubusercontent.com/Humayrakausar/AIM-5001/master/agaricus-lepiota%20(1).data'

# Load the file into a data frame; header=None tells pandas the first line is
# a data record, so the columns are labelled by position instead
mushroom = pd.read_csv(DATA_URL, header=None)

# Preview the first rows to confirm the data loaded properly
mushroom.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [29]:
# Report the dimensions of the loaded dataset as a (rows, columns) tuple
mushroom.shape

(8124, 23)

The dataset contains 8124 rows and 23 columns

# Introduction

# Data Dictionary 

According to the website, the dataset includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family (pp. 500-525). Each species is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended; the dataset combines this last category with the poisonous class, which is why only two class labels appear below.

The dataset consists of 8124 instances and 22 attributes (plus the class label, which gives the 23 columns seen above); all of the attributes are nominally valued.

Classes: edible = e, poisonous = p  

According to the website, the attributes of the data are as follows:

1. cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r, pink=p,purple=u,red=e,white=w,yellow=y
4. bruises?: bruises=t,no=f
5. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s
6. gill-attachment: attached=a,descending=d,free=f,notched=n
7. gill-spacing: close=c,crowded=w,distant=d
8. gill-size: broad=b,narrow=n
9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e, white=w,yellow=y
10. stalk-shape: enlarging=e,tapering=t
11. stalk-root: bulbous=b,club=c,cup=u,equal=e, rhizomorphs=z,rooted=r,missing=?
12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
16. veil-type: partial=p,universal=u
17. veil-color: brown=n,orange=o,white=w,yellow=y
18. ring-number: none=n,one=o,two=t
19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l, none=n,pendant=p,sheathing=s,zone=z
20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r, orange=o,purple=u,white=w,yellow=y
21. population: abundant=a,clustered=c,numerous=n, scattered=s,several=v,solitary=y
22. habitat: grasses=g,leaves=l,meadows=m,paths=p, urban=u,waste=w,woods=d


# Data Preparation

**In this stage, we will create a sub dataframe with the columns that indicate edible or poisonous, and odor along with other columns to satisfy the project requirements.**

In [30]:
# Positions of the columns to keep: 0 = class, 1 = cap shape, 5 = odor, 17 = veil color
selected_positions = [0, 1, 5, 17]

# Use iloc to create the sub dataframe; .copy() makes it independent of `mushroom`
mushroom_sub = mushroom.iloc[:, selected_positions].copy()
mushroom_sub.head()

Unnamed: 0,0,1,5,17
0,p,x,p,w
1,e,x,a,w
2,e,b,l,w
3,p,x,p,w
4,e,x,n,w


In [31]:
# Count the null (NaN) entries in each column of the sub dataframe:
# isna() flags every null cell and sum() totals the flags per column
mushroom_sub.isna().sum()

0     0
1     0
5     0
17    0
dtype: int64

From the result above, it can be stated that there are no null values in the selected columns.

In [33]:
# Give each column a descriptive name in place of its positional label
column_names = ['Class', 'Cap_shape', 'Odor', 'Veil_color']
mushroom_sub.columns = column_names

# Preview the renamed frame
mushroom_sub.head()

Unnamed: 0,Class,Cap_shape,Odor,Veil_color
0,p,x,p,w
1,e,x,a,w
2,e,b,l,w
3,p,x,p,w
4,e,x,n,w


Now we will transform all the single-letter abbreviations into full words for better readability.

In [43]:
# Transform every column's single-letter abbreviations into words.
# The mappings follow the data dictionary above, keyed by column name.
value_labels = {
    'Class': {'e': 'edible', 'p': 'poisonous'},
    'Cap_shape': {'b': 'bell', 'c': 'conical', 'x': 'convex', 'f': 'flat', 'k': 'knobbed',
                  's': 'sunken'},
    'Odor': {'a': 'almond', 'l': 'anise', 'c': 'creosote', 'y': 'fishy', 'f': 'foul', 'm': 'musty',
             'n': 'none', 'p': 'pungent', 's': 'spicy'},
    'Veil_color': {'n': 'brown', 'o': 'orange', 'w': 'white', 'y': 'yellow'},
}

# Apply all mappings in a single DataFrame.replace call and assign the result back.
# Calling `mushroom_sub.<column>.replace(..., inplace=True)` is chained assignment:
# with pandas Copy-on-Write it only changes a temporary Series and leaves the
# frame untouched. A nested dict restricts each mapping to its own column, and
# values that are not in a mapping are left as they are, so re-running this
# cell is harmless.
mushroom_sub = mushroom_sub.replace(value_labels)

# To check whether all the intended transformations have completed properly
mushroom_sub.head()

Unnamed: 0,Class,Cap_shape,Odor,Veil_color
0,poisonous,convex,pungent,white
1,edible,convex,almond,white
2,edible,bell,anise,white
3,poisonous,convex,pungent,white
4,edible,convex,none,white


From the data above, it can be stated that all the column values were transformed properly.

# Exploratory Data Analysis

### Data distribution of each of the columns above

In [44]:
# Use describe to summarise each column; for these categorical columns it
# reports the count, the number of unique values, the most frequent value
# (top) and how often that value occurs (freq)
mushroom_sub.describe()

Unnamed: 0,Class,Cap_shape,Odor,Veil_color
count,8124,8124,8124,8124
unique,2,6,9,4
top,edible,convex,none,white
freq,4208,3656,3528,7924


### Creating dummy variables for each of the columns

In [45]:
# Convert the edible and poisonous indicators in the Class column into digits
class_codes = {'edible': 0, 'poisonous': 1}

# Assign the result back to the column rather than calling
# `mushroom_sub.Class.replace(..., inplace=True)`: that form is chained
# assignment and does not update the frame under pandas Copy-on-Write.
# The explicit astype gives an integer column without relying on replace to
# downcast the object column, so get_dummies will treat Class as numeric.
mushroom_sub['Class'] = mushroom_sub['Class'].replace(class_codes).astype('int64')

# To check whether the class column values converted properly
mushroom_sub.head()

Unnamed: 0,Class,Cap_shape,Odor,Veil_color
0,1,convex,pungent,white
1,0,convex,almond,white
2,0,bell,anise,white
3,1,convex,pungent,white
4,0,convex,none,white


From the data above, it can be inferred that the Class column values were converted properly.

In [51]:
# Create dummy variables for the rest of the columns (everything except Class).
# Naming the columns explicitly keeps Class out of the encoding whatever its
# dtype is, and dtype='uint8' makes the indicators 0/1 integers on every pandas
# version (recent versions default to True/False).
categorical_columns = ['Cap_shape', 'Odor', 'Veil_color']
m_dummy = pd.get_dummies(mushroom_sub, columns=categorical_columns, dtype='uint8')

# To check whether the dummy variables were created properly
m_dummy.head()

Unnamed: 0,Class,Cap_shape_bell,Cap_shape_conical,Cap_shape_convex,Cap_shape_flat,Cap_shape_knobbed,Cap_shape_sunken,Odor_almond,Odor_anise,Odor_creosote,Odor_fishy,Odor_foul,Odor_musty,Odor_none,Odor_pungent,Odor_spicy,Veil_color_brown,Veil_color_orange,Veil_color_white,Veil_color_yellow
0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
3,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


From the data above, it can be stated that the dummy variables have been created properly for all the intended columns.