In [1]:
using CSV
using DataFrames
using PyPlot
using ScikitLearn # machine learning package
using Statistics
using Random
using LaTeXStrings # for L"$x$" to work instead of needing to do "\$x\$"
using Printf

rcParams = PyPlot.PyDict(PyPlot.matplotlib."rcParams")
rcParams["font.size"] = 16

16

# Is that Mushroom Edible????

The following project will be attempting to classify a species of mushroom as either poisonous or edible. The data set includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family. Each species is identied as definitely edible, definitely poisonous, or of unknown edibility and not recommended(labeled as poisonous). The goal is to use a _______ machine learning software to correctly predict whether or not a mushroom is edible. Each mushroom species in the data set has a classification along with 22 attributes listed below.

Attribute Information:

1. cap-shape: 
        b=bell
        c=conical
        x=convex
        f=flat
        k=knobbed
        s=knobbed
2. cap-surface: 
        f=fibrous
        g=grooves
        y=scaly
        s=smooth
3. cap-color: 
        n=brown
        b=buff
        c=cinnamon
        g=gray
        r=green
        p=pink
        u=purple
        e=red
        w=white
        y=yellow
4. bruises?: 
        t=bruises
        f=no
5. odor: 
        a=almond
        l=anise
        c=creosote
        y=fishy
        f=foul
        m=musty
        n=none
        p=pungent
        s=spicy
6. gill-attachment: 
        a=attached
        d=descending
        f=free
        n=notched
7. gill-spacing: 
        c=close
        w=crowded
        d=distant
8. gill-size: 
        b=broad
        n=narrow
9. gill-color: 
        k=black
        n=brown
        b=buff
        h=chocolate
        g=gray
        r=green
        o=orange
        p=pink
        u=purple
        e=red
        w=white
        y=yellow
10. stalk-shape: 
        e=enlarging
        t=tapering
11. stalk-root: 
        b=bulbous
        c=club
        u=cup
        e=equal
        z=rhizomorphs
        r=rooted
        ?=missing
12. stalk-surface-above-ring: 
        f=fibrous
        y=scaly
        k=silky
        s=smooth
13. stalk-surface-below-ring: 
        f=fibrous
        y=scaly
        k=silky
        s=smooth
14. stalk-color-above-ring: 
        n=brown
        b=buff
        c=cinnamon
        g=gray
        o=orange
        p=pink
        e=red
        w=white
        y=yellow
15. stalk-color-below-ring: 
        n=brown
        b=buff
        c=cinnamon
        g=gray
        o=orange
        p=pink
        e=red
        w=white
        y=yellow
16. veil-type: 
        p=partial
        u=universal
17. veil-color: 
        n=brown
        o=orange
        w=white
        y=yellow
18. ring-number: 
        n=none
        o=one
        t=two
19. ring-type: 
        c=cobwebby
        e=evanescent
        f=flaring
        l=large
        n=none
        p=pendant
        s=sheathing
        z=zone
20. spore-print-color: 
        k=black
        n=brown
        b=buff
        h=chocolate
        r=green
        o=orange
        u=purple
        w=white
        y=yellow
21. population: 
        a=abundant
        c=clustered
        n=numerous
        s=scattered
        v=several
        y=solitary
22. habitat: 
        g=grasses
        l=leaves
        m=meadows
        p=paths
        u=urban
        w=waste
        d=woods

The logical rules to determine whether a mushroom is edible or not that has proven to be the most successful are as follows. 

    P_1) odor=NOT(almond.OR.anise.OR.none)
	     120 poisonous cases missed, 98.52% accuracy

	P_2) spore-print-color=green
	     48 cases missed, 99.41% accuracy
         
	P_3) odor=none.AND.stalk-surface-below-ring=scaly.AND.
	          (stalk-color-above-ring=NOT.brown) 
	     8 cases missed, 99.90% accuracy
         
	P_4) habitat=leaves.AND.cap-color=white
	         100% accuracy     

	Rule P_4) may also be

	P_4') population=clustered.AND.cap_color=white


In [2]:
#here I have loaded in the data and specified that the dat begins on the 1st row so that I may asign headers 
df = CSV.read("agaricus-lepiota.data", datarow=1, copycols=true)

#I am assigning headers to each attribute 
df = names!(df, [:class, :cap_shape, :cap_surface, :cap_color, :bruises, :odor, :gill_attachment, :gill_spacing, :gill_size, 
        :gill_color, :stalk_shape, :stalk_root, :stalk_surface_above_ring, :stalk_surface_below_ring, :stalk_color_above_ring,
        :stalk_color_below_ring, :veil_type, :veil_color, :ring_number, :ring_type, :spore_print_color, :population, :habitat])

#just checking!
first(df, 6)



Unnamed: 0_level_0,class,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing
Unnamed: 0_level_1,String,String,String,String,String,String,String,String
1,p,x,s,n,t,p,f,c
2,e,x,s,y,t,a,f,c
3,e,b,s,w,t,l,f,c
4,p,x,y,w,t,p,f,c
5,e,x,s,g,f,n,f,w
6,e,x,y,y,t,a,f,c


# Data Cleaning

Before I can work with the data I have to change the data into bianary information. 

I first itterate through each of the original attributes, determine how many unique identifiers are in each attribute, create new attributes with these unique identifiers, then itterate through each row of the dataframe and determine which of the unique attributes is true for a given species. 

In [18]:
#how many original attributes are there?
n_mushrooms = nrow(df)
size = NaN*zeros(n_mushrooms)

#what are the names of all of the columns 
attributes = names(df)

#initiate the new dataframe
df_mushrooms = DataFrame()

#iterate through each of the columns 
for (i, attribute) in enumerate(attributes)
    
    #find the unique types within each attribute
    attribute_types = unique(df[!, attribute])
    
    #iterate through each of those unique types 
    for (j, name)  in enumerate(attribute_types)
        #create a symbol that can be used to name the column with the new name 
        new_attributes = Symbol(":$(attribute)_$name")
        #print(name)
        new_values = []
            for (h, value) in enumerate(eachrow(df[:, attribute]))
                #print(value)
                if value[1] == String(name) #this almost works but not quite 
                    push!(new_values, true)
                   
                else 
                    push!(new_values, false)
                    
                end
            end
        
        #add the new column to the dataframe 
        df_mushrooms[!, new_attributes] = vcat(new_values)
    end
   
end


Unnamed: 0_level_0,:class_p,:class_e,:cap_shape_x,:cap_shape_b,:cap_shape_s,:cap_shape_f,:cap_shape_k
Unnamed: 0_level_1,Any,Any,Any,Any,Any,Any,Any
1,1,0,1,0,0,0,0
2,0,1,1,0,0,0,0
3,0,1,0,1,0,0,0
4,1,0,1,0,0,0,0
5,0,1,1,0,0,0,0
6,0,1,1,0,0,0,0
7,0,1,0,1,0,0,0
8,0,1,0,1,0,0,0
9,1,0,1,0,0,0,0
10,0,1,0,1,0,0,0


I now need to fill in the correct data with Boolean values

n_mushrooms = nrow(df)

X = zeros(n_mushrooms, 22)
fill!(X, NaN)
y = zeros(n_mushrooms)
fill!(y, NaN)

for (i, mushrooms) in enumerate(eachrow(df))
    X[i, 1] = mushrooms[:cap_shape]
    X[i, 2] = mushrooms[:cap_surface]
    X[i, 3] = mushrooms[:cap_color]
    X[i, 4] = mushrooms[:bruises]
    X[i, 5] = mushrooms[:odor]
    X[i, 6] = mushrooms[:gill_attachment]
    X[i, 7] = mushrooms[:gill_spacing]
    X[i, 8] = mushrooms[:gill_size]
    X[i, 9] = mushrooms[:gill_color]
    X[i, 10] = mushrooms[:stalk_shape]
    X[i, 11] = mushrooms[:stalk_root]
    X[i, 12] = mushrooms[:stalk_surface_above_ring]
    X[i, 13] = mushrooms[:stalk_surface_below_ring]
    X[i, 14] = mushrooms[:stalk_color_above_ring]
    X[i, 15] = mushrooms[:stalk_color_below_ring]
    X[i, 16] = mushrooms[:veil_type]
    X[i, 17] = mushrooms[:veil_color]
    X[i, 18] = mushrooms[:ring_number]
    X[i, 19] = mushrooms[:ring_type]
    X[i, 20] = mushrooms[:spore_print_color]
    X[i, 21] = mushrooms[:population]
    X[i, 22] = mushrooms[:habitat]
  
    y[i] = mushrooms[:class]
end
X

In [19]:
CSV.write("df_mushrooms.csv", df_mushrooms)


"df_mushrooms.csv"