# IDENTIFYING POISONOUS MUSHROOMS WITH RULE LEARNERS

Rosa Karina Torres Calderon

In [4]:
import pandas as pd
import numpy as np
import weka.core.jvm as jvm
import weka.core.converters as conv
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

# Step 1- Reading the data set

In [5]:
# Load the raw mushroom table, then show its first five rows and its dimensions.
mushroom = pd.read_csv("mushrooms.csv")
print(mushroom.head(5), " ", sep="\n")
print(f"Tamaño del data frame: {mushroom.shape}")

  type cap_shape cap_surface cap_color bruises odor gill_attachment  \
0    p         x           s         n       t    p               f   
1    e         x           s         y       t    a               f   
2    e         b           s         w       t    l               f   
3    p         x           y         w       t    p               f   
4    e         x           s         g       f    n               f   

  gill_spacing gill_size gill_color  ... stalk_surface_below_ring  \
0            c         n          k  ...                        s   
1            c         b          k  ...                        s   
2            c         b          n  ...                        s   
3            c         n          n  ...                        s   
4            w         b          k  ...                        s   

  stalk_color_above_ring stalk_color_below_ring veil_type veil_color  \
0                      w                      w         p          w   
1             

# Step 2- Exploring and preparing the data

In [6]:
# Summarize the frame: column names, non-null counts and dtypes.
mushroom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
type                        8124 non-null object
cap_shape                   8124 non-null object
cap_surface                 8124 non-null object
cap_color                   8124 non-null object
bruises                     8124 non-null object
odor                        8124 non-null object
gill_attachment             8124 non-null object
gill_spacing                8124 non-null object
gill_size                   8124 non-null object
gill_color                  8124 non-null object
stalk_shape                 8124 non-null object
stalk_root                  8124 non-null object
stalk_surface_above_ring    8124 non-null object
stalk_surface_below_ring    8124 non-null object
stalk_color_above_ring      8124 non-null object
stalk_color_below_ring      8124 non-null object
veil_type                   8124 non-null object
veil_color                  8124 non-null object
ring_number

In [7]:
# Inspect the veil_type feature: how many rows fall under each of its values?
veil_type_counts = mushroom["veil_type"].value_counts()
print(veil_type_counts)

p    8124
Name: veil_type, dtype: int64


In [8]:
# Every example has veil_type == "p" (partial), so the feature carries no
# information for telling the classes apart; delete it.
mushroom = mushroom.drop(columns=['veil_type'])
print(f"Tamaño del data frame: {mushroom.shape}")

Tamaño del data frame: (8124, 22)


In [9]:
# Convert the one-letter class codes (e / p) into readable labels and store the
# column as a categorical.
# Series.map with a plain dict turns every value that is not a key into NaN, so
# re-running this cell (when the column already holds "edible"/"poisonous")
# would silently wipe the whole column. Falling back to the value itself keeps
# the cell idempotent.
type_labels = {"e": "edible", "p": "poisonous"}
mushroom["type"] = (
    mushroom["type"]
    .map(lambda code: type_labels.get(code, code))
    .astype("category")
)

In [10]:
# Show how many mushrooms belong to each class of the target variable.
print("Número de tipos de hongos: ", mushroom["type"].value_counts(), sep="\n")

Número de tipos de hongos: 
edible       4208
poisonous    3916
Name: type, dtype: int64


In [11]:
# Get the percentage of edible and poisonous mushrooms.
# Counts are looked up by class label. Integer keys such as percent[0] only work
# through pandas' deprecated positional fallback on a label-indexed Series, and
# they assume value_counts() lists "edible" first (it sorts by frequency), so the
# printed labels would be swapped if poisonous mushrooms were the majority.
percent = mushroom["type"].value_counts()
total = percent.sum()
edible = (percent["edible"] * 100) / total
poisonous = (percent["poisonous"] * 100) / total
print("Porcentaje hongos comestibles: " + str(edible) + ' %')
print("Porcentaje hongos venenosos: " + str(poisonous) + ' %')

Porcentaje hongos comestibles: 51.7971442639094 %
Porcentaje hongos venenosos: 48.2028557360906 %


In [12]:
# Disabled: splitting the target off the DataFrame is not used in this notebook.
# The Weka cells below load mushrooms.csv themselves and mark the first
# attribute as the class with class_is_first().
#mushroom_target = mushroom.loc[:,'type']
#mushroom = mushroom.drop('type', axis=1)

# Step 3- Training a Model on the data

For the purposes of this experiment, we will not split the dataset into training and test sets; we only want to find rules that accurately describe the complete set of known mushroom types.

In [15]:
# One Rule (OneR) implementation: build a Weka OneR classifier on the full data set.
# jvm.start() is required before any Weka class is used; if the JVM is already
# up it only logs "JVM already running, call jvm.stop() first".
jvm.start()
# NOTE(review): the data is read again from the raw CSV, so the pandas
# preparation above (dropping veil_type, renaming the class labels) is not
# applied to what Weka sees - confirm this is intended.
data = conv.load_any_file("mushrooms.csv")
# The class attribute ("type") is the first column of the file.
data.class_is_first()
cname = "weka.classifiers.rules.OneR"
cls = Classifier(classname=cname, options=None)
# Train on every instance; `data` and `cls` are reused by the evaluation cell.
cls.build_classifier(data)
print('\nMushrooms R1')
# Printing the classifier shows the learned rule and its accuracy on the training data.
print(cls)

INFO:weka.core.jvm:JVM already running, call jvm.stop() first



Mushrooms R1
odor:
	p	-> p
	a	-> e
	l	-> e
	n	-> e
	f	-> p
	c	-> p
	y	-> p
	s	-> p
	m	-> p
(8004/8124 instances correct)



# Step 4- Evaluating model performance

In [16]:
# Cross-validate the classifier currently held in `cls` on `data` and report
# the summary statistics, the confusion matrix and the headline counts.
evaluation = Evaluation(data)
evaluation.crossvalidate_model(cls, data, 10, Random(42))  # 10 folds, random seed 42
print("=== Summary ==", evaluation.summary())
print(evaluation.matrix("=== Confusion matrix ==="))
print(f"pctCorrect: {evaluation.percent_correct}")
print(f"incorrect: {evaluation.incorrect}")

=== Summary == 
Correctly Classified Instances        8004               98.5229 %
Incorrectly Classified Instances       120                1.4771 %
Kappa statistic                          0.9704
Mean absolute error                      0.0148
Root mean squared error                  0.1215
Relative absolute error                  2.958  %
Root relative squared error             24.323  %
Total Number of Instances             8124     

=== Confusion matrix ===
    a    b   <-- classified as
 3796  120 |    a = p
    0 4208 |    b = e

pctCorrect: 98.52289512555392
incorrect: 120.0


# Step 5- Improving model performance

In [17]:
# RIPPER rule learning algorithm (Weka's JRip): build it on the same `data`
# object that the OneR cell loaded.
# Re-assert that the first attribute is the class before training.
data.class_is_first()
cname = "weka.classifiers.rules.JRip"
# `cls` is rebound here, so the evaluation cell that follows scores JRip.
cls = Classifier(classname=cname, options=None)
cls.build_classifier(data)
# Printing the classifier lists the learned rules.
print('\n',cls,'\n')


 JRIP rules:

(odor = f) => type=p (2160.0/0.0)
(gill_size = n) and (gill_color = b) => type=p (1152.0/0.0)
(gill_size = n) and (odor = p) => type=p (256.0/0.0)
(odor = c) => type=p (192.0/0.0)
(spore_print_color = r) => type=p (72.0/0.0)
(stalk_surface_below_ring = y) and (stalk_surface_above_ring = k) => type=p (68.0/0.0)
(habitat = l) and (cap_color = w) => type=p (8.0/0.0)
(stalk_color_above_ring = y) => type=p (8.0/0.0)
 => type=e (4208.0/0.0)

Number of Rules : 9
 



In [18]:
# Cross-validate the classifier currently held in `cls` on `data` and report
# the summary statistics, the confusion matrix and the headline counts.
evaluation = Evaluation(data)
evaluation.crossvalidate_model(cls, data, 10, Random(42))  # 10 folds, random seed 42
print("=== Summary ==", evaluation.summary())
print(evaluation.matrix("=== Confusion matrix ==="))
print(f"pctCorrect: {evaluation.percent_correct}")
print(f"incorrect: {evaluation.incorrect}")

=== Summary == 
Correctly Classified Instances        8124              100      %
Incorrectly Classified Instances         0                0      %
Kappa statistic                          1     
Mean absolute error                      0     
Root mean squared error                  0     
Relative absolute error                  0      %
Root relative squared error              0      %
Total Number of Instances             8124     

=== Confusion matrix ===
    a    b   <-- classified as
 3916    0 |    a = p
    0 4208 |    b = e

pctCorrect: 100.0
incorrect: 0.0


In [19]:
# Shut down the JVM used by the Weka cells; run this last, because the cells
# above need a running JVM.
# NOTE(review): presumably the JVM cannot be started again in the same kernel
# after this call - confirm before re-running cells without a restart.
jvm.stop()