In [3]:
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# classifiers.py
# Copyright (C) 2014-2019 Fracpete (pythonwekawrapper at gmail dot com)

import os
import traceback
import weka.core.jvm as jvm
import wekaexamples.helper as helper
from weka.core.converters import Loader
from weka.classifiers import Classifier, SingleClassifierEnhancer, MultipleClassifiersCombiner, FilteredClassifier, \
    PredictionOutput, Kernel, KernelClassifier
from weka.classifiers import Evaluation
from weka.filters import Filter
from weka.core.classes import Random, from_commandline
import weka.plot.classifiers as plot_cls
import weka.plot.graph as plot_graph
import weka.core.types as types



# Start the JVM that backs every weka.* call below; packages=True turns on
# Weka package support (the run's log shows "Package support enabled").
jvm.start(packages=True)

# load a dataset
# The path is built from the examples' data directory plus "iris.csv".
train_file = helper.get_data_dir() + os.sep + "iris.csv"
helper.print_info("Loading dataset: " + train_file)
loader = Loader("weka.core.converters.CSVLoader")
train_data = loader.load_file(train_file)
# presumably marks the last column as the class attribute -- confirm against
# the python-weka-wrapper Instances documentation
train_data.class_is_last()

# classifier help
# NOTE: the help output itself is disabled (commented out) below, so this
# section only prints the title and instantiates the classifier.
helper.print_title("Creating help string")
classifier = Classifier(classname="weka.classifiers.trees.J48")
#print(classifier.to_help())

# partial classname
# A classname starting with "." is resolved to the full class name; this run
# printed ".J48 --> weka.classifiers.trees.J48".
helper.print_title("Creating classifier from partial classname")
clsname = ".J48"
classifier = Classifier(classname=clsname)
print(clsname + " --> " + classifier.classname)

# build a classifier and output model
# NOTE: `classifier` is rebound here for the third time; this trained instance
# is the one a later cell converts to text with str(classifier).
helper.print_title("Training J48 classifier on iris")
classifier = Classifier(classname="weka.classifiers.trees.J48")
# Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
# property of the J48 classifier itself. However, being of type float rather than double, we need
# to convert it to the correct type first using the double_to_float function:
classifier.set_property("confidenceFactor", types.double_to_float(0.3))
classifier.build_classifier(train_data)
# Text dump of the model, then its graph in DOT format (the output starts
# with "digraph J48Tree {").
print(classifier)
print(classifier.graph)
# presumably emits source code for the model under the class name "MyJ48" --
# confirm against the python-weka-wrapper Classifier documentation
print(classifier.to_source("MyJ48"))
# In this run the plot was not drawn: the log shows
# "Pygraphviz is not installed, cannot generate graph plot!".
plot_graph.plot_dot_graph(classifier.graph)

# evaluate model on the training data
# NOTE(review): no separate test set is used -- the model is tested on the same
# `train_data` it was built from, so the reported figures describe fit to the
# training data rather than performance on unseen data.
helper.print_title("Evaluating J48 classifier on iris")
evaluation = Evaluation(train_data)
evl = evaluation.test_model(classifier, train_data)
print(evl)
print(evaluation.summary())


DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=['C:\\Users\\jindunli\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\javabridge\\jars\\rhino-1.7R4.jar', 'C:\\Users\\jindunli\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\javabridge\\jars\\runnablequeue.jar', 'C:\\Users\\jindunli\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\javabridge\\jars\\cpython.jar', 'C:\\Users\\jindunli\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\weka\\lib\\python-weka-wrapper.jar', 'C:\\Users\\jindunli\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\weka\\lib\\weka.jar']
DEBUG:weka.core.jvm:MaxHeapSize=default
DEBUG:weka.core.jvm:Package support enabled



Loading dataset: C:\Users\jindunli\Desktop\Test Code\wekaexamples\data\iris.csv


ERROR:weka.plot.graph:Pygraphviz is not installed, cannot generate graph plot!



Creating help string

Creating classifier from partial classname
.J48 --> weka.classifiers.trees.J48

Training J48 classifier on iris
J48 pruned tree
------------------

petalwidth <= 0.6: Iris-setosa (50.0)
petalwidth > 0.6
|   petalwidth <= 1.7
|   |   petallength <= 4.9: Iris-versicolor (48.0/1.0)
|   |   petallength > 4.9
|   |   |   petalwidth <= 1.5: Iris-virginica (3.0)
|   |   |   petalwidth > 1.5: Iris-versicolor (3.0/1.0)
|   petalwidth > 1.7: Iris-virginica (46.0/1.0)

Number of Leaves  : 	5

Size of the tree : 	9

digraph J48Tree {
N0 [label="petalwidth" ]
N0->N1 [label="<= 0.6"]
N1 [label="Iris-setosa (50.0)" shape=box style=filled ]
N0->N2 [label="> 0.6"]
N2 [label="petalwidth" ]
N2->N3 [label="<= 1.7"]
N3 [label="petallength" ]
N3->N4 [label="<= 4.9"]
N4 [label="Iris-versicolor (48.0/1.0)" shape=box style=filled ]
N3->N5 [label="> 4.9"]
N5 [label="petalwidth" ]
N5->N6 [label="<= 1.5"]
N6 [label="Iris-virginica (3.0)" shape=box style=filled ]
N5->N7 [label="> 1.5"]
N7 [l

In [4]:
# Break the model's text dump into individual lines and keep only the tree
# itself: the first three and the last five lines of the dump are sliced off.
# The offsets are fixed, so they assume the J48 text layout printed above.
model_lines = str(classifier).split("\n")
c = model_lines[3:-5]
c

['petalwidth <= 0.6: Iris-setosa (50.0)',
 'petalwidth > 0.6',
 '|   petalwidth <= 1.7',
 '|   |   petallength <= 4.9: Iris-versicolor (48.0/1.0)',
 '|   |   petallength > 4.9',
 '|   |   |   petalwidth <= 1.5: Iris-virginica (3.0)',
 '|   |   |   petalwidth > 1.5: Iris-versicolor (3.0/1.0)',
 '|   petalwidth > 1.7: Iris-virginica (46.0/1.0)']

In [6]:
import pandas as pd


def tree_lines_to_frame(tree_lines):
    """Parse the text lines of a printed J48 tree into a tidy DataFrame.

    Each input line looks like one of::

        petalwidth > 0.6
        |   |   petallength <= 4.9: Iris-versicolor (48.0/1.0)

    Parameters
    ----------
    tree_lines : iterable of str
        Lines of the tree section of the model's text output, one node per
        line, with nesting shown by leading "|" markers.

    Returns
    -------
    pandas.DataFrame
        One row per input line, with the columns

        * ``split number``  - nesting depth: number of "|" markers plus one.
        * ``tree details``  - the rule text without "|" markers, without the
          trailing "(...)" group and without surrounding whitespace.
        * ``split result1`` - first number of the trailing "(a/b)" group as a
          float, NaN when the line has no such group (presumably the number
          of instances reaching the leaf -- confirm against the Weka J48
          documentation).
        * ``split result2`` - second number of the group as a float, NaN when
          absent (presumably the number of misclassified instances -- confirm
          against the Weka J48 documentation).
    """
    lines = pd.Series(list(tree_lines), dtype="object")

    # Trailing "(total)" or "(total/errors)" group at the end of a leaf line.
    # Named groups make str.extract always return both columns, so a tree in
    # which no leaf has a "/errors" part (or no "(" at all) no longer raises
    # KeyError the way indexing the result of str.split(expand=True) did.
    leaf_counts_pattern = (
        r"\((?P<total>[-+0-9.eE]+)(?:/(?P<errors>[-+0-9.eE]+))?\)\s*$"
    )
    counts = lines.str.extract(leaf_counts_pattern)
    rule_text = lines.str.replace(leaf_counts_pattern, "", regex=True)

    # Depth must be counted before the "|" markers are removed.
    # str.count always treats its pattern as a regex, hence the escaped "|".
    depth = rule_text.str.count(r"\|") + 1

    # regex=False: "|" is a regex metacharacter and the default of `regex`
    # differs between pandas versions, so request a literal replacement.
    # strip() drops the indentation left behind by the markers and the space
    # that preceded the "(...)" group.
    details = rule_text.str.replace("|", "", regex=False).str.strip()

    column_order = ["split number", "tree details", "split result1", "split result2"]
    return pd.DataFrame(
        {
            "split number": depth,
            "tree details": details,
            "split result1": counts["total"].astype("float"),
            "split result2": counts["errors"].astype("float"),
        },
        columns=column_order,
    )


# `c` is the list of tree lines produced by the previous cell.
df = tree_lines_to_frame(c)

In [7]:
# Display the parsed tree table (last expression of the cell is rendered).
df

Unnamed: 0,split number,tree details,split result1,split result2
0,1,petalwidth <= 0.6: Iris-setosa,50.0,
1,1,petalwidth > 0.6,,
2,2,petalwidth <= 1.7,,
3,3,petallength <= 4.9: Iris-versicolor,48.0,1.0
4,3,petallength > 4.9,,
5,4,petalwidth <= 1.5: Iris-virginica,3.0,
6,4,petalwidth > 1.5: Iris-versicolor,3.0,1.0
7,2,petalwidth > 1.7: Iris-virginica,46.0,1.0
