In [54]:
#!/usr/bin/python



In [57]:
import json
import re
import os
import sys

In [58]:
re_head = re.compile("J48 (un)?pruned tree")
re_divider_line = re.compile("^-*\n$")
re_blank_line = re.compile("^[ \t\n]*$")
re_splitter = re.compile("[ :]")
re_range = re.compile(
    r"^'\("
    r"(-inf|-?[0-9]+(\.[0-9]+)?)"
    r"-"
    r"(-?[0-9]+(\.[0-9]+)?\]|inf\))"
    r"'$")


In [59]:
def parse_value(token):
    """Returns an float if the token represents a number, a range if the token
    represents a range of numbers, otherwise return the token as is."""
    try:
        return float(token)
    except ValueError:
        # Look for ranges of the form '(start-end]', ' included
        if re_range.match(token):
            range_str = token[2:-2]

            # Careful not to use a minus sign as a dash.
            separator_dash = range_str.find("-", 1)
            return (parse_value(range_str[:separator_dash]), 
                    parse_value(range_str[separator_dash+1:]))
        else:
            # Not a number or range - so it must be nominal, leave it as it.
            return token



In [60]:
def parse_line(line):
    """Split the line into a tuple
    (depth, feature, comparator, value, classification/None)"""
    # Avoid empty strings from double whitespaces and the likes.
    split = [ l for l in re_splitter.split(line) if len(l) > 0 ]
    depth = 0
    for part in split:
        if part == "|":
            depth += 1 
        else:
            break
    return (depth, split[depth], split[depth + 1], 
            parse_value(split[depth + 2]),
            split[depth + 3] if len(split) > depth + 3 else None)


In [61]:
def parse_tree(lines):
    """Parses input lines into a decision tree."""
    current_index = [0] # need mutable container because of closure limitations
    print (lines)
    def parse(current_depth):
        """Helper recursive function."""
        node_feature = None
        children = []
        while current_index[0] < len(lines):
            line = lines[current_index[0]]
            depth, feature, comparator, value, classif = parse_line(line)
            if depth < current_depth:
                # Finished parsing this node.
                break
            elif depth == current_depth:
                if node_feature is None:
                    node_feature = feature
                elif node_feature != feature:
                    raise Exception("Error : Feature mismatch - expected %s"
                        "but got : \n%s"
                        % (node_feature, line))

                # Another branch
                current_index[0] += 1
                if classif is None:
                    children.append((comparator, value, 
                                     parse(current_depth + 1)))
                else:
                    children.append((comparator, value, classif))
            else:
                raise Exception("Error : Input jumps two levels at once\n%s."
                                % line)

        return (node_feature, children)

    return parse(0)

In [63]:
#input_filename = "J48Tree.txt"
input_filename = "sample.txt"
if os.path.isfile(input_filename):
            f = open(input_filename)
            lines = f.readlines()
            print (lines)
            f.close()
else:
            raise Exception("Error : File %s not found!" % input_filename)
    #else:
       # lines = sys.stdin.readlines()

if not lines:
        raise Exception("Error : Empty input!")
#tree_lines = get_tree_lines(lines)
#tree = parse_tree(tree_lines)
tree = parse_tree(lines)
    #print json.dumps(tree)




['outlook = sunny\n', '|   humidity <= 75: yes (2.0)\n', '|   humidity > 75: no (3.0)\n', 'outlook = overcast: yes (4.0)\n', 'outlook = rainy\n', '|   windy = TRUE: no (2.0)\n', '|   windy = FALSE: yes (3.0)\n', 'outlook = custom\n', "|   humidity = '(-inf--1.0]': no (4.0)\n", "|   humidity = '(-1.0-5.0]': yes (1.0)\n", "|   humidity = '(5.0-inf)': no (2.0)"]
['outlook = sunny\n', '|   humidity <= 75: yes (2.0)\n', '|   humidity > 75: no (3.0)\n', 'outlook = overcast: yes (4.0)\n', 'outlook = rainy\n', '|   windy = TRUE: no (2.0)\n', '|   windy = FALSE: yes (3.0)\n', 'outlook = custom\n', "|   humidity = '(-inf--1.0]': no (4.0)\n", "|   humidity = '(-1.0-5.0]': yes (1.0)\n", "|   humidity = '(5.0-inf)': no (2.0)"]
