In [1]:
%run CAR_creation.ipynb

In [2]:
import re
import numpy as np


def make_intervalfunc(minv, maxv, left_inclusivity, right_inclusivity):
    def inner_func(value):
        if greaterthan(value, minv, left_inclusivity) and lesserthan(value, maxv, right_inclusivity):
            return True
        else:
            return False
        
    return inner_func
        
def greaterthan(a, b, inclusivity):
    if inclusivity:
        if a >= b: return True
    elif a > b: return True
    
    return False
        
def lesserthan(a, b, inclusivity):
    if inclusivity:
        if a <= b: return True
    elif a < b: return True
    
    return False


class Interval:

    def __init__(self, minval, maxval, left_inclusive, right_inclusive):
        self.minval = minval
        self.maxval = maxval
        self.left_inclusive = left_inclusive
        self.right_inclusive = right_inclusive
        
        
        self.left_bracket = "<" if left_inclusive else "("
        self.right_bracket = ">" if right_inclusive else ")"
        
        self.__membership_func = np.vectorize(
            make_intervalfunc(self.minval, self.maxval, self.left_inclusive, self.right_inclusive)
        )
            
    
    def __hash__(self):
        return hash(repr(self))
    
    def __eq__(self, other):
        return hash(self) == hash(other)
            
    def refit(self, vals):
        """refit values to a finer grid
        """
        values = np.array(vals)
        
        mask = self.test_membership(values)
        new_array = values[mask]

        left, right = min(new_array), max(new_array)

        return Interval(left, right, True, True)
        
            
    def test_membership(self, value):
        return self.__membership_func(value)
    
    def isin(self, value):
        return self.test_membership([value])[0]
        

    def string(self):
        return "{}{};{}{}".format(self.left_bracket, self.minval, self.maxval, self.right_bracket)
        
    def __repr__(self):
        return "Interval[{}{};{}{}]".format(self.left_bracket, self.minval, self.maxval, self.right_bracket)

In [3]:
import re
import numpy as np

class IntervalReader():
    
    
    interval_regex = re.compile("(<|\()(\d+(?:\.(?:\d)+)?);(\d+(?:\.(?:\d)+)?)(\)|>)")
    
    
    def __init__(self):
        # opened interval brackets
        self.__open_bracket = "(", ")"
        
        # closed interval brackets
        self.__closed_bracket = "<", ">"
        
        # negative and positive infinity symbol,
        # e.g. -inf, +inf
        self.__infinity_symbol = "-inf", "+inf"
        
        # decimal separator, e.g. ".", ","
        self.__decimal_separator = "."
        
        # interval members separator
        self.__members_separator = ";"
        
        self.compile_reader()
        
        
    def compile_reader(self):

        left_bracket_open = re.escape(self.open_bracket[0])
        left_bracket_closed = re.escape(self.closed_bracket[0])
        
        right_bracket_open = re.escape(self.open_bracket[1])
        right_braket_closed = re.escape(self.closed_bracket[1])
        
        # e.g. (   <    |   \(    ) 
        #      (   {}   |   {}    )
        left_bracket_regex_string = "({}|{})".format(
            left_bracket_open,
            left_bracket_closed
        )
        
        # e.g. (   >   |   \)    ) 
        #      (   {}   |   {}    )
        right_bracket_regex_string = "({}|{})".format(
            right_bracket_open,
            right_braket_closed
        )
        
        # ((   \d+  (?:  \.   (?:\d)+  )?   )|-inf)
        # (   \d+  (?:  {}   (?:\d)+  )?   )
        left_number_regex_string = "(\-?\d+(?:{}(?:\d)+)?|{})".format(
            re.escape(self.decimal_separator),
            re.escape(self.infinity_symbol[0]),
        )
        
        
        # ((   \d+  (?:  \.   (?:\d)+  )?   )|+inf)
        # (   \d+  (?:  {}   (?:\d)+  )?   )
        right_number_regex_string = "(\-?\d+(?:{}(?:\d)+)?|{})".format(
            re.escape(self.decimal_separator),
            re.escape(self.infinity_symbol[1]),
        )
        
        members_separator_regex = "{}".format(
            re.escape(self.members_separator)
        )
        
        
        interval_regex_string = "{}{}{}{}{}".format(
            left_bracket_regex_string,
            left_number_regex_string,
            members_separator_regex,
            right_number_regex_string,
            right_bracket_regex_string
        )
        
        self.__interval_regex = re.compile(interval_regex_string)
        
        
    def read(self, interval_string):
        # returns array of results, take first member
        args = self.__interval_regex.findall(interval_string)[0]
        
        left_bracket, minval, maxval, right_bracket = args
        
        left_inclusive = True if left_bracket == self.closed_bracket[0] else False
        right_inclusive = True if right_bracket == self.closed_bracket[1] else False
        
        
        minval_final = float(minval) if minval != self.infinity_symbol[0] else np.NINF 
        maxval_final = float(maxval) if maxval != self.infinity_symbol[1] else np.PINF
        
        interval = Interval(
            minval_final,
            maxval_final,
            left_inclusive,
            right_inclusive
        )
        
        return interval
      
        
    # boilerplate getter/setter code    
    
    @property
    def open_bracket(self):
        return self.__open_bracket
    
    @open_bracket.setter
    def open_bracket(self, val):
        self.__open_bracket = val
        return self
    
    @property
    def closed_bracket(self):
        return self.__closed_bracket
    
    @closed_bracket.setter
    def closed_bracket(self, val):
        self.__closed_bracket = val
        return self
        
    @property
    def infinity_symbol(self):
        return self.__infinity_symbol
    
    @infinity_symbol.setter
    def infinity_symbol(self, val):
        self.__infinity_symbol = val
        return self
    
    @property
    def decimal_separator(self):
        return self.__decimal_separator
    
    @decimal_separator.setter
    def decimal_separator(self, val):
        self.__decimal_separator = val
        return self
    
    @property
    def members_separator(self):
        return self.__members_separator
    
    @members_separator.setter
    def members_separator(self, val):
        self.__members_separator = val
        return self
    
    
        
interval_reader = IntervalReader()

interval_reader.compile_reader()

interval_reader.read("<1.2;2.3>")

Interval[<1.2;2.3>]

In [4]:
%run ../../main.py

import copy

class QuantitativeCAR:
    
    interval_reader = IntervalReader()
    
    def __init__(self, rule):
        self.antecedent = self.__create_intervals_from_antecedent(rule.antecedent)
        self.consequent = copy.copy(rule.consequent)
        
        self.confidence = rule.confidence
        self.support = rule.support
        self.rulelen = rule.rulelen
        self.rid = rule.rid
        
        # property which indicates wheter the rule was extended or not
        self.was_extended = False
        # literal which extended the rule
        self.extension_literal = None
        
        
    def __create_intervals_from_antecedent(self, antecedent):
        interval_antecedent = []
        
        for literal in antecedent:
            attribute, value = literal
            
            interval = interval_reader.read(value)
            
            interval_antecedent.append((attribute, interval))
        
        
        return self.__sort_antecedent(interval_antecedent)
    
    
    def __sort_antecedent(self, antecedent):
        return sorted(antecedent)
    
    
    def update_properties(self, quant_dataframe):
        """updates rule properties using instance
        of QuantitativeDataFrame
        
        properties:
            support, confidence, rulelen
        
        """
        
        if type(quant_dataframe) != QuantitativeDataFrame:
            raise Exception(
                "type of quant_dataframe must be QuantitativeDataFrame"
            )
            
        
        support, confidence = quant_dataframe.calculate_rule_statistics(self)
        
        self.support = support
        self.confidence = confidence
        # length of antecedent + length of consequent
        self.rulelen = len(self.antecedent) + 1
        
    
    def copy(self):
        return copy.deepcopy(self)
        
        
    def __repr__(self):
        ant = self.antecedent
        ant_string_arr = [ key + "=" + val.string() for key, val in ant ]
        ant_string = "{" + ",".join(ant_string_arr) + "}"
        
        args = [
            ant_string,
            "{" + self.consequent.string() + "}",
            self.support,
            self.confidence,
            self.rulelen,
            self.rid
        ]
        
        text = "CAR {} => {} sup: {:.2f} conf: {:.2f} len: {}, id: {}".format(*args)

        return text
    
    
    def __gt__(self, other):
        """
        precedence operator. Determines if this rule
        has higher precedence. Rules are sorted according
        to their confidence, support, length and id.
        """
        if (self.confidence > other.confidence):
            return True
        elif (self.confidence == other.confidence and
              self.support > other.support):
            return True
        elif (self.confidence == other.confidence and
              self.support == other.support and
              self.rulelen < other.rulelen):
            return True
        elif(self.confidence == other.confidence and
              self.support == other.support and
              self.rulelen == other.rulelen and
              self.rid < other.rid):
            return True
        else:
            return False
        
    
    def __lt__(self, other):
        """
        rule precedence operator
        """
        return not self > other
    
    
    def __eq__(self, other):
        return self.rid == other.rid


In [5]:
class RuleCoverCache:
    pass
    

In [6]:
class LiteralCache:
    """class which stores literals
    and corresponding truth values
    e.g. [
        "food=banana": [True, True, False, False, True],
        "food=apple" : [True, True, True, True, False]
    ]
    
    """
    
    def __init__(self):
        self.__cache = {}

    def insert(self, literal, truth_values):
        self.__cache[literal] = truth_values
        
    def get(self, literal):
        return self.__cache[literal]
        
    def __contains__(self, literal):
        """function for using in
        on LiteralCache object
        """
        
        return literal in self.__cache.keys()
    
    
    
cache = LiteralCache()

cache.insert("food=apple", np.array([True, True, False, False, True]))
cache.insert("food=gingerbread", np.array([False, False, False, False, True]))

assert "food=apple" in cache
assert "blabla" not in cache
assert "food=gingerbread" in cache

In [7]:
import pandas
import numpy as np


class QuantitativeDataFrame:
    
    def __init__(self, dataframe):
        if type(dataframe) != pandas.DataFrame:
            raise Exception("type of dataframe must be pandas.dataframe")
        
        
        self.__dataframe = dataframe
        
        # sorted and unique columns of the dataframe
        # saved as a numpy array
        self.__preprocessed_columns = self.__preprocess_columns(dataframe)
        
        
        # literal cache for computing rule statistics
        # - support and confidence
        self.__literal_cache = LiteralCache()

        # so that it doesn't have to be computed over and over
        self.size = dataframe.index.size
        
        
    @property
    def dataframe(self):
        return self.__dataframe
    
    
    def column(self, colname):
        return self.__preprocessed_columns[colname]
    
    
    def mask(self, vals):
        return self.__dataframe[vals]
    
    
    def find_covered_by_antecedent_mask(self, antecedent):
        """
        returns:
            mask - an array of boolean values indicating which instances
            are covered by antecedent
        """
        
        # todo: compute only once to make function faster
        dataset_size = self.__dataframe.index.size
        
        for literal in antecedent:
            attribute, interval = literal
            
            # the column that concerns the
            # iterated attribute
            # instead of pandas.Series, grab the ndarray
            # using values attribute
            relevant_column = self.__dataframe[[attribute]].values.reshape(dataset_size)
            
            # this tells us which instances satisfy the literal
            current_mask = self.get_literal_coverage(literal, relevant_column)
            
            # add cummulated and current mask using logical AND
            cummulated_mask &= current_mask
    
    
    def find_covered_by_literal_mask(self, literal):
        """
        returns:
            mask - an array of boolean values indicating which instances
            are covered by literal
        """
        
        for literal in rule.antecedent:
            attribute, interval = literal
            
            # the column that concerns the
            # iterated attribute
            # instead of pandas.Series, grab the ndarray
            # using values attribute
            relevant_column = self.__dataframe[[attribute]].values.reshape(dataset_size)
            
            # this tells us which instances satisfy the literal
            current_mask = self.get_literal_coverage(literal, relevant_column)
            
            # add cummulated and current mask using logical AND
            cummulated_mask &= current_mask
    
    
    def find_covered_by_rule_mask(self, rule):
        """
        returns:
            covered_by_antecedent_mask:
                - array of boolean values indicating which
                dataset rows satisfy antecedent
                
            covered_by_consequent_mask:
                - array of boolean values indicating which
                dataset rows satisfy conseqeunt
        """
        
        dataset_size = self.__dataframe.index.size
        
        # initialize a mask filled with True values
        # it will get modified as futher literals get
        # tested
        
        # for optimization - create cummulated mask once
        # in constructor
        cummulated_mask = np.array([True] * dataset_size)
        
        for literal in rule.antecedent:
            attribute, interval = literal
            
            # the column that concerns the
            # iterated attribute
            # instead of pandas.Series, grab the ndarray
            # using values attribute
            relevant_column = self.__dataframe[[attribute]].values.reshape(dataset_size)
            
            # this tells us which instances satisfy the literal
            current_mask = self.get_literal_coverage(literal, relevant_column)
            
            # add cummulated and current mask using logical AND
            cummulated_mask &= current_mask
            
            
        
        instances_satisfying_antecedent_mask = cummulated_mask
        instances_satisfying_consequent_mask = self.__get_consequent_coverage_mask(rule)
        instances_satisfying_consequent_mask = instances_satisfying_consequent_mask.reshape(dataset_size)
        
        return instances_satisfying_antecedent_mask, instances_satisfying_consequent_mask
        
        
    
    def calculate_rule_statistics(self, rule):
        """calculates rule's confidence and
        support using efficient numpy functions
        
        
        returns:
        --------
        
            support:
                float
            
            confidence:
                float
        """
        
        dataset_size = self.__dataframe.index.size
        
        # initialize a mask filled with True values
        # it will get modified as futher literals get
        # tested
        
        # for optimization - create cummulated mask once
        # in constructor
        cummulated_mask = np.array([True] * dataset_size)
        
        for literal in rule.antecedent:
            attribute, interval = literal
            
            # the column that concerns the
            # iterated attribute
            # instead of pandas.Series, grab the ndarray
            # using values attribute
            relevant_column = self.__dataframe[[attribute]].values.reshape(dataset_size)
            
            # this tells us which instances satisfy the literal
            current_mask = self.get_literal_coverage(literal, relevant_column)
            
            # add cummulated and current mask using logical AND
            cummulated_mask &= current_mask
            
            
        
        
        instances_satisfying_antecedent = self.__dataframe[cummulated_mask].index
        instances_satisfying_antecedent_count = instances_satisfying_antecedent.size
        
        # using cummulated mask to filter out instances that satisfy consequent
        # but do not satisfy antecedent
        instances_satisfying_consequent_mask = self.__get_consequent_coverage_mask(rule)
        instances_satisfying_consequent_mask = instances_satisfying_consequent_mask.reshape(dataset_size)
        
        instances_satisfying_consequent_and_antecedent = self.__dataframe[
            instances_satisfying_consequent_mask & cummulated_mask
        ].index
        
        instances_satisfying_consequent_and_antecedent_count = instances_satisfying_consequent_and_antecedent.size
        instances_satisfying_consequent_count = self.__dataframe[instances_satisfying_consequent_mask].index.size
        
        # instances satisfying consequent both antecedent and consequent 
        support = instances_satisfying_consequent_and_antecedent_count / dataset_size
        confidence = instances_satisfying_consequent_and_antecedent_count / instances_satisfying_antecedent_count
        
        return support, confidence
    
    
    def __get_consequent_coverage_mask(self, rule):
        consequent = rule.consequent
        attribute, value = consequent
        
        class_column = self.__dataframe[[attribute]].values
        
        literal_key = "{}={}".format(attribute, value)

        mask = []
        
        if literal_key in self.__literal_cache:
            mask = self.__literal_cache.get(literal_key)
        else:
            mask = class_column == value
        
        return mask
    
    
    def get_literal_coverage(self, literal, values):
        """returns mask which describes the instances that
        satisfy the interval
        
        function uses cached results for efficiency
        """
        
        if type(values) != np.ndarray:
            raise Exception("Type of values must be numpy.ndarray")
            
        mask = []
        
        attribute, interval = literal
        
        literal_key = "{}={}".format(attribute, interval)
        
        # check if the result is already cached, otherwise
        # calculate and save the result
        if literal_key in self.__literal_cache:
            mask = self.__literal_cache.get(literal_key)
        else:
            mask = interval.test_membership(values)
            
            self.__literal_cache.insert(literal_key, mask)
            
        # reshape mask into single dimension
        mask = mask.reshape(values.size)
            
        return mask
    
    
    def __preprocess_columns(self, dataframe):
        
        # covert to dict
        # column -> list
        # need to convert it to numpy array
        dataframe_dict = dataframe.to_dict(orient="list")
        
        dataframe_ndarray = {}
        
        
        for column, value_list in dataframe_dict.items():
            transformed_list = np.sort(np.unique(value_list))
            dataframe_ndarray[column] = transformed_list
            
        return dataframe_ndarray
        
        
    
    
    
qds = QuantitativeDataFrame(movies_undiscr_txns)

ds = movies_undiscr_txns





In [None]:
import pandas

class RuleExtender:
    
    def __init__(self, dataframe):
    
        if type(dataframe) != QuantitativeDataFrame:
            raise Exception(
                "type of dataset must be pandas.DataFrame"
            )
            
        self.__dataframe = dataframe
        
        
        
    def transform(self, rules):
        
        copied_rules = [ rule.copy() for rule in rules ]
        
        extended_rules = [ self.__extend(rule) for rule in copied_rules ]
        
        return extended_rules
    
    
    
    def __extend(self, rule):
        ext = self.__extend_rule(rule)
        
        return ext
        
    def __extend_rule(self, rule, min_improvement=0, min_conditional_improvement=-0.01):
        
        # check improvemnt argument ranges
        
        current_best = rule
        direct_extensions = self.__get_extensions(rule)
        
        current_best.update_properties(self.__dataframe)
        
        print("direct extensions")
        print(direct_extensions)
        print("========")
        
        
        while True:
            extension_succesful = False
            
            for candidate in direct_extensions:
                
                candidate.update_properties(self.__dataframe)
                
                delta_confidence = candidate.confidence - current_best.confidence
                delta_support = candidate.support - current_best.support
                
                print("candidate support", candidate.support)
                print("current best support", current_best.support)
                
                print("candidate:")
                print(candidate)
                print("delta confidence", delta_confidence)
                print("delta support", delta_support)
                print("======")
                
                if self.__crisp_accept(delta_confidence, delta_support, min_improvement):
                    print("crisp accent", candidate)
                    current_best = candidate
                    extension_succesful = True
                    break
                    
                
                if self.__conditional_accept(delta_confidence, min_conditional_improvement):
                    enlargement = candidate
                    
                    while True:
                        
                        enlargement = self.get_beam_extensions(enlargement)
                        
                        if not enlargement:
                            break
                            
                        candidate.update_properties(self.__dataframe)
                        enlargement.update_properties(self.__dataframe)

                        delta_confidence = enlargement.confidence - current_best.confidence
                        delta_support = enlargement.support - current_best.support

                        if self.__crisp_accept(delta_confidence, delta_support, min_improvement):
                            current_best = enlargement
                            extension_succesful = True
                            
                        elif self.__conditional_accept(delta_confidence, min_conditional_improvement):
                            continue
                        
                        else:
                            break
            
            
                    if extension_succesful == True:
                        break
                        

                else:
                    # continue to next candidate
                    continue
           
        
            if extension_succesful == False:
                break
                    
        return current_best
        
        
    def __get_extensions(self, rule):
        extended_rules = []
        
        for literal in rule.antecedent:
            attribute, interval = literal
            
            neighborhood = self.__get_direct_extensions(literal)
            
            for extended_literal in neighborhood:
                # copy the rule so the extended literal
                # can replace the default literal
                copied_rule = rule.copy()
                
                # find the index of the literal
                # so that it can be replaced
                current_literal_index = copied_rule.antecedent.index(literal)
                
                copied_rule.antecedent[current_literal_index] = extended_literal
                copied_rule.was_extended = True
                copied_rule.extended_literal = extended_literal
                
                extended_rules.append(copied_rule)
                
        return extended_rules
            
    
    def __get_direct_extensions(self, literal):
        """
        ensure sort and unique
        before calling functions
        """
        
        attribute, interval = literal
        
        vals = self.__dataframe.column(attribute)
        vals_len = vals.size

        mask = interval.test_membership(vals)

        # indices of interval members
        # we want to extend them 
        # once to the left
        # and once to the right
        # bu we have to check if resulting
        # indices are not larger than value size
        member_indexes = np.where(mask)[0]

        first_index = member_indexes[0]
        last_index = member_indexes[-1]

        first_index_modified = first_index - 1
        last_index_modified = last_index + 1
        
        no_left_extension = False
        no_right_extension = False

        if first_index_modified < 0:
            no_left_extension = True

        # if last_index_modified is larger than
        # available indices
        if last_index_modified > vals_len - 1:
            no_right_extension = True


        new_left_bound = interval.minval
        new_right_bound = interval.maxval

        if not no_left_extension:
            new_left_bound = vals[first_index_modified]

        if not no_right_extension:
            new_right_bound = vals[last_index_modified]


        # prepare return values
        extensions = []

        if not no_left_extension:
            extension = new_left_bound, interval.maxval
            
            # when values are [1, 2, 3, 3, 4, 5]
            # and the corresponding interval is (2, 4)
            # instead of resulting interval being (1, 4)
            
            temp_interval = Interval(
                new_left_bound,
                interval.maxval,
                True,
                interval.right_inclusive
            )

            extensions.append((attribute, temp_interval))

        if not no_right_extension:
            extensoin = interval.minval, new_right_bound

            temp_interval = Interval(
                interval.minval,
                new_right_bound,
                interval.left_inclusive,
                True
            )

            extensions.append((attribute, temp_interval))

        return extensions
        
    
    # make private
    def get_beam_extensions(self, rule):
        if not rule.was_extended:
            return None

        # literal which extended the rule
        literal = rule.extended_literal
        
        extended_literal = self.__get_direct_extensions(literal)
        
        if not extended_literal:
            return None
        
        copied_rule = rule.copy()
        
        literal_index = copied_rule.antecedent.index(literal)
        
        # so that literal is not an array
        copied_rule.antecedent[literal_index] = extended_literal[0]
        copied_rule.was_extended = True
        copied_rule.extended_literal = extended_literal[0]
        
        return copied_rule

    
    
    def __crisp_accept(self, delta_confidence, delta_support, min_improvement):
        if delta_confidence >= min_improvement and delta_support >= 0:
            return True
        else:
            return False
    
    def __conditional_accept(self, delta_conf, min_improvement):
        if delta_conf >= min_improvement:
            return True
        
        
        
rule_ext = RuleExtender(qds)      

qrules = [ QuantitativeCAR(r) for r in rules ]

qrule_to_extend = qrules[0]
qrule_to_extend.antecedent = [
    ("a-list-celebrities", Interval(0, 2, True, True)),
    ("estimated-budget", Interval(0, 195, True, True))
]

extended = rule_ext.transform([qrule_to_extend])


for qr in qrules:
    qr.update_properties(qds)

[ print(qr) for qr in [qrule_to_extend] ]
print()
print()
[ print(ext) for ext in extended ]



In [None]:
class RuleRefitter:
    """Refits the rule to a finer grid
    """
    
    
    def __init__(self, quantitative_dataframe):
        self.__dataframe = quantitative_dataframe
        
        
    def transform(self, rules):
        copied_rules = [ rule.copy() for rule in rules  ]
        refitted = [ self.__refit(rule) for rule in copied_rules ]
        
        return refitted
        
    def __refit(self, rule):
        """refits a single rule
        """

        for idx, literal in enumerate(rule.antecedent):
            attribute, interval = literal
        
            current_attribute_values = self.__dataframe.column(attribute)

            refitted_interval = interval.refit(current_attribute_values)

            rule.antecedent[idx] = attribute, refitted_interval
            
            
        return rule
            
    
    
            
    
    
rule_refitter = RuleRefitter(qds)

rule_refitter.transform(qrules)

In [10]:
class RuleLiteralPruner:
    
    def __init__(self, quantitative_dataframe):
        self.__dataframe = quantitative_dataframe
        
        
    def transform(self, rules):
        copied_rules = [ rule.copy() for rule in rules  ]
        trimmed = [ self.__trim(rule) for rule in copied_rules ]
        
        return trimmed
    
    
    def produce_combinations(self, array):
        arr_len = len(array)
    
        for i in range(arr_len):
            combination = array[0:i] + array[i+1:arr_len]
        
            yield combination
    
    
    def __trim(self, rule):
        if type(rule) != QuantitativeCAR:
            raise Exception("type of rule must be QuantClassAssociationRule")

            
        attr_removed = False
    
        literals = rule.antecedent
        consequent = rule.consequent
        
        rule.update_properties(self.__dataframe)
        
        dataset_len = self.__dataframe.size

        if len(literals) < 1:
            return rule

        while True:
            for literals_combination in self.produce_combinations(literals):
                if not literals_combination:
                    continue
                    
                copied_rule = rule.copy()
                
                copied_rule.antecedent = literals_combination
                copied_rule.update_properties(self.__dataframe)

                if copied_rule.confidence > rule.confidence:
                    rule.support = copied_rule.support
                    rule.confidence = copied_rule.confidence
                    rule.rulelen = copied_rule.rulelen
                    
                    rule.antecedent = copied_rule.antecedent

                    attr_removed = True
                    
                    break
                    
                else:
                    attr_removed = False

            if attr_removed == False:
                break
                
                
        return rule
    
    
literal_pruner = RuleLiteralPruner(qds)

literal_pruner.transform(qrules)

[CAR {a-list-celebrities=<0.0;2.0)} => {class=box-office-bomb} sup: 0.17 conf: 1.00 len: 2, id: 42,
 CAR {estimated-budget=<250.0;300.0)} => {class=main-stream-hit} sup: 0.06 conf: 1.00 len: 2, id: 4,
 CAR {a-list-celebrities=<4.0;6.0),estimated-budget=<0.0;50.0)} => {class=critical-success} sup: 0.09 conf: 1.00 len: 3, id: 24,
 CAR {a-list-celebrities=<6.0;8.0)} => {class=critical-success} sup: 0.06 conf: 0.67 len: 2, id: 1,
 CAR {a-list-celebrities=<4.0;6.0),estimated-budget=<100.0;150.0)} => {class=main-stream-hit} sup: 0.06 conf: 1.00 len: 3, id: 10,
 CAR {a-list-celebrities=<4.0;6.0)} => {class=main-stream-hit} sup: 0.17 conf: 0.50 len: 2, id: 16,
 CAR {estimated-budget=<200.0;250.0)} => {class=box-office-bomb} sup: 0.06 conf: 0.67 len: 2, id: 9,
 CAR {estimated-budget=<0.0;50.0)} => {class=box-office-bomb} sup: 0.14 conf: 0.62 len: 2, id: 29,
 CAR {a-list-celebrities=<4.0;6.0)} => {class=main-stream-hit} sup: 0.17 conf: 0.50 len: 2, id: 31]

In [11]:
class RuleTrimmer:
    """Trims the rule
    """
    
    
    def __init__(self, quantitative_dataframe):
        self.__dataframe = quantitative_dataframe
        
        
    def transform(self, rules):
        copied_rules = [ rule.copy() for rule in rules  ]
        trimmed = [ self.__trim(rule) for rule in copied_rules ]
        
        return trimmed
    
    
    def __trim(self, rule):
        if type(rule) != QuantitativeCAR:
            raise Exception("type of rule must be QuantClassAssociationRule")

            
        covered_by_antecedent_mask, covered_by_consequent_mask = self.__dataframe.find_covered_by_rule_mask(rule)
        
        covered_by_rule_mask = covered_by_antecedent_mask & covered_by_consequent_mask
        
        # instances covered by rule
        correctly_covered_by_r = self.__dataframe.mask(covered_by_rule_mask)
        
        antecedent = rule.antecedent

        for idx, literal in enumerate(antecedent):

            attribute, interval = literal
            
            current_column = correctly_covered_by_r[[attribute]].values
            current_column_unique = np.unique(current_column)

            if not current_column.any():
                continue

            minv = np.asscalar(min(current_column))
            maxv = np.asscalar(max(current_column))

            new_interval = Interval(minv, maxv, True, True)

            antecedent[idx] = attribute, new_interval

        return rule
    
    
    
rule_trimmer = RuleTrimmer(qds)


[ print(r) for r in qrules ]

print()

rule_trimmer.transform(qrules)

CAR {a-list-celebrities=<0.0;2.0)} => {class=box-office-bomb} sup: 0.17 conf: 1.00 len: 2, id: 42
CAR {estimated-budget=<250.0;300.0)} => {class=main-stream-hit} sup: 0.06 conf: 1.00 len: 2, id: 4
CAR {a-list-celebrities=<4.0;6.0),estimated-budget=<0.0;50.0)} => {class=critical-success} sup: 0.09 conf: 1.00 len: 3, id: 24
CAR {a-list-celebrities=<6.0;8.0)} => {class=critical-success} sup: 0.06 conf: 0.67 len: 2, id: 1
CAR {a-list-celebrities=<4.0;6.0),estimated-budget=<100.0;150.0)} => {class=main-stream-hit} sup: 0.06 conf: 1.00 len: 3, id: 10
CAR {a-list-celebrities=<4.0;6.0),estimated-budget=<150.0;200.0)} => {class=main-stream-hit} sup: 0.00 conf: 0.00 len: 3, id: 16
CAR {estimated-budget=<200.0;250.0)} => {class=box-office-bomb} sup: 0.06 conf: 0.67 len: 2, id: 9
CAR {estimated-budget=<0.0;50.0)} => {class=box-office-bomb} sup: 0.14 conf: 0.62 len: 2, id: 29
CAR {a-list-celebrities=<4.0;6.0)} => {class=main-stream-hit} sup: 0.17 conf: 0.50 len: 2, id: 31



[CAR {a-list-celebrities=<1;1>} => {class=box-office-bomb} sup: 0.17 conf: 1.00 len: 2, id: 42,
 CAR {estimated-budget=<260;264>} => {class=main-stream-hit} sup: 0.06 conf: 1.00 len: 2, id: 4,
 CAR {a-list-celebrities=<4;5>,estimated-budget=<23;45>} => {class=critical-success} sup: 0.09 conf: 1.00 len: 3, id: 24,
 CAR {a-list-celebrities=<6;7>} => {class=critical-success} sup: 0.06 conf: 0.67 len: 2, id: 1,
 CAR {a-list-celebrities=<4;5>,estimated-budget=<106;143>} => {class=main-stream-hit} sup: 0.06 conf: 1.00 len: 3, id: 10,
 CAR {a-list-celebrities=<4.0;6.0),estimated-budget=<150.0;200.0)} => {class=main-stream-hit} sup: 0.00 conf: 0.00 len: 3, id: 16,
 CAR {estimated-budget=<202;223>} => {class=box-office-bomb} sup: 0.06 conf: 0.67 len: 2, id: 9,
 CAR {estimated-budget=<10;43>} => {class=box-office-bomb} sup: 0.14 conf: 0.62 len: 2, id: 29,
 CAR {a-list-celebrities=<4;5>} => {class=main-stream-hit} sup: 0.17 conf: 0.50 len: 2, id: 31]

In [12]:
import collections
from scipy import stats

class RulePostPruner:
    
    def __init__(self, quantitative_dataset):
        self.__dataframe = quantitative_dataset
        
        
    def transform(self, rules):
        copied_rules = [ rule.copy() for rule in rules ]

        pruned_rules = self.prune(copied_rules)
        
        return pruned_rules
        
    def preprocess_dataframe(self):
        return self.__dataframe.dataframe.index.values
        
        
        
        
    def get_most_frequent_class(self):
        """ 
        requires class column to be the last in dataframe
        
        gets the most frequent class from dataset
        - naive implementation
        """
        
        index_counts, possible_classes = pd.factorize(self.__dataframe.dataframe.iloc[:, -1].values)
        counts = np.bincount(index_counts)
        counts_max = counts.max()
        most_frequent_classes = possible_classes[counts == counts_max]
        
        # return only one
        return most_frequent_classes[0], counts_max
    
    
    def get_most_frequent_from_numpy(self, ndarray):
        """gets a mode from numpy array
        """
        unique, pos = np.unique(ndarray, return_inverse=True) 
        counts = np.bincount(pos)                  
        maxpos = counts.argmax()                      

        return (unique[maxpos], counts[maxpos])
        
    
    def find_covered(self):
        pass
        
        
    def prune(self, rules):
        
        dataset = self.preprocess_dataframe()
        dataset_len = dataset.size
        # True if datacase is not covered yet
        dataset_mask = [ True ] * dataset_len
        
        cutoff_rule = rules[-1]
        cutoff_class, cutoff_class_count = self.get_most_frequent_class()
        
        default_class = cutoff_class

        total_errors_without_default = 0
        
        lowest_total_error = dataset_len - cutoff_class_count
        
        # implement comparators
        rules.sort(reverse=True)
        
        for rule in rules:
            covered_antecedent, covered_consequent = self.__dataframe.find_covered_by_rule_mask(rule)

            
            # dataset -= covered_antecedent
            #dataset_mask = dataset_mask & np.logical_not(covered_antecedent)

            correctly_covered = covered_antecedent & covered_consequent & dataset_mask
            
            #print("correctly covered from mask", np.sum(correctly_covered & dataset_mask))
            
            if not any(correctly_covered & dataset_mask):
                rules.remove(rule)
            else:
                misclassified = np.sum(covered_antecedent) - np.sum(correctly_covered)
                
                total_errors_without_default += misclassified
                
                # dataset -= covered_antecedent
                dataset_mask = dataset_mask & np.logical_not(covered_antecedent)
                
                modified_dataset = dataset[dataset_mask]
                class_values = self.__dataframe.dataframe["class"][dataset_mask].values
                
                default_class, default_class_count = self.get_most_frequent_from_numpy(class_values)
                
                # don't forget to update dataset length
                default_rule_error = np.sum(dataset_mask) - default_class_count
                total_errors_with_default = default_rule_error + total_errors_without_default
                
                
                if total_errors_with_default < lowest_total_error:
                    cutoff_rule = rule
                    lowest_total_error = total_errors_with_default
                    print(default_class)
                    cutoff_class = default_class
        
        
        # remove all rules below cutoff rule
        index_to_cut = rules.index(cutoff_rule)
        rules_pruned = rules[:index_to_cut+1]
        
        # append new default rule
        empty_rule = cutoff_rule.copy()
        empty_rule.antecedent = []
        empty_rule.consequent = self.__dataframe.dataframe.columns[-1], cutoff_class
        
        
        #rules_pruned.append(empty_rule)
        
        return rules_pruned, cutoff_class
        
        
        
        

rulepostpruner = RulePostPruner(qds)


qrules2 = [ rule.copy() for rule in qrules ]


rulepostpruner.transform(qrules2)


box-office-bomb
box-office-bomb
box-office-bomb
box-office-bomb


([CAR {a-list-celebrities=<0.0;2.0)} => {class=box-office-bomb} sup: 0.17 conf: 1.00 len: 2, id: 42,
  CAR {a-list-celebrities=<4.0;6.0),estimated-budget=<0.0;50.0)} => {class=critical-success} sup: 0.09 conf: 1.00 len: 3, id: 24,
  CAR {estimated-budget=<250.0;300.0)} => {class=main-stream-hit} sup: 0.06 conf: 1.00 len: 2, id: 4,
  CAR {a-list-celebrities=<4.0;6.0),estimated-budget=<100.0;150.0)} => {class=main-stream-hit} sup: 0.06 conf: 1.00 len: 3, id: 10,
  CAR {a-list-celebrities=<6.0;8.0)} => {class=critical-success} sup: 0.06 conf: 0.67 len: 2, id: 1],
 'box-office-bomb')

In [13]:
class RuleOverlapPruner:
    
    def __init__(self, quantitative_dataset):
        self.__dataframe = quantitative_dataset
        
        
    def transform(self, rules, default_class):
        copied_rules = [ rule.copy() for rule in rules ]

        pruned_rules = self.prune_transaction_based(copied_rules, default_class)        
        
        return pruned_rules
    
    def prune_transaction_based(self, rules, default_class):
        """Transaction based
        """
        
        new_rules = []
        
        for idx, rule in enumerate(rules):
            
            rule_classname, rule_classval = rule.consequent
            
            if rule_classval == default_class:
                continue
                
                
            correctly_covered_antecedent, correctly_covered_consequent = self.__dataframe.find_covered_by_rule_mask(rule)
            correctly_covered = correctly_covered_antecedent & correctly_covered_consequent
            
            non_empty_intersection = False
            
            for candidate_clash in rules[:idx]:
                
                cand_classname, cand_classval = candidate_clash.consequent
                
                if cand_classval == default_class:
                    continue
                    
                    
                cand_clash_covered_antecedent, cand_clash_covered_consequent = self.__dataframe.find_covered_by_rule_mask(candidate_clash)
                
                
                if any(cand_clash_covered_antecedent & correctly_covered):
                    non_empty_intersection = True
                    break
                    
            if non_empty_intersection == True:
                new_rules.append(rule)
                
            
            return new_rules
        
    
    
    

    
    def prune_range_based(self, rules, default_class):
        
        """Transaction based
        """
        
        new_rules = []
        
        for idx, rule in enumerate(rules):
            
            rule_classname, rule_classval = rule.consequent
            
            if rule_classval == default_class:
                continue
                
                
            literals = dict(rule.antecedent)
            attributes = literals.keys()

            clashing_rule_found = False
            
            """
            correctly_covered_antecedent, correctly_covered_consequent = self.__dataframe.find_covered_by_rule_mask(rule)
            correctly_covered = correctly_covered_antecedent & correctly_covered_consequent
            
            non_empty_intersection = False
            """
            
            
            for candidate_clash in rules[:idx]:
                
                cand_classname, cand_classval = candidate_clash.consequent
                
                if cand_classval == default_class:
                    continue
                    
                attributes_candclash = dict(candidate_clash.antecedent).keys()
                shared_attributes = set(attributes) & set(attributes_candclash)
                
                if not shared_attributes:
                    clashing_rule_found = True
                    break
                    
                clash_cand_antecedent_dict = dict(candidate_clash.antecedent)
                literals_in_clash_shared_att = [ (key, clash_cand_antecedent_dict[key]) for key in shared_attributes  ]
                
                at_least_one_attribute_disjunct = False
                
                for literal in literals_in_clash_shared_att:
                    attribute, value = literal
                    
                    temp_literal = attribute, literals[attribute]
                    
                    
                    
            if non_empty_intersection == True:
                new_rules.append(rule)
                
            
        return new_rules

In [14]:
rule_overlap_pruner = RuleOverlapPruner(qds)


rule_overlap_pruner.transform(qrules, "box-office-bomb")

[]

In [15]:
5

5

In [16]:
from pyarc import CBA, TransactionDB
import pandas as pd

data_train = pd.read_csv("c:/code/python/machine_learning/assoc_rules/train/segment0.csv")
data_test = pd.read_csv("c:/code/python/machine_learning/assoc_rules/test/segment0.csv")

txns_train = TransactionDB.from_DataFrame(data_train)
txns_test = TransactionDB.from_DataFrame(data_test)


cba = CBA(support=0.1, confidence=0.7, algorithm="m1")
cba.fit(txns_train)

rules_to_optimize = cba.clf.rules


len(rules_to_optimize)

21

In [17]:
accuracy = cba.rule_model_accuracy(txns_test) 
accuracy

rules_to_optimize

[CAR {intensity-mean=82.9815_to_inf} => {class=sky} sup: 0.14 conf: 1.00 len: 2, id: 1012,
 CAR {hue-mean=0.64353_to_inf} => {class=grass} sup: 0.14 conf: 1.00 len: 2, id: 408,
 CAR {value-mean=48.83335_to_77.8889,intensity-mean=44.92595_to_63.57405,saturation-mean=0.2751625_to_0.321112,hue-mean=-2.094395_to_-1.85084} => {class=path} sup: 0.09 conf: 0.87 len: 5, id: 508,
 CAR {intensity-mean=44.92595_to_63.57405,saturation-mean=0.2751625_to_0.321112,hue-mean=-2.094395_to_-1.85084} => {class=path} sup: 0.09 conf: 0.87 len: 4, id: 604,
 CAR {value-mean=48.83335_to_77.8889,hue-mean=-2.094395_to_-1.85084,saturation-mean=0.2751625_to_0.321112,rawgreen-mean=36.05555_to_55.83335} => {class=path} sup: 0.09 conf: 0.86 len: 5, id: 682,
 CAR {saturation-mean=0.2751625_to_0.321112,rawgreen-mean=36.05555_to_55.83335,exred-mean=-23.8889_to_-13.94445} => {class=path} sup: 0.10 conf: 0.84 len: 4, id: 776,
 CAR {value-mean=48.83335_to_77.8889,saturation-mean=0.2751625_to_0.321112,rawgreen-mean=36.05555

In [18]:
rules_to_optimize[4]

CAR {value-mean=48.83335_to_77.8889,hue-mean=-2.094395_to_-1.85084,saturation-mean=0.2751625_to_0.321112,rawgreen-mean=36.05555_to_55.83335} => {class=path} sup: 0.09 conf: 0.86 len: 5, id: 682

In [19]:
segment_undiscr = pd.read_csv("C:/code/tmp/segment.csv")

cols = segment_undiscr.columns

segment_undiscr.columns = list(map(lambda s: s.lower(), cols))


segment_undiscr["class"] = segment_undiscr.index
segment_undiscr.reset_index(inplace=True)
segment_undiscr.drop(["index"], axis=1, inplace=True)
segment_undiscr["class"] = segment_undiscr["class"].map(lambda s: s.lower())

segment_undiscr.head()



Unnamed: 0,region-centroid-col,region-centroid-row,region-pixel-count,short-line-density-5,short-line-density-2,vedge-mean,vedge-sd,hedge-mean,hedge-sd,intensity-mean,rawred-mean,rawblue-mean,rawgreen-mean,exred-mean,exblue-mean,exgreen-mean,value-mean,saturation-mean,hue-mean,class
0,140.0,125.0,9,0.0,0.0,0.277778,0.062963,0.666667,0.311111,6.185185,7.333334,7.666666,3.555556,3.444444,4.444445,-7.888889,7.777778,0.545635,-1.121818,brickface
1,188.0,133.0,9,0.0,0.0,0.333333,0.266667,0.5,0.077778,6.666666,8.333334,7.777778,3.888889,5.0,3.333333,-8.333333,8.444445,0.53858,-0.924817,brickface
2,105.0,139.0,9,0.0,0.0,0.277778,0.107407,0.833333,0.522222,6.111111,7.555555,7.222222,3.555556,4.333334,3.333333,-7.666666,7.555555,0.532628,-0.965946,brickface
3,34.0,137.0,9,0.0,0.0,0.5,0.166667,1.111111,0.474074,5.851852,7.777778,6.444445,3.333333,5.777778,1.777778,-7.555555,7.777778,0.573633,-0.744272,brickface
4,39.0,111.0,9,0.0,0.0,0.722222,0.374074,0.888889,0.429629,6.037037,7.0,7.666666,3.444444,2.888889,4.888889,-7.777778,7.888889,0.562919,-1.175773,brickface


In [20]:
interval_reader = IntervalReader()

interval_reader.closed_bracket = "", "NULL"
interval_reader.open_bracket = "NULL", ""
interval_reader.infinity_symbol = "inf", "inf"
interval_reader.members_separator = "_to_"

interval_reader.compile_reader()

i = interval_reader.read("82.9815_to_inf")

In [21]:
i.refit([83, 84, 85])

Interval[<83;85>]

In [22]:
QuantitativeCAR.interval_reader = interval_reader

QuantitativeCAR.interval_reader

<__main__.IntervalReader at 0x20c7eb8e710>

In [23]:
quant_rules_to_optimize = [ QuantitativeCAR(r) for r in rules_to_optimize ]

In [24]:
quant_rules_to_optimize

[CAR {intensity-mean=<82.9815;inf)} => {class=sky} sup: 0.14 conf: 1.00 len: 2, id: 1012,
 CAR {hue-mean=<0.64353;inf)} => {class=grass} sup: 0.14 conf: 1.00 len: 2, id: 408,
 CAR {hue-mean=<-2.094395;-1.85084),intensity-mean=<44.92595;63.57405),saturation-mean=<0.2751625;0.321112),value-mean=<48.83335;77.8889)} => {class=path} sup: 0.09 conf: 0.87 len: 5, id: 508,
 CAR {hue-mean=<-2.094395;-1.85084),intensity-mean=<44.92595;63.57405),saturation-mean=<0.2751625;0.321112)} => {class=path} sup: 0.09 conf: 0.87 len: 4, id: 604,
 CAR {hue-mean=<-2.094395;-1.85084),rawgreen-mean=<36.05555;55.83335),saturation-mean=<0.2751625;0.321112),value-mean=<48.83335;77.8889)} => {class=path} sup: 0.09 conf: 0.86 len: 5, id: 682,
 CAR {exred-mean=<-23.8889;-13.94445),rawgreen-mean=<36.05555;55.83335),saturation-mean=<0.2751625;0.321112)} => {class=path} sup: 0.10 conf: 0.84 len: 4, id: 776,
 CAR {rawgreen-mean=<36.05555;55.83335),saturation-mean=<0.2751625;0.321112),value-mean=<48.83335;77.8889)} => {c

In [25]:
dataset_segment_quantitative = QuantitativeDataFrame(segment_undiscr)

In [26]:
quant_rules_to_optimize[:1]

[CAR {intensity-mean=<82.9815;inf)} => {class=sky} sup: 0.14 conf: 1.00 len: 2, id: 1012]

In [27]:
rule_refitter = RuleRefitter(dataset_segment_quantitative)

refitted_qrules = rule_refitter.transform(quant_rules_to_optimize)

In [28]:
rule_literal_pruner = RuleLiteralPruner(dataset_segment_quantitative)
literal_pruned_qrules = rule_literal_pruner.transform(refitted_qrules)

literal_pruned_qrules

[CAR {intensity-mean=<90.62963;143.44444>} => {class=sky} sup: 0.14 conf: 1.00 len: 2, id: 1012,
 CAR {hue-mean=<1.7566451;2.8649306>} => {class=grass} sup: 0.14 conf: 1.00 len: 2, id: 408,
 CAR {hue-mean=<-2.0943026999999996;-1.8860723000000001>,intensity-mean=<45.037037;63.22221999999999>,saturation-mean=<0.2753682;0.3181372>,value-mean=<49.0;77.77778>} => {class=path} sup: 0.10 conf: 0.95 len: 5, id: 508,
 CAR {hue-mean=<-2.0943026999999996;-1.8860723000000001>,intensity-mean=<45.037037;63.22221999999999>,saturation-mean=<0.2753682;0.3181372>} => {class=path} sup: 0.10 conf: 0.95 len: 4, id: 604,
 CAR {hue-mean=<-2.0943026999999996;-1.8860723000000001>,rawgreen-mean=<36.11111;55.66666800000001>,saturation-mean=<0.2753682;0.3181372>,value-mean=<49.0;77.77778>} => {class=path} sup: 0.10 conf: 0.91 len: 5, id: 682,
 CAR {exred-mean=<-23.222220999999998;-14.0>,rawgreen-mean=<36.11111;55.66666800000001>,saturation-mean=<0.2753682;0.3181372>} => {class=path} sup: 0.10 conf: 0.96 len: 4, i

In [45]:
rule_trimmer = RuleTrimmer(dataset_segment_quantitative)
trimmed_rules = rule_trimmer.transform(literal_pruned_qrules)


trimmed_rules

[CAR {intensity-mean=<90.62963;143.44444>} => {class=sky} sup: 0.14 conf: 1.00 len: 2, id: 1012,
 CAR {hue-mean=<1.7566451;2.8649306>} => {class=grass} sup: 0.14 conf: 1.00 len: 2, id: 408,
 CAR {hue-mean=<-2.0752265;-1.9474715>,intensity-mean=<47.25926;63.22221999999999>,saturation-mean=<0.27882218;0.3181372>,value-mean=<58.55555699999999;77.77778>} => {class=path} sup: 0.10 conf: 0.95 len: 5, id: 508,
 CAR {hue-mean=<-2.0752265;-1.9474715>,intensity-mean=<47.25926;63.22221999999999>,saturation-mean=<0.27882218;0.3181372>} => {class=path} sup: 0.10 conf: 0.95 len: 4, id: 604,
 CAR {hue-mean=<-2.0752265;-1.9474715>,rawgreen-mean=<36.555557;55.66666800000001>,saturation-mean=<0.27838665;0.3181372>,value-mean=<50.666668;77.77778>} => {class=path} sup: 0.10 conf: 0.91 len: 5, id: 682,
 CAR {exred-mean=<-23.222220999999998;-14.0>,rawgreen-mean=<36.333335999999996;55.66666800000001>,saturation-mean=<0.28533262;0.3181372>} => {class=path} sup: 0.10 conf: 0.96 len: 4, id: 776,
 CAR {rawgreen-

In [30]:
rule_extender = RuleExtender(dataset_segment_quantitative)
extended_rules = rule_extender.transform(trimmed_rules)

In [31]:
idx = 9
extended_rules[idx], trimmed_rules[idx]

(CAR {exred-mean=<-48.22222;-13.333332999999998>,hue-mean=<-2.0752265;-1.9730555>,intensity-mean=<47.25926;63.22221999999999>,rawgreen-mean=<41.444443;55.66666800000001>} => {class=path} sup: 0.09 conf: 0.79 len: 5, id: 588,
 CAR {exred-mean=<-21.0;-14.0>,hue-mean=<-2.0752265;-1.9730555>,intensity-mean=<47.25926;63.22221999999999>,rawgreen-mean=<41.444443;55.66666800000001>} => {class=path} sup: 0.09 conf: 0.75 len: 5, id: 588)

In [32]:
rule_postpruner = RulePostPruner(dataset_segment_quantitative)
postpruned_rules, def_class = rule_postpruner.transform(extended_rules)

postpruned_rules

brickface
brickface
brickface


[CAR {hue-mean=<1.7566451;2.8649306>} => {class=grass} sup: 0.14 conf: 1.00 len: 2, id: 408,
 CAR {intensity-mean=<90.62963;143.44444>} => {class=sky} sup: 0.14 conf: 1.00 len: 2, id: 1012,
 CAR {hue-mean=<-2.1093748;-1.9474715>,intensity-mean=<47.25926;63.22221999999999>,saturation-mean=<0.27882218;0.3181372>} => {class=path} sup: 0.10 conf: 1.00 len: 4, id: 604]

In [39]:
class QuantitativeClassifier:
    
    
    def __init__(self, rules, default_class):
        self.rules = rules
        self.default_class = default_class
        
        
    def test(quantitative_dataframe):
        if type(quantitative_dataframe) != QuantitativeDataFrame:
            raise Exception("Type of quantitative_dataframe must be QuantitativeDataFrame")
            
            
        for index, row in dataset_segment_quantitative.dataframe.iterrows():
            pass

            
r = postpruned_rules[0]
antecedent_dict = dict(r.antecedent)    
    

predicted_classes = []
    
for _, row in dataset_segment_quantitative.dataframe.iterrows():
    appended = False
    for rule in postpruned_rules:
        antecedent_dict = dict(rule.antecedent)  
        counter = True

        for name, value in row.iteritems():
            if name in antecedent_dict:
                result = antecedent_dict[name].isin(value)
                counter &= result

        if counter:
            _, predicted_class = rule.consequent
            predicted_classes.append(predicted_class)
            appended = True
            break
            
    if not appended:
        predicted_classes.append(def_class)


predicted_classes




['brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'br

In [40]:
from sklearn.metrics import accuracy_score

accuracy_score(predicted_classes, dataset_segment_quantitative.dataframe["class"].values)

0.53333333333333333

In [34]:
%run ../../main.py

In [41]:
from pyarc.qcba.classifier import QuantitativeClassifier

qclf = QuantitativeClassifier(postpruned_rules, def_class)
qclf.test(dataset_segment_quantitative)

['brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'sky',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'brickface',
 'br

In [36]:
from sklearn.metrics import accuracy_score


accuracy_score(dataset_segment_quantitative.dataframe["class"].values, predicted_classes)

0.2857142857142857

In [37]:
list(zip(dataset_segment_quantitative.dataframe["class"].values, predicted_classes))

[('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('brickface', 'brickface'),
 ('sky', 'brickface'),
 ('sky', 'brickface'),
 ('sky', 'brickface'),
 ('sky', 'brickface'),
 ('sky', 'brickface'),
 ('sky', 'brick

In [38]:
dataset_segment_quantitative.dataframe

Unnamed: 0,region-centroid-col,region-centroid-row,region-pixel-count,short-line-density-5,short-line-density-2,vedge-mean,vedge-sd,hedge-mean,hedge-sd,intensity-mean,rawred-mean,rawblue-mean,rawgreen-mean,exred-mean,exblue-mean,exgreen-mean,value-mean,saturation-mean,hue-mean,class
0,140.0,125.0,9,0.000000,0.0,0.277778,0.062963,0.666667,0.311111,6.185185,7.333334,7.666666,3.555556,3.444444,4.444445,-7.888889,7.777778,0.545635,-1.121818,brickface
1,188.0,133.0,9,0.000000,0.0,0.333333,0.266667,0.500000,0.077778,6.666666,8.333334,7.777778,3.888889,5.000000,3.333333,-8.333333,8.444445,0.538580,-0.924817,brickface
2,105.0,139.0,9,0.000000,0.0,0.277778,0.107407,0.833333,0.522222,6.111111,7.555555,7.222222,3.555556,4.333334,3.333333,-7.666666,7.555555,0.532628,-0.965946,brickface
3,34.0,137.0,9,0.000000,0.0,0.500000,0.166667,1.111111,0.474074,5.851852,7.777778,6.444445,3.333333,5.777778,1.777778,-7.555555,7.777778,0.573633,-0.744272,brickface
4,39.0,111.0,9,0.000000,0.0,0.722222,0.374074,0.888889,0.429629,6.037037,7.000000,7.666666,3.444444,2.888889,4.888889,-7.777778,7.888889,0.562919,-1.175773,brickface
5,16.0,128.0,9,0.000000,0.0,0.500000,0.077778,0.666667,0.311111,5.555555,6.888889,6.666666,3.111111,4.000000,3.333333,-7.333334,7.111111,0.561508,-0.985811,brickface
6,26.0,67.0,9,0.111111,0.0,1.000000,0.888890,2.444445,3.185185,20.000000,19.555555,25.888890,14.555555,-1.333333,17.666666,-16.333334,25.888890,0.436939,-1.623202,brickface
7,14.0,110.0,9,0.000000,0.0,1.722222,5.351850,2.666667,1.022223,17.925926,18.888890,21.444445,13.444445,2.888889,10.555555,-13.444445,21.444445,0.368848,-1.345096,brickface
8,11.0,108.0,9,0.000000,0.0,1.333333,0.800000,1.388889,0.951852,17.666666,19.000000,21.111110,12.888889,4.000000,10.333333,-14.333333,21.111110,0.388756,-1.302133,brickface
9,85.0,101.0,9,0.000000,0.0,1.333333,1.288888,1.277778,1.218518,21.296297,21.222221,26.777779,15.888889,-0.222222,16.444445,-16.222221,26.777779,0.404792,-1.558599,brickface
