In [41]:
from teacher.fuzzy import FuzzyVariable, FuzzyContinuousSet, FuzzyDiscreteSet
from teacher.datasets import load_german, load_compas, load_adult, load_heloc
import numpy as np
import json
import copy
import random
from sklearn.model_selection import train_test_split
from teacher.fuzzy import get_fuzzy_points, get_fuzzy_variables


In [18]:
random.seed(42)
np.random.seed(42)

In [19]:
from flocalx.rule import Rule, FuzzyRule, NumericAntecedent, CategoricalAntecedent, FuzzyAntecedent, RuleSet, FuzzyRuleSet, FLocalX

In [20]:
DATASETS = {
    'adult': load_adult,
    'compas': load_compas,
    'german': load_german,
    'fico': load_heloc
}

In [21]:
fs1 = FuzzyContinuousSet("low",[0, 0.5, 1])
fs2 = FuzzyContinuousSet("medium",[0.5, 1, 1.5])
fs3 = FuzzyContinuousSet("high",[1, 1.5, 2])

In [22]:
fv = FuzzyVariable("fv", [fs1, fs2, fs3])

In [23]:
a = NumericAntecedent(0, [0, 1])

In [24]:
b = CategoricalAntecedent(1, [['red', True]])

In [25]:
r = Rule([a, b], 1)

In [26]:
print(r)

[0 is [0, 1], 1 is [['red', True]]] -> 1


In [27]:
x = [0.5, 'red']

In [28]:
r.match(x)

1

In [29]:
c = FuzzyAntecedent(0, fs1)

In [30]:
r2 = FuzzyRule([c, b], 1, 0.5)
x2 = [0.25, 'red']

r2.match(x2)

0.25

In [31]:
db = 'adult'
method = 'flare_NN'
operators = ['similar_rule_fusion']
n_explanations = 1000

folder = 'data'
# list all files in folder

from os import listdir
from os.path import isfile, join
my_path = './data/rulesets/fuzzy/'
onlyfiles = [f for f in listdir(my_path) if isfile(join(my_path, f)) and db in f and method in f]

In [32]:
onlyfiles

['adult_flare_NN_13640.json',
 'adult_flare_NN_13210.json',
 'adult_flare_NN_9490.json',
 'adult_flare_NN_20120.json',
 'adult_flare_NN_20570.json',
 'adult_flare_NN_24860.json',
 'adult_flare_NN_6620.json',
 'adult_flare_NN_6270.json',
 'adult_flare_NN_14490.json',
 'adult_flare_NN_10980.json',
 'adult_flare_NN_18260.json',
 'adult_flare_NN_10650.json',
 'adult_flare_NN_18630.json',
 'adult_flare_NN_10200.json',
 'adult_flare_NN_23130.json',
 'adult_flare_NN_23560.json',
 'adult_flare_NN_5630.json',
 'adult_flare_NN_5260.json',
 'adult_flare_NN_630.json',
 'adult_flare_NN_260.json',
 'adult_flare_NN_13990.json',
 'adult_flare_NN_17480.json',
 'adult_flare_NN_3160.json',
 'adult_flare_NN_3530.json',
 'adult_flare_NN_7820.json',
 'adult_flare_NN_11780.json',
 'adult_flare_NN_16100.json',
 'adult_flare_NN_16550.json',
 'adult_flare_NN_12840.json',
 'adult_flare_NN_25660.json',
 'adult_flare_NN_25230.json',
 'adult_flare_NN_8540.json',
 'adult_flare_NN_4830.json',
 'adult_flare_NN_8110.js

In [42]:
ds = 'adult'
ruleset = []
for file in onlyfiles:
    with open(join(my_path, file), 'r') as f:
        ruleset += json.load(f)

if n_explanations > len(ruleset):
    n_explanations = len(ruleset)
ruleset = random.sample(ruleset, n_explanations)

dataset = DATASETS[ds](normalize=True)
class_name = dataset['class_name']

df = dataset['df']
X = df.drop(class_name, axis=1)
y = df[class_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

antecedent_order = {v: k for k, v in dataset['idx_features'].items()}
dataset_info = {
    'antecedent_order': antecedent_order,
    'discrete': set(dataset['discrete']),
    'continuous': set(dataset['continuous'])
}

In [35]:
flocal = FLocalX.from_json(ruleset, dataset_info, ['similar_rule_fusion'])

In [36]:
flocal.size(), flocal.rule_size()

(121, 2.4297520661157024)

In [43]:
continuous = dataset['continuous']  
discrete = dataset['discrete']

X_num = X_train[continuous]

fuzzy_points = get_fuzzy_points('equal_width', continuous, X_num, sets=5)
cate = [col for col in discrete if col != class_name]

discrete_fuzzy_values = {col: X_train[col].unique() for col in cate}
fuzzy_variables_order = {col: i for i, col in enumerate(X_train.columns)}
fuzzy_variables = get_fuzzy_variables(fuzzy_points, discrete_fuzzy_values, fuzzy_variables_order)


In [48]:
fuzzy_variables[4]

FuzzyVariable(name='occupation', fuzzy_sets=[FuzzyDiscreteSet(name='Prof-specialty', value='Prof-specialty'), FuzzyDiscreteSet(name='Exec-managerial', value='Exec-managerial'), FuzzyDiscreteSet(name='Craft-repair', value='Craft-repair'), FuzzyDiscreteSet(name='Farming-fishing', value='Farming-fishing'), FuzzyDiscreteSet(name='Other-service', value='Other-service'), FuzzyDiscreteSet(name='Machine-op-inspct', value='Machine-op-inspct'), FuzzyDiscreteSet(name='Sales', value='Sales'), FuzzyDiscreteSet(name='Handlers-cleaners', value='Handlers-cleaners'), FuzzyDiscreteSet(name='Transport-moving', value='Transport-moving'), FuzzyDiscreteSet(name='Protective-serv', value='Protective-serv'), FuzzyDiscreteSet(name='Adm-clerical', value='Adm-clerical'), FuzzyDiscreteSet(name='Priv-house-serv', value='Priv-house-serv'), FuzzyDiscreteSet(name='Tech-support', value='Tech-support'), FuzzyDiscreteSet(name='Armed-Forces', value='Armed-Forces')])

In [62]:
len(flocal.rules)

121

In [63]:
len(map_to_new_fuzzy_variables(flocal.rules, fuzzy_variables))

90

In [None]:
flocal.score(X_df.iloc[0:100], dataset['y'][0:100])

{1: 151.40400667088687, 0: 548.2334096159361}


0.75

In [None]:
flocal.fit(X_df.iloc[0:100], dataset['y'][0:100])

In [None]:
flocal.size(), flocal.rule_size()

(102, 2.4607843137254903)

In [None]:
flocal.score(X_df.iloc[0:100], dataset['y'][0:100])

{0: 550.8085006585272, 1: 216.37223388081244}


0.75

In [None]:
same_antecedent_rules = flocal._group_rules_by_antecedent()

In [None]:
same_antecedent_rules

{((0, 1), 1): [14, 72],
 ((0,), 1): [18, 35, 39, 46, 62],
 ((0, 2, 3), 1): [32, 93],
 ((5, 8), 0): [53, 91],
 ((0, 2, 11), 1): [61, 90, 100],
 ((2, 4, 8), 1): [69, 95]}

In [None]:
    combined_rules = [
        [list(flocal.rules)[i] for i in same_antecedent_rules[k]]
        for k in same_antecedent_rules
    ]

In [None]:
for g in combined_rules:
    print(g)
    flocal._combine_rules_with_same_antecedent([g], X_df.iloc[0:100], dataset['y'][0:100])


[1.0 : [0 is 1.2769844838173166, 1 is [['Local-gov', True]]] -> 1, 2.0 : [0 is 1.3502970677269004, 1 is [['Private', True]]] -> 1]
[2.0 : [0 is 0.2139520171283513] -> 1, 1.0 : [0 is 0.030670557354391753] -> 1, 1.0 : [0 is -0.5558301139222788] -> 1, 1.0 : [0 is 0.47054606081189465] -> 1, 2.0 : [0 is 0.5805149366762704] -> 1]
[4.0 : [0 is 0.3605771849475189, 2 is [['Some-college', True], ['Doctorate', True]], 3 is [['Married-civ-spouse', True]]] -> 1, 1.0 : [0 is -0.1892671943743597, 2 is [['Assoc-voc', True]], 3 is [['Married-civ-spouse', True]]] -> 1]
[1.0 : [5 is [['Husband', True]], 8 is -0.14592048355885345] -> 0, 2.0 : [5 is [['Not-in-family', True]], 8 is -0.008076831491555012] -> 0]
[1.0 : [0 is -0.482517530012695, 2 is [['Bachelors', True]], 11 is [['United-States', True]]] -> 1, 0.9772727272727273 : [0 is -0.042642026555192065, 2 is [['Bachelors', True]], 11 is [['United-States', True]]] -> 1, 1.0 : [0 is 0.2506083090831432, 2 is [['Bachelors', True]], 11 is [['United-States', 

In [None]:
a, b = combined_rules[0]
b

2.0 : [0 is 1.3502970677269004, 1 is [['Private', True]]] -> 1

In [None]:
a.fusion(b)

3.0 : [0 is 1.3136407757721085, 1 is [['Local-gov', True], ['Private', True]]] -> 1

In [None]:
combined_rules

[[1.0 : [0 is 1.2769844838173166, 1 is [['Local-gov', True]]] -> 1,
  2.0 : [0 is 1.3502970677269004, 1 is [['Private', True]]] -> 1],
 [2.0 : [0 is 0.2139520171283513] -> 1,
  1.0 : [0 is 0.030670557354391753] -> 1,
  1.0 : [0 is -0.5558301139222788] -> 1,
  1.0 : [0 is 0.47054606081189465] -> 1,
  2.0 : [0 is 0.5805149366762704] -> 1],
 [4.0 : [0 is 0.3605771849475189, 2 is [['Some-college', True], ['Doctorate', True]], 3 is [['Married-civ-spouse', True]]] -> 1,
  1.0 : [0 is -0.1892671943743597, 2 is [['Assoc-voc', True]], 3 is [['Married-civ-spouse', True]]] -> 1],
 [1.0 : [5 is [['Husband', True]], 8 is -0.14592048355885345] -> 0,
  2.0 : [5 is [['Not-in-family', True]], 8 is -0.008076831491555012] -> 0],
 [1.0 : [0 is -0.482517530012695, 2 is [['Bachelors', True]], 11 is [['United-States', True]]] -> 1,
  0.9772727272727273 : [0 is -0.042642026555192065, 2 is [['Bachelors', True]], 11 is [['United-States', True]]] -> 1,
  1.0 : [0 is 0.2506083090831432, 2 is [['Bachelors', True]]

In [None]:
len(flocal.rules)

102

In [None]:
flocal.score(X_df.iloc[0:100], dataset['y'][0:100])

{0: 550.8085006585272, 1: 216.37223388081244}


0.75

In [None]:
def get_similar_rules(ruleset, x, lam=0.3):
    lam = 0.3
    matches = sorted([(i, rule.match(x)) for i, rule in enumerate(ruleset) if rule.match(x) > 0.0001], reverse=True, key=lambda x: x[1])
    if not matches:
        return []
    group = [matches[0][0]]
    prev_match = matches[0][1]
    for (r, match) in matches[1:]:
        factor = prev_match / match
        if factor > 1 + lam:
            break
        group.append(r) 
    return group

In [None]:
def group_rules(ruleset, X, lam=0.3):
    return [get_similar_rules(ruleset, x, lam) for x in X]

In [None]:
grouped_rules = group_rules(list(flocal.rules), X_df.to_numpy()[0:100], lam=0.05)

In [None]:
rule_super_structure = {}
rule_list = list(flocal.rules)
for group in grouped_rules:
    for x in group:
        if x not in rule_super_structure:
            rule_super_structure[x] = [rule_list[x], {a:1 for a in group if a != x}, 1]
        else:
            rule_super_structure[x][2] += 1
            for a in group:
                if a != x:
                    if a not in rule_super_structure[x][1]:
                        rule_super_structure[x][1][a] = 1
                    else:
                        rule_super_structure[x][1][a] += 1

refined_rule_super_structure = {}
min_occur = 3
overlap = 0.7

for r in rule_super_structure:
    if rule_super_structure[r][2] >= min_occur:
        closest_rules = {i: v for i, v in rule_super_structure[r][1].items() if overlap <= v / rule_super_structure[r][2]}
        if closest_rules:
            refined_rule_super_structure[r] = [rule_super_structure[r][0], closest_rules]
        

In [None]:
refined_rule_super_structure.keys()

dict_keys([11, 16, 49, 53, 70, 75, 6, 34, 82, 39, 50, 59, 67, 77, 18, 10, 44, 9, 30, 8, 7, 88, 12, 68])

In [None]:
super_grouped_rules = []
visited = set([])
for r in refined_rule_super_structure:
    if r not in visited:
        all_r = set([r])
        len_all_r = len(all_r)
        change = True
        while change:
            for k in all_r:
                if k not in visited:
                    try:
                        all_r = all_r.union(set(refined_rule_super_structure[k][1].keys()))
                    except KeyError:
                        pass
            if len(all_r) == len_all_r:
                change = False
            else:
                len_all_r = len(all_r)
        visited.update(all_r)
        super_grouped_rules.append(sorted(list(all_r)))

In [None]:
fused = set([])
for i, gr in enumerate(super_grouped_rules):
    gr = set(gr)
    if i not in fused:
        for j, ggr in enumerate(super_grouped_rules[i+1:]):
            if set(gr).intersection(set(ggr)):
                gr.update(set(ggr))
                fused.add(j)
        fused.add(i)
    print(gr)

{67, 68, 70, 6, 8, 9, 10, 11, 75, 77, 7, 12, 16, 82, 88, 30, 34, 39, 44, 49, 50, 53, 59}
{70, 75, 16, 49, 53}
{34, 6, 16, 49, 82}
{16, 49, 39}
{67, 68, 6, 7, 8, 9, 10, 12, 77, 16, 82, 88, 30, 34, 44, 49, 50, 53, 59}
{18, 62}
{16, 49, 10, 53}
{16, 44}
{16, 9, 49}
{67, 77, 16, 49, 59, 30}
{8, 16, 49}
{16, 49, 88, 7}
{7, 12, 16, 49, 88}
{34, 68, 6, 7, 16, 49, 82, 88}


In [None]:
for k in all_r:
    try:
        all_r = all_r.union(set(refined_rule_super_structure[k][1].keys()))
    except KeyError:
        pass
all_r

{6, 7, 16, 34, 49, 68, 82, 88}

In [None]:
for k in all_r:
    try:
        all_r = all_r.union(set(refined_rule_super_structure[k][1].keys()))
    except KeyError:
        pass
all_r

{6, 7, 16, 34, 49, 68, 82, 88}

In [None]:
def group_rules_by_antecedent(ruleset):
    grouped_rules = {}
    for i, rule in enumerate(ruleset):
        key = (tuple(sorted([x.variable for x in rule.antecedent])), rule.consequent)
        if key not in grouped_rules:
            grouped_rules[key] = [i]
        else:
            grouped_rules[key].append(i)
    return {k: v for k, v in grouped_rules.items() if len(v) > 1}

In [None]:
aggregable_rules = group_rules_by_antecedent(flocal.rules)

In [None]:
len(flocal.rules)

102

In [None]:
len(aggregable_rules)

6

In [None]:
def rule_fusion(rule1, rule2):
    new_antecedent = []
    for a, b in zip(rule1.antecedent, rule2.antecedent):
        if isinstance(a, FuzzyAntecedent) and isinstance(b, FuzzyAntecedent):
            new_antecedent.append(FuzzyAntecedent(a.variable, FuzzyContinuousSet.merge(a.fuzzy_set, b.fuzzy_set)))
        else:
            new_antecedent.append(CategoricalAntecedent(a.variable, a.values + b.values, operator='or'))
    
    return FuzzyRule(new_antecedent, rule1.consequent, rule1.weight + rule2.weight)

In [None]:
def improves(first, second, fusion, X, y, loss=0.95):
    return fusion.confidence(X, y) > loss * max(first.confidence(X, y), second.confidence(X, y))


In [None]:
def fuse_ruleset(ruleset, X, y):
    changes = True
    i = 0
    while changes and len(ruleset) > 1:
        # print('Iteration', i)
        ruleset = sorted(ruleset, key=lambda x: (x.support(X), x.confidence(X, y)), reverse=True)
        # print(ruleset)
        first = ruleset.pop(0)
        new_ruleset = []
        changes = False
        while ruleset:
            try:
                second = ruleset.pop(0)
            except:
                # print('Breaking')
                new_ruleset.append(first)
                break
            # print(f"Fusing rules {first} and {second}")
            fusion = rule_fusion(first, second)
            if improves(first, second, fusion, X, y):
                # print(f'Fusion improves: {fusion}')
                changes = True
                new_ruleset.append(fusion)
                if len(ruleset) > 1:
                    first = ruleset.pop(0)
                else:
                    new_ruleset += ruleset
                    break
                # print(f'New first: {first}')
            else:
                if len(ruleset) > 0:
                    new_ruleset.append(first)
                    first = second
                else:
                    new_ruleset.append(first)
                    new_ruleset.append(second)
                    break
        
        ruleset = new_ruleset
        i += 1

    return ruleset
    

In [None]:
new_fused_ruleset = [
    fuse_ruleset([list(flocal.rules)[i] for i in aggregable_rules[k]], X_df.iloc[0:100], dataset['y'][0:100])
    for k in aggregable_rules
]

In [None]:
obsolete_rules = []
for k in aggregable_rules:
    obsolete_rules += aggregable_rules[k]

In [None]:
old_ruleset = [r for i, r in enumerate(flocal.rules) if i not in obsolete_rules]

In [None]:
new_ruleset = []
for rule in old_ruleset:
    new_ruleset.append(rule)

for group in new_fused_ruleset:
    for rule in group:
        new_ruleset.append(rule)

In [None]:
len(new_ruleset)

102

In [None]:
flocal2 = copy.deepcopy(flocal)

In [None]:
flocal2.rules = set(new_ruleset)

In [None]:
flocal.score(X_df.iloc[0:5000], dataset['y'][0:5000])

{0: 30216.548567268816, 1: 9723.962613586245}


0.7558

In [None]:
flocal2.score(X_df.iloc[0:5000], dataset['y'][0:5000])

{1: 9723.962613586245, 0: 30216.548567268816}


0.7558

In [None]:
len(flocal.rules)

102

In [None]:
len(flocal2.rules)

102