# Generic Rule Induction Notebook

Continuoulsy refined.

In [26]:
#%store -z acc_list 
#%store -z prec_list 

## README

- GLRM needs to run in proper (conda) aix360 environment
- BRCG runs with proper (conda) aix360 environment
- Use aix360i environment for RIPPER

### Configuration


In [27]:
from config import config_dict
from config import config_dict_imbalanced

# document config order
CONFIG = config_dict_imbalanced["CONFIG-I62"]
#CONFIG = config_dict["CONFIG14"]

print('Proceed with configuration:', CONFIG["NAME"])

Proceed with configuration: fraud_detection


In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
# import os
from sklearn.model_selection import train_test_split #, GridSearchCV
from sklearn.metrics import matthews_corrcoef,fbeta_score,confusion_matrix,f1_score,precision_score, recall_score, accuracy_score, balanced_accuracy_score, confusion_matrix, r2_score, explained_variance_score, mean_absolute_error, max_error
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import wittgenstein as lw
import time
import warnings
import re

from aix360.algorithms.rbm import BRCGExplainer, BooleanRuleCG, GLRMExplainer, LinearRuleRegression
if CONFIG['BINARIZER'] == 'QUANTILE':
    from aix360.algorithms.rbm import FeatureBinarizer
elif CONFIG['BINARIZER'] == 'TREES':
    from aix360.algorithms.rbm import FeatureBinarizerFromTrees
if CONFIG['ALGO'] == 'RIPPER':
    from aix360i.algorithms.rule_induction.ripper import Ripper
if CONFIG['ALGO'] == 'CORELS':
    from corels import *



# from explainer import Explainer

# TODO create reference for performance using boosted trees

# import wittgenstein as lw
# from clf_utils import make_tree_dataset, make_forest, score_forest
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder


### Data

In [29]:
def convert(char):
    if char == CONFIG['POS_CLASS']:
        return 1
    else:
        return 0

df = pd.read_csv(CONFIG['DATA_SET'],dtype=CONFIG['DATA_TYPES'], nrows= 15000)
df = df.drop(columns=CONFIG['DROP'])
if CONFIG['ALGO'] == 'BRCG':
    df[CONFIG['TARGET_LABEL']] = df[CONFIG['TARGET_LABEL']].map(convert)
    CONFIG['POS_CLASS'] = 1
    # maybe this could also be achieved through explicit binarization of target vector
df.info()
df[CONFIG['TARGET_LABEL']].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10492 entries, 0 to 10491
Data columns (total 32 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  10492 non-null  int64  
 1   Time        10492 non-null  float64
 2   V1          10492 non-null  float64
 3   V2          10492 non-null  float64
 4   V3          10492 non-null  float64
 5   V4          10492 non-null  float64
 6   V5          10492 non-null  float64
 7   V6          10492 non-null  float64
 8   V7          10492 non-null  float64
 9   V8          10492 non-null  float64
 10  V9          10492 non-null  float64
 11  V10         10492 non-null  float64
 12  V11         10492 non-null  float64
 13  V12         10492 non-null  float64
 14  V13         10492 non-null  float64
 15  V14         10492 non-null  float64
 16  V15         10492 non-null  float64
 17  V16         10492 non-null  float64
 18  V17         10492 non-null  float64
 19  V18         10492 non-nul

0    10000
1      492
Name: Class, dtype: int64

In [30]:

# BRCG Miniloan sample
#if CONFIG['UNDERSAMP'] == True:
#    under_sampling_fraud = df[df[CONFIG['TARGET_LABEL']] == 1].sample(n=4000, random_state = 42)
#    under_size = under_sampling_fraud.size
#    under_sampling_legit = (df[df[CONFIG['TARGET_LABEL']] == 0])
#    df = pd.concat([under_sampling_legit, under_sampling_fraud])

#Ripper Miniloan sample
#if CONFIG['UNDERSAMP'] == True:
#    under_sampling_fraud = df[df[CONFIG['TARGET_LABEL']] == 1]
#    under_size = under_sampling_fraud.size
#    under_sampling_legit = (df[df[CONFIG['TARGET_LABEL']] == 0].sample(n=4000, random_state = 42))
#    df = pd.concat([under_sampling_legit, under_sampling_fraud])

In [31]:
CONFIG['POS_CLASS']

1

In [32]:
if CONFIG['TYPE'] == 'BINARY':
    print(df[CONFIG['TARGET_LABEL']].value_counts())
elif CONFIG['TYPE'] == 'CONTINUOUS':
    df[CONFIG['TARGET_LABEL']].describe()
else:
    print('Unrecognized problem type')

0    10000
1      492
Name: Class, dtype: int64


### Train, Test Split

In [33]:
if CONFIG['TRAIN_TEST_SPLIT'] == 'FIXED':
    if CONFIG['MODE'] == 'PREDICTIVE':
        train = df[df['is_test_set'] == False]
        test = df[df['is_test_set'] == True]
    elif CONFIG['MODE'] == 'DESCRIPTIVE':
        train = df
        test = df

    train = train.drop(columns=['is_test_set'])
    test = test.drop(columns=['is_test_set'])

    y_train = train[CONFIG['TARGET_LABEL']]
    x_train = train.drop(columns=[CONFIG['TARGET_LABEL']])

    y_test = test[CONFIG['TARGET_LABEL']]
    x_test = test.drop(columns=[CONFIG['TARGET_LABEL']])
else:
    x_train, x_test, y_train, y_test = train_test_split(df.drop(columns=[CONFIG['TARGET_LABEL']]), df[CONFIG['TARGET_LABEL']], test_size=CONFIG['TRAIN_TEST_SPLIT'], random_state=42)

print('Training:', x_train.shape, y_train.shape)
print('Test:', x_test.shape, y_test.shape)

Training: (7344, 31) (7344,)
Test: (3148, 31) (3148,)


### Reference Performance

In [34]:
if CONFIG['TYPE'] == 'CONTINUOUS':
    print('needs prior encoding of categoricals')
    # gbr = GradientBoostingRegressor(n_estimators=500, random_state=0)
    # gbr.fit(x_train, y_train)
    # # print('Training R^2:', r2_score(yTrain, gbr.predict(dfTrain)))
    # print('Test R^2:', r2_score(y_test, gbr.predict(x_test)))

### Binarization

In [35]:
if CONFIG['BINARIZER'] == 'TREES':
    binarizer =  FeatureBinarizerFromTrees(negations=True, randomState=42) # FeatureBinarizer(negations=False), FeatureBinarizerFromTrees(negations=True, randomState=42)
    binarizer = binarizer.fit(x_train, y_train)
    x_train_bin = binarizer.transform(x_train) #  x_train_bin = binarizer.fit_transform(x_train)
    x_test_bin = binarizer.transform(x_test) #  X_fb = self.fb.fit_transform(X_train)
elif CONFIG['BINARIZER'] == 'QUANTILE':
    binarizer =  FeatureBinarizer(numThresh=9,negations=True, randomState=42) # FeatureBinarizer(negations=False), FeatureBinarizerFromTrees(negations=True, randomState=42)
    binarizer = binarizer.fit(x_train)
    x_train_bin = binarizer.transform(x_train) #  x_train_bin = binarizer.fit_transform(x_train)
    x_test_bin = binarizer.transform(x_test) #  X_fb = self.fb.fit_transform(X_train)  
elif CONFIG['BINARIZER'] == 'NATIVE':
    x_train_bin = x_train
    x_test_bin = x_test
else:
    print('UNRECOGNIZED BINARIZER')

x_train_bin.info() # verbose=True
x_train_bin.head()
#x_train_bin[CONFIG['EXAMPLE_FEATURE']][:10]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7344 entries, 2577 to 7270
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   (Amount, <=, 161.200005)  7344 non-null   int32
 1   (Amount, >, 161.200005)   7344 non-null   int32
 2   (V10, <=, 0.700005)       7344 non-null   int32
 3   (V10, >, 0.700005)        7344 non-null   int32
 4   (V12, <=, -2.111203)      7344 non-null   int32
 5   (V12, >, -2.111203)       7344 non-null   int32
 6   (V14, <=, -3.777392)      7344 non-null   int32
 7   (V14, >, -3.777392)       7344 non-null   int32
 8   (V16, <=, 2.292556)       7344 non-null   int32
 9   (V16, >, 2.292556)        7344 non-null   int32
 10  (V17, <=, -3.042388)      7344 non-null   int32
 11  (V17, <=, 1.589249)       7344 non-null   int32
 12  (V17, >, -3.042388)       7344 non-null   int32
 13  (V17, >, 1.589249)        7344 non-null   int32
 14  (V21, <=, -0.34854)       7344 non-nu

feature,Amount,Amount,V10,V10,V12,V12,V14,V14,V16,V16,V17,V17,V17,V17,V21,V21,V23,V23,V7,V7
operation,<=,>,<=,>,<=,>,<=,>,<=,>,<=,<=,>,>,<=,>,<=,>,<=,>
value,161.200005,161.200005,0.700005,0.700005,-2.111203,-2.111203,-3.777392,-3.777392,2.292556,2.292556,-3.042388,1.589249,-3.042388,1.589249,-0.348540,-0.348540,-13.168944,-13.168944,2.272979,2.272979
2577,1,0,0,1,0,1,0,1,1,0,0,1,1,0,0,1,0,1,1,0
4588,1,0,1,0,0,1,0,1,1,0,0,1,1,0,0,1,0,1,1,0
3098,1,0,1,0,0,1,0,1,1,0,0,1,1,0,1,0,0,1,1,0
6522,1,0,1,0,0,1,0,1,1,0,0,1,1,0,0,1,0,1,1,0
8281,1,0,1,0,0,1,0,1,1,0,0,1,1,0,0,1,0,1,1,0


### Rule Induction

In [36]:
start_time = time.time()
print('Starting training for', CONFIG['ALGO'])

if CONFIG['ALGO'] == 'BRCG':
    estimator = BooleanRuleCG() # Explainer()
    # estimator.train(x_train, y_train)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        estimator.fit(x_train_bin, y_train)
elif CONFIG['ALGO'] == 'RIPPER':
    estimator = Ripper()
    estimator.fit(x_train_bin, y_train, pos_value=CONFIG['POS_CLASS'])
elif CONFIG['ALGO'] == 'Witt_RIPPER':
    estimator = lw.RIPPER()
    estimator.fit(x_train_bin, y_train,class_feat=CONFIG["TARGET_LABEL"] , pos_value=CONFIG['POS_CLASS'])
elif CONFIG['ALGO'] == 'GLRM':
    linear_model = LinearRuleRegression() # lambda0=0.0005,lambda1=0.0001
    explainer = GLRMExplainer(linear_model)
    explainer.fit(x_train_bin, y_train)
elif CONFIG['ALGO'] == 'CORELS':
    estimator = CorelsClassifier(n_iter=10000, 
                      # feautres per statement
                     c = 0.0001 # Higher values penalise longer rulelists
                    )
    estimator.fit(x_train_bin, y_train , prediction_name = CONFIG["TARGET_LABEL"])
    
else:
    print('Unrecognized algorithm:', CONFIG['ALGO'])

end_time = time.time()
print('Training time: ' + str(end_time - start_time))

Starting training for RIPPER
Training time: 0.07599902153015137


In [37]:
CONFIG['POS_CLASS']

1

### Evaluation

In [38]:
acc_list = []
prec_list = []
%store -r acc_list
%store -r prec_list

if CONFIG['TYPE'] == 'BINARY':
    y_pred = estimator.predict(x_test_bin)
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred, adjusted=True))
    print('Precision:', precision_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
    print('Recall:', recall_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
    print('F1', f1_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
    print('ConfusionMatrix', confusion_matrix(y_test, y_pred))
    print('F-2', fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2))
    print('Mathhews', matthews_corrcoef(y_test, y_pred))

    acc_list.append(recall_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
    prec_list.append(precision_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
    print('Training time: ' + str(end_time - start_time))
    %store acc_list
    %store prec_list
   
elif CONFIG['TYPE'] == 'CONTINUOUS':
    y_pred = explainer.predict(x_test_bin)
    print(f'R2 Score = {r2_score(y_test, y_pred)}')
    print(f'Explained Variance = {explained_variance_score(y_test, y_pred)}')
    print(f'Mean abs. error = {mean_absolute_error(y_test, y_pred)}')
    print(f'Max error = {max_error(y_test, y_pred)}')
    


Accuracy: 0.9548919949174078
Balanced accuracy: 0.0
Precision: 0.0
Recall: 0.0
F1 0.0
ConfusionMatrix [[3006    0]
 [ 142    0]]
F-2 0.0
Mathhews 0.0
Training time: 0.07599902153015137
Stored 'acc_list' (list)
Stored 'prec_list' (list)


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [39]:
y_test.value_counts()

0    3006
1     142
Name: Class, dtype: int64

In [40]:
CONFIG['POS_CLASS']

1

In [41]:
CONFIG['BINARIZER']

'TREES'

In [42]:
CONFIG['ALGO']

'RIPPER'

In [43]:
df[CONFIG["TARGET_LABEL"]].value_counts()

0    10000
1      492
Name: Class, dtype: int64

In [44]:
CONFIG['POS_CLASS']

1

In [45]:
präds = []
if CONFIG['TYPE'] == 'CONTINUOUS':
    explanation = explainer.explain()
    print(explanation)
elif CONFIG['ALGO'] == 'BRCG':
    model = estimator.explain()
    if not model['isCNF']:
        print('Number of rules:', len(model['rules']))
        print(model['rules'])
elif CONFIG['ALGO'] == 'RIPPER':
    
    print('Rule count: ' + str(sum([len(rules) for rules in estimator.rule_map.values()])))
    print('Rule set:')
    #print(estimator.rule_list_to_pretty_string())
    # Get predicates --> if Key Error Switch 0 to 1 or 1 to 0
    for i in range(len(estimator.rule_map[0])):
        print(len(estimator.rule_map[0][i]))
        präds.append(len(estimator.rule_map[0][i]))
    print("Sum Prädikate:", sum(präds))
elif CONFIG['ALGO'] == 'CORELS':
    r_length = len(estimator.rl().rules)
    print("Rule Length:", r_length)
    # Get predicates
    for i in range(len(estimator.rl().rules[0]["antecedents"])):
        an = len(estimator.rl().rules[i]["antecedents"])
        präds.append(len(estimator.rl().rules[i]["antecedents"]))
        print(f"Antecedents Length Rule {i}:" , an)
    print("Sum Prädikate:", sum(präds))
elif CONFIG['ALGO'] == 'Witt_RIPPER':
    print("Rule Length:", len(estimator.ruleset_))

# uncomment the following line for a full optimized view of the model as data frame for GLRM rules
# explanation.style

Rule count: 0
Rule set:


KeyError: 0

In [None]:
len(estimator.rule_map.values())


1

In [None]:
estimator.rule_map[1][i]

KeyError: 1