# Generic Rule Induction Notebook

Continuoulsy refined.

## README

- GLRM needs to run in proper (conda) aix360 environment
- BRCG runs with proper (conda) aix360 environment
- Use aix360i environment for RIPPER

### Configuration


In [1]:
from data_configs import CONFIG_DICT_IMBALANCED


CONFIG = CONFIG_DICT_IMBALANCED['CONFIG-I3']
print('Proceed with configuration:', CONFIG['NAME'])
print(CONFIG)

PIPELINE = ('NATIVE','R2N') # ('XGBPREP','XGBOOST'),('TREES','BRCG'), ('QUANTILE','BRCG'), ('TREES','RIPPER'), ('QUANTILE','RIPPER'), ('NATIVE','RIPPER'), ('TREES','CORELS'), ('QUANTILE','CORELS')
(bina, algo) = PIPELINE

TRAIN_TEST_SPLIT =  0.3

Proceed with configuration: taiwan_binary
{'NAME': 'taiwan_binary', 'DATA_SET': '../data/TaiwanCreditData.csv', 'DATA_TYPES': {'Amount': <class 'float'>, 'Age': <class 'float'>, 'Bill_Sep': <class 'float'>, 'Bill_Aug': <class 'float'>, 'Bill_Jul': <class 'float'>, 'Bill_Jun': <class 'float'>, 'Bill_May': <class 'float'>, 'Bill_Apr': <class 'float'>, 'PayAmount_Sep': <class 'float'>, 'PayAmount_Aug': <class 'float'>, 'PayAmount_Jul': <class 'float'>, 'PayAmount_Jun': <class 'float'>, 'PayAmount_May': <class 'float'>, 'PayAmount_Apr': <class 'float'>}, 'DROP': ['Probabilities'], 'TARGET_LABEL': 'DefaultNextMonth', 'TYPE': 'BINARY', 'EXAMPLE_FEATURE': 'PayAmount_Apr', 'POS_CLASS': 1, 'META_DATA': {'use_case': 'credit_approval', 'flag': 'organic'}}


In [2]:
import pandas as pd
import numpy as np
# import os
from sklearn.model_selection import train_test_split #, GridSearchCV
from sklearn.metrics import matthews_corrcoef,fbeta_score,confusion_matrix,f1_score,precision_score, recall_score, accuracy_score, balanced_accuracy_score, confusion_matrix, r2_score, explained_variance_score, mean_absolute_error, max_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from xgboost import XGBClassifier
import time
import warnings
import re

if bina == 'QUANTILE':
    from aix360.algorithms.rbm import FeatureBinarizer
elif bina == 'TREES':
    from aix360.algorithms.rbm import FeatureBinarizerFromTrees
if algo == 'RIPPER':
    from aix360i.algorithms.rule_induction.ripper import Ripper
elif algo == 'BRCG':
    # from aix360.algorithms.rbm import BooleanRuleCG # BRCGExplainer
    from aix360i.algorithms.rule_induction.rbm.boolean_rule_cg import BooleanRuleCG as BRCG
    # from aix360i.algorithms.rule_induction.rbm import BRCGplus
elif algo == 'CORELS':
    from corels import *
elif algo == 'R2N':
    from aix360i.algorithms.rule_induction.r2n.r2n_algo import R2Nalgo
elif algo == 'Witt_RIPPER':
    import wittgenstein as lw
elif algo == 'GLRM':
    from aix360.algorithms.rbm import GLRMExplainer, LinearRuleRegression


### Data

In [3]:
def convert(char):
    if char == CONFIG['POS_CLASS']:
        return 1
    else:
        return 0

df = pd.read_csv(CONFIG['DATA_SET'],dtype=CONFIG['DATA_TYPES'])
print('Read', len(df), 'rows from', CONFIG['DATA_SET'])
df = df.drop(columns=CONFIG['DROP'])

# Preprocessing: normalizing data for specific algorithms
if algo in ('BRCG', 'XGBOOST', 'CORELS', 'R2N'): 
    df[CONFIG['TARGET_LABEL']] = df[CONFIG['TARGET_LABEL']].map(convert)
    POS_CLASS = 1
else:
    POS_CLASS = CONFIG['POS_CLASS']
 
df.info()
df.head()

Read 30000 rows from ../data/TaiwanCreditData.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Amount            30000 non-null  float64
 1   Sex               30000 non-null  object 
 2   Education         30000 non-null  object 
 3   MaritalStatus     30000 non-null  object 
 4   Age               30000 non-null  float64
 5   PayStatus_Sep     30000 non-null  object 
 6   PayStatus_Aug     30000 non-null  object 
 7   PayStatus_Jul     30000 non-null  object 
 8   PayStatus_Jun     30000 non-null  object 
 9   PayStatus_May     30000 non-null  object 
 10  PayStatus_Apr     30000 non-null  object 
 11  Bill_Sep          30000 non-null  float64
 12  Bill_Aug          30000 non-null  float64
 13  Bill_Jul          30000 non-null  float64
 14  Bill_Jun          30000 non-null  float64
 15  Bill_May          30000 non-null  flo

Unnamed: 0,Amount,Sex,Education,MaritalStatus,Age,PayStatus_Sep,PayStatus_Aug,PayStatus_Jul,PayStatus_Jun,PayStatus_May,...,Bill_Jun,Bill_May,Bill_Apr,PayAmount_Sep,PayAmount_Aug,PayAmount_Jul,PayAmount_Jun,PayAmount_May,PayAmount_Apr,DefaultNextMonth
0,20000.0,F,Univ,Married,24.0,Two,Two,Clear,Clear,Unk,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,F,Univ,Single,26.0,Clear,Two,Zero,Zero,Zero,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,F,Univ,Single,34.0,Zero,Zero,Zero,Zero,Zero,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,F,Univ,Married,37.0,Zero,Zero,Zero,Zero,Zero,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,M,Univ,Married,57.0,Clear,Zero,Clear,Zero,Zero,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [4]:
if CONFIG['TYPE'] == 'BINARY':
    target_dist = df[CONFIG['TARGET_LABEL']].value_counts()
    print(target_dist)
    print('Pos label occurrence:', target_dist[POS_CLASS]) 
elif CONFIG['TYPE'] == 'CONTINUOUS':
    df[CONFIG['TARGET_LABEL']].describe()
else:
    print('Unrecognized problem type')

0    23364
1     6636
Name: DefaultNextMonth, dtype: int64
Pos label occurrence: 6636


### Train, Test Split

In [5]:
if TRAIN_TEST_SPLIT == 'FIXED':
    if CONFIG['MODE'] == 'PREDICTIVE':
        train = df[df['is_test_set'] == False]
        test = df[df['is_test_set'] == True]
    elif CONFIG['MODE'] == 'DESCRIPTIVE':
        train = df
        test = df

    train = train.drop(columns=['is_test_set'])
    test = test.drop(columns=['is_test_set'])

    y_train = train[CONFIG['TARGET_LABEL']]
    x_train = train.drop(columns=[CONFIG['TARGET_LABEL']])

    y_test = test[CONFIG['TARGET_LABEL']]
    x_test = test.drop(columns=[CONFIG['TARGET_LABEL']])
else:
    x_train, x_test, y_train, y_test = train_test_split(df.drop(columns=[CONFIG['TARGET_LABEL']]), df[CONFIG['TARGET_LABEL']], test_size=TRAIN_TEST_SPLIT, random_state=42)

print('Training:', x_train.shape, y_train.shape)
print('Test:', x_test.shape, y_test.shape)

Training: (21000, 23) (21000,)
Test: (9000, 23) (9000,)


### Reference Performance

In [6]:
if CONFIG['TYPE'] == 'CONTINUOUS':
    print('needs prior encoding of categoricals')
    # gbr = GradientBoostingRegressor(n_estimators=500, random_state=0)
    # gbr.fit(x_train, y_train)
    # # print('Training R^2:', r2_score(yTrain, gbr.predict(dfTrain)))
    # print('Test R^2:', r2_score(y_test, gbr.predict(x_test)))
elif CONFIG['TYPE'] == 'BINARY':
    x_train_cp = x_train.copy()
    y_train_cp = y_train.copy()
    y_train_cp = y_train.map(convert)  # use manual encoding to make sure that pos_value = 1
    x_test_cp = x_test.copy()
    y_test_cp = y_test.copy()
    y_test_cp = y_test.map(convert)  # use manual encoding to make sure that pos_value = 1
    categorical_features = x_train_cp.select_dtypes(include=['object']).columns
    print(categorical_features)
    # numerical_feat = df.select_dtypes(include=['int64', 'float64']).columns
    for col in categorical_features:
        label_encoder = LabelEncoder()
        label_encoder = label_encoder.fit(df[col])
        x_train_cp[col] = label_encoder.transform(x_train_cp[col])
        x_test_cp[col] = label_encoder.transform(x_test_cp[col])
    xgb_model = XGBClassifier(use_label_encoder=False)
    xgb_model.fit(x_train_cp, y_train_cp)
    y_pred = xgb_model.predict(x_test_cp)

    print('Accuracy:', accuracy_score(y_test_cp, y_pred))
    print('Balanced accuracy:', balanced_accuracy_score(y_test_cp, y_pred))
    print('Precision:', precision_score(y_test_cp, y_pred, pos_label=1))
    print('Recall:', recall_score(y_test_cp, y_pred, pos_label=1))
    # print(xgb_model)



Index(['Sex', 'Education', 'MaritalStatus', 'PayStatus_Sep', 'PayStatus_Aug',
       'PayStatus_Jul', 'PayStatus_Jun', 'PayStatus_May', 'PayStatus_Apr'],
      dtype='object')
Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Accuracy: 0.8097777777777778
Balanced accuracy: 0.6461009972170686
Precision: 0.60801393728223
Recall: 0.35612244897959183


### Binarization

In [7]:
if bina == 'TREES':
    binarizer =  FeatureBinarizerFromTrees(negations=True, randomState=42) # FeatureBinarizer(negations=False), FeatureBinarizerFromTrees(negations=True, randomState=42)
    binarizer = binarizer.fit(x_train, y_train)
    x_train_bin = binarizer.transform(x_train) #  x_train_bin = binarizer.fit_transform(x_train)
    x_test_bin = binarizer.transform(x_test) #  X_fb = self.fb.fit_transform(X_train)
elif bina == 'QUANTILE':
    binarizer =  FeatureBinarizer(numThresh=9,negations=True) # FeatureBinarizer(negations=False), FeatureBinarizerFromTrees(negations=True, randomState=42)
    binarizer = binarizer.fit(x_train)
    x_train_bin = binarizer.transform(x_train) #  x_train_bin = binarizer.fit_transform(x_train)
    x_test_bin = binarizer.transform(x_test) #  X_fb = self.fb.fit_transform(X_train)  
elif bina == 'NATIVE':
    x_train_bin = x_train
    x_test_bin = x_test
else:
    print('UNRECOGNIZED BINARIZER')

x_train_bin.info() # verbose=True
x_train_bin.head()
#x_train_bin[CONFIG['EXAMPLE_FEATURE']][:10]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 21000 entries, 28465 to 23654
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Amount         21000 non-null  float64
 1   Sex            21000 non-null  object 
 2   Education      21000 non-null  object 
 3   MaritalStatus  21000 non-null  object 
 4   Age            21000 non-null  float64
 5   PayStatus_Sep  21000 non-null  object 
 6   PayStatus_Aug  21000 non-null  object 
 7   PayStatus_Jul  21000 non-null  object 
 8   PayStatus_Jun  21000 non-null  object 
 9   PayStatus_May  21000 non-null  object 
 10  PayStatus_Apr  21000 non-null  object 
 11  Bill_Sep       21000 non-null  float64
 12  Bill_Aug       21000 non-null  float64
 13  Bill_Jul       21000 non-null  float64
 14  Bill_Jun       21000 non-null  float64
 15  Bill_May       21000 non-null  float64
 16  Bill_Apr       21000 non-null  float64
 17  PayAmount_Sep  21000 non-null  float64
 18  Pa

Unnamed: 0,Amount,Sex,Education,MaritalStatus,Age,PayStatus_Sep,PayStatus_Aug,PayStatus_Jul,PayStatus_Jun,PayStatus_May,...,Bill_Jul,Bill_Jun,Bill_May,Bill_Apr,PayAmount_Sep,PayAmount_Aug,PayAmount_Jul,PayAmount_Jun,PayAmount_May,PayAmount_Apr
28465,240000.0,F,Grad,Married,40.0,Unk,Unk,Unk,Unk,Unk,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27622,50000.0,F,Grad,Single,23.0,Clear,Clear,Clear,Clear,Clear,...,2299.0,4800.0,9810.0,660.0,2548.0,2321.0,4800.0,9810.0,660.0,2980.0
28376,50000.0,F,Univ,Married,36.0,Two,Two,Two,Two,Zero,...,49125.0,47956.0,43578.0,35126.0,0.0,4700.0,0.0,2004.0,3500.0,0.0
10917,200000.0,F,HS,Married,54.0,Six,Five,Four,Three,Two,...,104686.0,102549.0,101400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27234,240000.0,M,Grad,Married,35.0,Clear,Clear,Clear,Zero,Clear,...,21790.0,17102.0,13367.0,22659.0,2017.0,21817.0,1120.0,13434.0,22772.0,22820.0


In [8]:
if bina in ['TREES', 'QUANTILE'] and algo == 'RIPPER':
    # RIPPER cannot process multi-index produced by these binarizers, hence flatten multi-index
    x_train_bin = pd.DataFrame(x_train_bin.to_records())
    x_test_bin = pd.DataFrame(x_test_bin.to_records())
    x_train_bin = x_train_bin.drop("index", axis = 1)
    x_test_bin = x_test_bin.drop("index", axis = 1)
    x_train_bin.columns = pd.Index(np.arange(1,len(x_train_bin.columns)+1).astype(str))
    x_test_bin.columns = pd.Index(np.arange(1,len(x_test_bin.columns)+1).astype(str))

### Rule Induction

In [9]:
start_time = time.time()
print('Starting training for', algo)

if algo == 'BRCG':
    estimator = BRCG(silent=True)
    estimator.fit(x_train_bin, y_train)
elif algo == 'RIPPER':
    estimator = Ripper()
    estimator.fit(x_train_bin, y_train, pos_value=POS_CLASS)
elif algo == 'Witt_RIPPER':
    estimator = lw.RIPPER()
    estimator.fit(x_train_bin, y_train,class_feat=CONFIG["TARGET_LABEL"] , pos_value=POS_CLASS)
elif algo == 'GLRM':
    linear_model = LinearRuleRegression() # lambda0=0.0005,lambda1=0.0001
    explainer = GLRMExplainer(linear_model)
    explainer.fit(x_train_bin, y_train)
elif algo == 'CORELS':
    estimator = CorelsClassifier(n_iter=10000, 
                     max_card=2, # feautres per statement
                     c = 0.0001 # Higher values penalise longer rulelists
                    )
    estimator.fit(x_train_bin, y_train , prediction_name = CONFIG["TARGET_LABEL"])
elif algo == 'R2N':
    # estimator = algo.R2Nalgo(n_seeds=3, max_epochs=100, decay_rate=0.998, coef = 10**-3, normalize_num=False)
    estimator = algo.R2Nalgo(n_seeds=2, max_epochs=5*10**2, min_temp = 10**-4, decay_rate=0.98, coef = 5*10**-4, normalize_num=True,negation=False)
    try:
        estimator.fit(x_train_bin, y_train) 
    except Exception:
        exception_caught = True
        print(Exception)
else:
    print('Unrecognized algorithm:', algo)

end_time = time.time()
print('Training time: ' + str(end_time - start_time))

Starting training for <module 'aix360i.algorithms.rule_induction.r2n.r2n_algo' from '/Users/hvo/Development/vsc/python/benchmarking/aix360i/aix360i/aix360i/algorithms/rule_induction/r2n/r2n_algo.py'>
Unrecognized algorithm: <module 'aix360i.algorithms.rule_induction.r2n.r2n_algo' from '/Users/hvo/Development/vsc/python/benchmarking/aix360i/aix360i/aix360i/algorithms/rule_induction/r2n/r2n_algo.py'>
Training time: 0.0017390251159667969


### Evaluation

In [10]:
acc_list = []
prec_list = []
%store -r acc_list
%store -r prec_list

if CONFIG['TYPE'] == 'BINARY':
    y_pred = estimator.predict(x_test_bin)
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred, pos_label=POS_CLASS))
    print('Recall:', recall_score(y_test, y_pred, pos_label=POS_CLASS))
    print('F1', f1_score(y_test, y_pred, pos_label=POS_CLASS))
    print('ConfusionMatrix', confusion_matrix(y_test, y_pred))
    print('F-2', fbeta_score(y_test, y_pred, pos_label=POS_CLASS, beta= 2))
    print('Mathhews', matthews_corrcoef(y_test, y_pred))


    acc_list.append(recall_score(y_test, y_pred, pos_label=POS_CLASS))
    prec_list.append(precision_score(y_test, y_pred, pos_label=POS_CLASS))
    %store acc_list
    %store prec_list
   
elif CONFIG['TYPE'] == 'CONTINUOUS':
    y_pred = explainer.predict(x_test_bin)
    print(f'R2 Score = {r2_score(y_test, y_pred)}')
    print(f'Explained Variance = {explained_variance_score(y_test, y_pred)}')
    print(f'Mean abs. error = {mean_absolute_error(y_test, y_pred)}')
    print(f'Max error = {max_error(y_test, y_pred)}')
    


NameError: name 'estimator' is not defined

In [None]:
POS_CLASS

In [None]:
if CONFIG['TYPE'] == 'CONTINUOUS':
    explanation = explainer.explain()
    print(explanation)
elif algo == 'BRCG':
    # model = estimator.explain()
    # if not model['isCNF']:
    #     print('Number of rules:', len(model['rules']))
    #     print(model['rules'])
    # BRCG trxf export
    rule_set = estimator.explain()
    print(rule_set.conjunctions)
elif algo == 'RIPPER':
    print('Rule count: ' + str(sum([len(rules) for rules in estimator.rule_map.values()])))
    print('Rule set:')
    print(estimator.rule_list_to_pretty_string())

    rule_set_list = []
    rule_set = estimator.export_rules_to_trxf_dnf_ruleset(POS_CLASS)
    conjunctions = rule_set.conjunctions
    for c in conjunctions:
        conjunction_dict = {}
        predicates = c.predicates
        for p in predicates:
            name = str(p.feature) + str(p.relation)
            value = p.value
            conjunction_dict[name] = value
        rule_set_list.append(conjunction_dict)
    print(rule_set_list)
    rule_set_df = pd.DataFrame(rule_set_list)
    print(rule_set_df.head())
elif algo == 'CORELS':
    r_length = len(estimator.rl().rules)
    print("Rule Length:", r_length)
    for i in range(len(estimator.rl().rules[0]["antecedents"])):
        an = len(estimator.rl().rules[i]["antecedents"])
        print(f"Antecedents Length Rule {i}:" , an)
elif algo == 'Witt_RIPPER':
    print("Rule Length:", len(estimator.ruleset_))

# uncomment the following line for a full optimized view of the model as data frame for GLRM rules
# explanation.style