# Generic Rule Induction Notebook

Continuously refined.

In [1]:
#%store -z acc_list 
#%store -z prec_list 

## README

- GLRM needs to run in proper (conda) aix360 environment
- BRCG runs with proper (conda) aix360 environment
- Use aix360i environment for RIPPER

### Configuration


In [2]:
# Pick the experiment configuration that every later cell reads from CONFIG.
from config import config_dict, config_dict_imbalanced

# Alternative setup from the other dictionary:
# CONFIG = config_dict['CONFIG6']
CONFIG = config_dict_imbalanced['CONFIG-I4']

# Echo the selection so the run is identifiable from the cell output.
print('Proceed with configuration:', CONFIG['NAME'])
print(CONFIG)

Proceed with configuration: miniloan-RIPPER
{'NAME': 'miniloan-RIPPER', 'DATA_SET': '../data/miniloan-decisions-100K.csv', 'DATA_TYPES': {'creditScore': <class 'float'>, 'income': <class 'float'>, 'loanAmount': <class 'float'>, 'monthDuration': <class 'float'>, 'yearlyReimbursement': <class 'float'>}, 'DROP': ['Unnamed: 0'], 'MODE': 'PREDICTIVE', 'TRAIN_TEST_SPLIT': 0.3, 'BINARIZER': 'NATIVE', 'ALGO': 'RIPPER', 'TARGET_LABEL': 'approval', 'TYPE': 'BINARY', 'EXAMPLE_FEATURE': 'income', 'POS_CLASS': False, 'BASELINE': True, 'ONEHOT': False}


In [3]:
import pandas as pd
import numpy as np
# import os
from sklearn.model_selection import train_test_split #, GridSearchCV
from sklearn.metrics import matthews_corrcoef,fbeta_score,confusion_matrix,f1_score,precision_score, recall_score, accuracy_score, balanced_accuracy_score, confusion_matrix, r2_score, explained_variance_score, mean_absolute_error, max_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from xgboost import XGBClassifier
import time
import warnings
import re

# Import only what the active CONFIG needs: the README above notes that the aix360 and
# aix360i packages are expected to live in different conda environments.

# Feature binarizer (the 'NATIVE' setting needs no import: features are passed through as-is).
if CONFIG['BINARIZER'] == 'QUANTILE':
    from aix360.algorithms.rbm import FeatureBinarizer
elif CONFIG['BINARIZER'] == 'TREES':
    from aix360.algorithms.rbm import FeatureBinarizerFromTrees
# Rule-induction algorithm.
if CONFIG['ALGO'] == 'RIPPER':
    from aix360i.algorithms.rule_induction.ripper import Ripper
elif CONFIG['ALGO'] == 'BRCG':
    from aix360.algorithms.rbm import BooleanRuleCG # BRCGExplainer
elif CONFIG['ALGO'] == 'CORELS':
    # NOTE(review): wildcard import; the training cell uses CorelsClassifier from it.
    # Prefer an explicit import once all names used across the notebook are confirmed.
    from corels import *
elif CONFIG['ALGO'] == 'Witt_RIPPER':
    import wittgenstein as lw
elif CONFIG['ALGO'] == 'GLRM':
    from aix360.algorithms.rbm import GLRMExplainer, LinearRuleRegression


Importing dev version v0.982 of RIPPER


### Data

In [4]:

def convert(char):
    """Encode a target value as 1 if it equals CONFIG['POS_CLASS'], otherwise 0."""
    return 1 if char == CONFIG['POS_CLASS'] else 0

# Read the data set with the configured column dtypes and remove the columns listed under DROP.
df = pd.read_csv(CONFIG['DATA_SET'], dtype=CONFIG['DATA_TYPES']).drop(columns=CONFIG['DROP'])

if CONFIG['ALGO'] in ('BRCG', 'CORELS'):
    # BRCG trains for value 1 as POS_CLASS, so the target is re-encoded to 1/0 here.
    df[CONFIG['TARGET_LABEL']] = df[CONFIG['TARGET_LABEL']].map(convert)

df.info()
# Evaluated but not shown: only the last expression of a cell is displayed.
df[CONFIG['TARGET_LABEL']].value_counts()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   name                 100000 non-null  object 
 1   creditScore          100000 non-null  float64
 2   income               100000 non-null  float64
 3   loanAmount           100000 non-null  float64
 4   monthDuration        100000 non-null  float64
 5   rate                 100000 non-null  float64
 6   approval             100000 non-null  bool   
 7   yearlyReimbursement  100000 non-null  float64
dtypes: bool(1), float64(6), object(1)
memory usage: 5.4+ MB


Unnamed: 0,name,creditScore,income,loanAmount,monthDuration,rate,approval,yearlyReimbursement
0,John Doe,736.0,113243.0,783440.0,162.0,0.030005,True,70648.0
1,John Doe,527.0,224186.0,1787385.0,183.0,0.054819,True,173197.0
2,John Doe,460.0,39954.0,1733494.0,75.0,0.055183,True,328563.0
3,John Doe,751.0,219998.0,1350004.0,72.0,0.0434,True,255970.0
4,John Doe,684.0,72470.0,1210944.0,160.0,0.037772,True,115742.0


In [5]:
# Summarise the target column: class counts for a binary problem,
# descriptive statistics for a continuous one.
if CONFIG['TYPE'] == 'BINARY':
    print(df[CONFIG['TARGET_LABEL']].value_counts())
elif CONFIG['TYPE'] == 'CONTINUOUS':
    # An expression nested inside an if-block is never auto-displayed by the notebook,
    # so the summary has to be printed explicitly.
    print(df[CONFIG['TARGET_LABEL']].describe())
else:
    print('Unrecognized problem type')

True     78046
False    21954
Name: approval, dtype: int64


### Train, Test Split

In [6]:
# Build x_train / x_test / y_train / y_test.
# - TRAIN_TEST_SPLIT == 'FIXED': the split is given by the boolean 'is_test_set' column
#   (PREDICTIVE mode) or train and test are both the full data set (DESCRIPTIVE mode).
# - otherwise: TRAIN_TEST_SPLIT is passed to train_test_split as test_size, with a fixed seed.
if CONFIG['TRAIN_TEST_SPLIT'] == 'FIXED':
    if CONFIG['MODE'] == 'PREDICTIVE':
        train = df[df['is_test_set'] == False]
        test = df[df['is_test_set'] == True]
    elif CONFIG['MODE'] == 'DESCRIPTIVE':
        train = df
        test = df
    else:
        # Fail fast: otherwise `train`/`test` would be undefined, or silently reused
        # from an earlier run of this cell with a different configuration.
        raise ValueError(f"Unrecognized MODE for a FIXED split: {CONFIG['MODE']!r}")

    # The split marker is bookkeeping, not a feature.
    train = train.drop(columns=['is_test_set'])
    test = test.drop(columns=['is_test_set'])

    y_train = train[CONFIG['TARGET_LABEL']]
    x_train = train.drop(columns=[CONFIG['TARGET_LABEL']])

    y_test = test[CONFIG['TARGET_LABEL']]
    x_test = test.drop(columns=[CONFIG['TARGET_LABEL']])
else:
    x_train, x_test, y_train, y_test = train_test_split(
        df.drop(columns=[CONFIG['TARGET_LABEL']]),
        df[CONFIG['TARGET_LABEL']],
        test_size=CONFIG['TRAIN_TEST_SPLIT'],
        random_state=42,
    )

print('Training:', x_train.shape, y_train.shape)
print('Test:', x_test.shape, y_test.shape)

Training: (70000, 7) (70000,)
Test: (30000, 7) (30000,)


### Reference Performance

In [7]:
# Reference performance: an XGBoost classifier trained on label-encoded copies of the
# features, as a baseline to compare the induced rule sets against.
if CONFIG['TYPE'] == 'CONTINUOUS':
    print('needs prior encoding of categoricals')
    # gbr = GradientBoostingRegressor(n_estimators=500, random_state=0)
    # gbr.fit(x_train, y_train)
    # # print('Training R^2:', r2_score(yTrain, gbr.predict(dfTrain)))
    # print('Test R^2:', r2_score(y_test, gbr.predict(x_test)))
elif CONFIG['TYPE'] == 'BINARY':
    x_train_cp = x_train.copy()
    x_test_cp = x_test.copy()

    if CONFIG['ALGO'] in ('BRCG', 'CORELS'):
        # The data-loading cell has already encoded the target as 1/0 for these algorithms;
        # mapping through convert() a second time would flip or zero out the labels
        # whenever POS_CLASS is not True/1.
        y_train_cp = y_train.copy()
        y_test_cp = y_test.copy()
    else:
        # use manual encoding to make sure that pos_value = 1
        y_train_cp = y_train.map(convert)
        y_test_cp = y_test.map(convert)

    # Take the categorical columns from the feature frame, not from df: df still contains
    # the target column, which is absent from x_train / x_test.
    categorical_features = x_train.select_dtypes(include=['object']).columns
    # numerical_feat = df.select_dtypes(include=['int64', 'float64']).columns
    for col in categorical_features:
        # Fit on the full column so categories that occur only in the test split are known.
        label_encoder = LabelEncoder().fit(df[col])
        x_train_cp[col] = label_encoder.transform(x_train_cp[col])
        x_test_cp[col] = label_encoder.transform(x_test_cp[col])

    xgb_model = XGBClassifier(use_label_encoder=False)
    xgb_model.fit(x_train_cp, y_train_cp)
    y_pred = xgb_model.predict(x_test_cp)

    print('Accuracy:', accuracy_score(y_test_cp, y_pred))
    print('Balanced accuracy:', balanced_accuracy_score(y_test_cp, y_pred))
    print('Precision:', precision_score(y_test_cp, y_pred, pos_label=1))
    print('Recall:', recall_score(y_test_cp, y_pred, pos_label=1))
    # print(xgb_model)



Accuracy: 0.9982
Balanced accuracy: 0.997365266751356
Precision: 0.9958822632301357
Recall: 0.9958822632301357


### Binarization

In [8]:
# Produce x_train_bin / x_test_bin according to CONFIG['BINARIZER'].
# The binarizer is always fitted on the training split only and then applied to both splits.
if CONFIG['BINARIZER'] == 'TREES':
    binarizer = FeatureBinarizerFromTrees(negations=False, randomState=42)
    binarizer = binarizer.fit(x_train, y_train)
    x_train_bin = binarizer.transform(x_train)
    x_test_bin = binarizer.transform(x_test)
elif CONFIG['BINARIZER'] == 'QUANTILE':
    binarizer = FeatureBinarizer(numThresh=9, negations=False)
    binarizer = binarizer.fit(x_train)
    x_train_bin = binarizer.transform(x_train)
    x_test_bin = binarizer.transform(x_test)
elif CONFIG['BINARIZER'] == 'NATIVE':
    # No binarization: the features are passed through unchanged.
    x_train_bin = x_train
    x_test_bin = x_test
else:
    # Fail fast: continuing would hit an undefined x_train_bin, or silently reuse the
    # one left in the kernel by an earlier run with a different configuration.
    raise ValueError(f"UNRECOGNIZED BINARIZER: {CONFIG['BINARIZER']!r}")

x_train_bin.info() # verbose=True
x_train_bin.head()
#x_train_bin[CONFIG['EXAMPLE_FEATURE']][:10]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 70000 entries, 76513 to 15795
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   name                 70000 non-null  object 
 1   creditScore          70000 non-null  float64
 2   income               70000 non-null  float64
 3   loanAmount           70000 non-null  float64
 4   monthDuration        70000 non-null  float64
 5   rate                 70000 non-null  float64
 6   yearlyReimbursement  70000 non-null  float64
dtypes: float64(6), object(1)
memory usage: 4.3+ MB


Unnamed: 0,name,creditScore,income,loanAmount,monthDuration,rate,yearlyReimbursement
76513,John Doe,323.0,142660.0,1371289.0,155.0,0.057102,150327.0
60406,John Doe,781.0,138610.0,88183.0,286.0,0.049289,6296.0
27322,John Doe,458.0,190396.0,1073466.0,150.0,0.057622,120683.0
53699,John Doe,569.0,36948.0,817845.0,359.0,0.050611,53114.0
65412,John Doe,633.0,60520.0,1113649.0,289.0,0.037175,70057.0


### Rule Induction

In [9]:
# Train the rule learner selected by CONFIG['ALGO'] on the (optionally binarized) training
# data and report the wall-clock training time.
# GLRM stores its model in `explainer`; every other algorithm stores it in `estimator`.
start_time = time.time()
print('Starting training for', CONFIG['ALGO'])

if CONFIG['ALGO'] == 'BRCG':
    estimator = BooleanRuleCG() # Explainer()
    # estimator.train(x_train, y_train)
    # Warnings raised during fitting are suppressed to keep the cell output readable.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        estimator.fit(x_train_bin, y_train)
elif CONFIG['ALGO'] == 'RIPPER':
    estimator = Ripper()
    estimator.fit(x_train_bin, y_train, pos_value=CONFIG['POS_CLASS'])
elif CONFIG['ALGO'] == 'Witt_RIPPER':
    estimator = lw.RIPPER()
    # wittgenstein names this argument `pos_class` (not `pos_value` as in aix360i's Ripper).
    estimator.fit(x_train_bin, y_train, class_feat=CONFIG["TARGET_LABEL"], pos_class=CONFIG['POS_CLASS'])
elif CONFIG['ALGO'] == 'GLRM':
    linear_model = LinearRuleRegression() # lambda0=0.0005,lambda1=0.0001
    explainer = GLRMExplainer(linear_model)
    explainer.fit(x_train_bin, y_train)
elif CONFIG['ALGO'] == 'CORELS':
    estimator = CorelsClassifier(n_iter=10000,
                     max_card=2, # features per statement
                     c = 0.0001 # Higher values penalise longer rulelists
                    )
    estimator.fit(x_train_bin, y_train , prediction_name = CONFIG["TARGET_LABEL"])
else:
    # Fail fast: the evaluation cells would otherwise use a model left over from an
    # earlier run (or fail on an undefined name).
    raise ValueError(f"Unrecognized algorithm: {CONFIG['ALGO']!r}")

end_time = time.time()
print('Training time: ' + str(end_time - start_time))

Starting training for RIPPER
set to True
Training time: 72.63830399513245


### Evaluation

In [10]:
acc_list = []
prec_list = []
%store -r acc_list
%store -r prec_list

if CONFIG['TYPE'] == 'BINARY':
    y_pred = estimator.predict(x_test_bin)
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
    print('Recall:', recall_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
    print('F1', f1_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
    print('ConfusionMatrix', confusion_matrix(y_test, y_pred))
    print('F-2', fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2))
    print('Mathhews', matthews_corrcoef(y_test, y_pred))

    acc_list.append(recall_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
    prec_list.append(precision_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
    %store acc_list
    %store prec_list
   
elif CONFIG['TYPE'] == 'CONTINUOUS':
    y_pred = explainer.predict(x_test_bin)
    print(f'R2 Score = {r2_score(y_test, y_pred)}')
    print(f'Explained Variance = {explained_variance_score(y_test, y_pred)}')
    print(f'Mean abs. error = {mean_absolute_error(y_test, y_pred)}')
    print(f'Max error = {max_error(y_test, y_pred)}')
    


Accuracy: 0.9954333333333333
Balanced accuracy: 0.991695265503403
Precision: 0.9939981532779316
Recall: 0.9850541406130853
F1 0.9895059364228265
ConfusionMatrix [[ 6459    98]
 [   39 23404]]
F-2 0.9868300433905763
Mathhews 0.9866041622720696
Stored 'acc_list' (list)
Stored 'prec_list' (list)


In [11]:
# Print the learned model in the form each algorithm exposes it.
if CONFIG['TYPE'] == 'CONTINUOUS':
    explanation = explainer.explain()
    print(explanation)
elif CONFIG['ALGO'] == 'BRCG':
    model = estimator.explain()
    # Rules are only listed when the model is not flagged as CNF.
    if not model['isCNF']:
        print('Number of rules:', len(model['rules']))
        print(model['rules'])
elif CONFIG['ALGO'] == 'RIPPER':
    print('Rule count: ' + str(sum(len(rules) for rules in estimator.rule_map.values())))
    print('Rule set:')
    print(estimator.rule_list_to_pretty_string())
elif CONFIG['ALGO'] == 'CORELS':
    corels_rules = estimator.rl().rules
    print("Rule Length:", len(corels_rules))
    # Report every rule: the loop must run over the rules themselves, not over the
    # antecedents of the first rule.
    for i, rule in enumerate(corels_rules):
        print(f"Antecedents Length Rule {i}:" , len(rule["antecedents"]))
elif CONFIG['ALGO'] == 'Witt_RIPPER':
    print("Rule Length:", len(estimator.ruleset_))

# uncomment the following line for a full optimized view of the model as data frame for GLRM rules
# explanation.style

Rule count: 30
Rule set:

if {
	(yearlyReimbursement <= 41958.0 and loanAmount <= 525036.0 and income >= 133066.0) or
	(income >= 154544.0 and loanAmount <= 786122.0 and yearlyReimbursement <= 47581.0) or
	(loanAmount <= 220052.0 and income >= 54242.0 and yearlyReimbursement <= 17756.0) or
	(loanAmount <= 990363.0 and income >= 219984.0 and yearlyReimbursement <= 66944.0) or
	(income >= 95850.0 and loanAmount <= 518574.0 and yearlyReimbursement <= 30080.0) or
	(yearlyReimbursement <= 49096.0 and loanAmount <= 842450.0 and income >= 160598.0) or
	(loanAmount <= 161856.0 and yearlyReimbursement <= 11875.0 and income >= 33849.0) or
	(loanAmount <= 995714.0 and income >= 261448.0 and yearlyReimbursement <= 79885.0) or
	(income >= 84388.0 and loanAmount <= 959464.0 and yearlyReimbursement <= 27325.0 and rate >= 0.03055418433499172) or
	(loanAmount <= 996392.0 and yearlyReimbursement <= 60519.0 and income >= 197486.0) or
	(income >= 114246.0 and monthDuration >= 35.0 and yearlyReimbursement 