# Generic Rule Induction Notebook

Continuoulsy refined.

## README

- GLRM needs to run in proper (conda) aix360 environment
- BRCG runs with proper (conda) aix360 environment
- Use aix360i environment for RIPPER

### Configuration


In [3]:
from config import config_dict


CONFIG = config_dict["CONFIG12"]
print('Proceed with configuration:', CONFIG["NAME"])

Proceed with configuration: taiwan_binary


In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
# import os
from sklearn.model_selection import train_test_split #, GridSearchCV
from sklearn.metrics import fbeta_score,confusion_matrix,f1_score,precision_score, recall_score, accuracy_score, balanced_accuracy_score, confusion_matrix, r2_score, explained_variance_score, mean_absolute_error, max_error
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import time
import warnings
import re

from aix360.algorithms.rbm import BRCGExplainer, BooleanRuleCG, GLRMExplainer, LinearRuleRegression
if CONFIG['BINARIZER'] == 'QUANTILE':
    from aix360.algorithms.rbm import FeatureBinarizer
elif CONFIG['BINARIZER'] == 'TREES':
    from aix360.algorithms.rbm import FeatureBinarizerFromTrees
if CONFIG['ALGO'] == 'RIPPER':
    from aix360i.algorithms.rule_induction.ripper import Ripper
if CONFIG['ALGO'] == 'CORELS':
    from corels import *





# from explainer import Explainer

# TODO create reference for performance using boosted trees

# import wittgenstein as lw
# from clf_utils import make_tree_dataset, make_forest, score_forest
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder


Importing dev version v0.981 of RIPPER


### Data

In [5]:
def convert(char):
    if char == CONFIG['POS_CLASS']:
        return 1
    else:
        return 0

df = pd.read_csv(CONFIG['DATA_SET'],dtype=CONFIG['DATA_TYPES'])
df = df.drop(columns=CONFIG['DROP'])
if CONFIG['ALGO'] == 'BRCG' or CONFIG['ALGO'] == 'CORELS' :
    df[CONFIG['TARGET_LABEL']] = df[CONFIG['TARGET_LABEL']].map(convert)
    CONFIG['POS_CLASS'] = 1 
    # maybe this could also be achieved through explicit binarization of target vector
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Amount            30000 non-null  int64 
 1   Sex               30000 non-null  object
 2   Education         30000 non-null  object
 3   MaritalStatus     30000 non-null  object
 4   Age               30000 non-null  int64 
 5   PayStatus_Sep     30000 non-null  object
 6   PayStatus_Aug     30000 non-null  object
 7   PayStatus_Jul     30000 non-null  object
 8   PayStatus_Jun     30000 non-null  object
 9   PayStatus_May     30000 non-null  object
 10  PayStatus_Apr     30000 non-null  object
 11  Bill_Sep          30000 non-null  int64 
 12  Bill_Aug          30000 non-null  int64 
 13  Bill_Jul          30000 non-null  int64 
 14  Bill_Jun          30000 non-null  int64 
 15  Bill_May          30000 non-null  int64 
 16  Bill_Apr          30000 non-null  int64 
 17  PayAmount_Se

Unnamed: 0,Amount,Sex,Education,MaritalStatus,Age,PayStatus_Sep,PayStatus_Aug,PayStatus_Jul,PayStatus_Jun,PayStatus_May,...,Bill_Jun,Bill_May,Bill_Apr,PayAmount_Sep,PayAmount_Aug,PayAmount_Jul,PayAmount_Jun,PayAmount_May,PayAmount_Apr,DefaultNextMonth
0,20000,F,Univ,Married,24,Two,Two,Clear,Clear,Unk,...,0,0,0,0,689,0,0,0,0,1
1,120000,F,Univ,Single,26,Clear,Two,Zero,Zero,Zero,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,F,Univ,Single,34,Zero,Zero,Zero,Zero,Zero,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,F,Univ,Married,37,Zero,Zero,Zero,Zero,Zero,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,M,Univ,Married,57,Clear,Zero,Clear,Zero,Zero,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [6]:
if CONFIG['TYPE'] == 'BINARY':
    print(df[CONFIG['TARGET_LABEL']].value_counts())
elif CONFIG['TYPE'] == 'CONTINUOUS':
    df[CONFIG['TARGET_LABEL']].describe()
else:
    print('Unrecognized problem type')

0    23364
1     6636
Name: DefaultNextMonth, dtype: int64


### Train, Test Split

In [7]:
if CONFIG['TRAIN_TEST_SPLIT'] == 'FIXED':
    if CONFIG['MODE'] == 'PREDICTIVE':
        train = df[df['is_test_set'] == False]
        test = df[df['is_test_set'] == True]
    elif CONFIG['MODE'] == 'DESCRIPTIVE':
        train = df
        test = df

    train = train.drop(columns=['is_test_set'])
    test = test.drop(columns=['is_test_set'])

    y_train = train[CONFIG['TARGET_LABEL']]
    x_train = train.drop(columns=[CONFIG['TARGET_LABEL']])

    y_test = test[CONFIG['TARGET_LABEL']]
    x_test = test.drop(columns=[CONFIG['TARGET_LABEL']])
else:
    x_train, x_test, y_train, y_test = train_test_split(df.drop(columns=[CONFIG['TARGET_LABEL']]), df[CONFIG['TARGET_LABEL']], test_size=CONFIG['TRAIN_TEST_SPLIT'], random_state=42)

print('Training:', x_train.shape, y_train.shape)
print('Test:', x_test.shape, y_test.shape)

Training: (21000, 23) (21000,)
Test: (9000, 23) (9000,)


### Reference Performance

In [8]:
if CONFIG['TYPE'] == 'CONTINUOUS':
    print('needs prior encoding of categoricals')
    # gbr = GradientBoostingRegressor(n_estimators=500, random_state=0)
    # gbr.fit(x_train, y_train)
    # # print('Training R^2:', r2_score(yTrain, gbr.predict(dfTrain)))
    # print('Test R^2:', r2_score(y_test, gbr.predict(x_test)))

### Binarization

In [9]:
if CONFIG['BINARIZER'] == 'TREES':
    binarizer =  FeatureBinarizerFromTrees(negations=True, randomState=42) # FeatureBinarizer(negations=False), FeatureBinarizerFromTrees(negations=True, randomState=42)
    binarizer = binarizer.fit(x_train, y_train)
    x_train_bin = binarizer.transform(x_train) #  x_train_bin = binarizer.fit_transform(x_train)
    x_test_bin = binarizer.transform(x_test) #  X_fb = self.fb.fit_transform(X_train)
elif CONFIG['BINARIZER'] == 'QUANTILE':
    binarizer =  FeatureBinarizer(numThresh=9,negations=True) # FeatureBinarizer(negations=False), FeatureBinarizerFromTrees(negations=True, randomState=42)
    binarizer = binarizer.fit(x_train)
    x_train_bin = binarizer.transform(x_train) #  x_train_bin = binarizer.fit_transform(x_train)
    x_test_bin = binarizer.transform(x_test) #  X_fb = self.fb.fit_transform(X_train)  
elif CONFIG['BINARIZER'] == 'NATIVE':
    x_train_bin = x_train
    x_test_bin = x_test
else:
    print('UNRECOGNIZED BINARIZER')

x_train_bin.info() # verbose=True
x_train_bin.head()
#x_train_bin[CONFIG['EXAMPLE_FEATURE']][:10]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 21000 entries, 28465 to 23654
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Amount         21000 non-null  int64 
 1   Sex            21000 non-null  object
 2   Education      21000 non-null  object
 3   MaritalStatus  21000 non-null  object
 4   Age            21000 non-null  int64 
 5   PayStatus_Sep  21000 non-null  object
 6   PayStatus_Aug  21000 non-null  object
 7   PayStatus_Jul  21000 non-null  object
 8   PayStatus_Jun  21000 non-null  object
 9   PayStatus_May  21000 non-null  object
 10  PayStatus_Apr  21000 non-null  object
 11  Bill_Sep       21000 non-null  int64 
 12  Bill_Aug       21000 non-null  int64 
 13  Bill_Jul       21000 non-null  int64 
 14  Bill_Jun       21000 non-null  int64 
 15  Bill_May       21000 non-null  int64 
 16  Bill_Apr       21000 non-null  int64 
 17  PayAmount_Sep  21000 non-null  int64 
 18  PayAmount_Aug  21000 n

Unnamed: 0,Amount,Sex,Education,MaritalStatus,Age,PayStatus_Sep,PayStatus_Aug,PayStatus_Jul,PayStatus_Jun,PayStatus_May,...,Bill_Jul,Bill_Jun,Bill_May,Bill_Apr,PayAmount_Sep,PayAmount_Aug,PayAmount_Jul,PayAmount_Jun,PayAmount_May,PayAmount_Apr
28465,240000,F,Grad,Married,40,Unk,Unk,Unk,Unk,Unk,...,0,0,0,0,0,0,0,0,0,0
27622,50000,F,Grad,Single,23,Clear,Clear,Clear,Clear,Clear,...,2299,4800,9810,660,2548,2321,4800,9810,660,2980
28376,50000,F,Univ,Married,36,Two,Two,Two,Two,Zero,...,49125,47956,43578,35126,0,4700,0,2004,3500,0
10917,200000,F,HS,Married,54,Six,Five,Four,Three,Two,...,104686,102549,101400,0,0,0,0,0,0,0
27234,240000,M,Grad,Married,35,Clear,Clear,Clear,Zero,Clear,...,21790,17102,13367,22659,2017,21817,1120,13434,22772,22820


In [10]:
x_train_bin = x_train_bin.reset_index(drop = True)

### Rule Induction

In [11]:
start_time = time.time()
print('Starting training for', CONFIG['ALGO'])

if CONFIG['ALGO'] == 'BRCG':
    estimator = BooleanRuleCG() # Explainer()
    # estimator.train(x_train, y_train)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        estimator.fit(x_train_bin, y_train)
elif CONFIG['ALGO'] == 'RIPPER':
    
    estimator = Ripper()
    estimator.fit(x_train_bin, y_train, pos_value=CONFIG['POS_CLASS'])
elif CONFIG['ALGO'] == 'GLRM':
    linear_model = LinearRuleRegression() # lambda0=0.0005,lambda1=0.0001
    explainer = GLRMExplainer(linear_model)
    explainer.fit(x_train_bin, y_train)
elif CONFIG['ALGO'] == 'CORELS':
    estimator = CorelsClassifier(n_iter=10000, 
                     max_card=2, # feautres per statement
                     c = 0.0001 # Higher values penalise longer rulelists
                    )
    estimator.fit(x_train_bin, y_train , prediction_name = CONFIG["TARGET_LABEL"])
    
else:
    print('Unrecognized algorithm:', CONFIG['ALGO'])

end_time = time.time()
print('Training time: ' + str(end_time - start_time))

Starting training for RIPPER
Training time: 6.287001132965088


### Evaluation

In [12]:
if CONFIG['TYPE'] == 'BINARY':
    y_pred = estimator.predict(x_test_bin)
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
    print('Recall:', recall_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
    print('F1', f1_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
    print('ConfusionMatrix', confusion_matrix(y_test, y_pred))
    print('F-2', fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2))
   
   
elif CONFIG['TYPE'] == 'CONTINUOUS':
    y_pred = explainer.predict(x_test_bin)
    print(f'R2 Score = {r2_score(y_test, y_pred)}')
    print(f'Explained Variance = {explained_variance_score(y_test, y_pred)}')
    print(f'Mean abs. error = {mean_absolute_error(y_test, y_pred)}')
    print(f'Max error = {max_error(y_test, y_pred)}')
    


Accuracy: 0.8194444444444444
Balanced accuracy: 0.6487824675324676
Precision: 0.6637341153470185
Recall: 0.3464285714285714
F1 0.45524639624539054
ConfusionMatrix [[6696  344]
 [1281  679]]
F-2 0.3830531422768814


In [13]:
#XGboost for Binary Classification
if CONFIG['TYPE'] == 'BINARY' and CONFIG['BASELINE'] == True:
    
    regex = re.compile(r"\[|\]|<", re.IGNORECASE)
    x_train_bin.columns = [regex.sub("_", str(col)) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in x_train_bin.columns.values]
    x_test_bin.columns = [regex.sub("_", str(col)) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in x_test_bin.columns.values]

    xgb_cl = xgb.XGBClassifier()
    xgb_cl.fit(x_train_bin, y_train)
    preds = xgb_cl.predict(x_test_bin)     
    print('Accuracy:', accuracy_score(y_test, preds))
    print('Balanced accuracy:', balanced_accuracy_score(y_test, preds))
    print('Precision:', precision_score(y_test, preds, pos_label=CONFIG['POS_CLASS']))
    print('Recall:', recall_score(y_test, preds, pos_label=CONFIG['POS_CLASS']))
    print('F-2', fbeta_score(y_test, preds, pos_label=CONFIG['POS_CLASS'], beta= 2))

ValueError: DataFrame.dtypes for data must be int, float or bool.
                Did not expect the data types in fields Sex, Education, MaritalStatus, PayStatus_Sep, PayStatus_Aug, PayStatus_Jul, PayStatus_Jun, PayStatus_May, PayStatus_Apr

In [14]:
if CONFIG['TYPE'] == 'CONTINUOUS':
    explanation = explainer.explain()
    print(explanation)
elif CONFIG['ALGO'] == 'BRCG':
    model = estimator.explain()
    if not model['isCNF']:
        print('Number of rules:', len(model['rules']))
        print(model['rules'])
elif CONFIG['ALGO'] == 'RIPPER':
    print('Rule count: ' + str(sum([len(rules) for rules in estimator.rule_map.values()])))
    print('Rule set:')
    print(estimator.rule_list_to_pretty_string())
elif CONFIG['ALGO'] == 'CORELS':
    r_length = len(estimator.rl().rules)
    print("Rule Length:", r_length)
    for i in range(len(estimator.rl().rules[0]["antecedents"])):
        an = len(estimator.rl().rules[i]["antecedents"])
        print(f"Antecedents Length Rule {i}:" , an)

# uncomment the following line for a full optimized view of the model as data frame for GLRM rules
# explanation.style

Rule count: 8
Rule set:

if {
	(PayStatus_Aug == Two and PayStatus_Sep == Three) or
	(PayStatus_Sep == Two) or
	(PayStatus_Sep == One and PayStatus_Jul == Two and PayStatus_Apr == Two) or
	(PayAmount_Sep == 0 and PayStatus_Sep == One and Amount == 20000 and PayAmount_Jun == 0) or
	(PayStatus_Aug == Two and PayStatus_Sep == Two and MaritalStatus == Married) or
	(PayAmount_Sep == 0 and PayStatus_Sep == One and Sex == M and Bill_Jul == 0 and PayStatus_Apr == Unk and Education == Grad and Age == 43) or
	(PayAmount_Sep == 0 and PayStatus_Sep == One and PayStatus_Apr == Unk and Amount == 360000) or
	(PayStatus_Sep == Two and PayStatus_Aug == Zero and PayStatus_May == Two and MaritalStatus == Married)
} then 1
else 0
