# Generic Rule Induction Notebook

Continuoulsy refined.

## README

- GLRM needs to run in proper (conda) aix360 environment
- BRCG runs with proper (conda) aix360 environment

### Configuration


In [174]:
CONFIG1 = { 
    'NAME' : 'binary-churn-quantile-brcg', 
    'DATA_SET': '../data/churn_prob_out_35.csv',
    'DATA_TYPES': {'Children': float, 'RatePlan': str},
    'DROP': ['Id', 'pChurn', '3_Class', '5_Class', 'is_test_set'],
    'MODE': 'PREDICTIVE',
    'TRAIN_TEST_SPLIT': 0.3, # 'FIXED' for using 'is_test_set'
    'BINARIZER': 'QUANTILE',
    'ALGO': 'BRCG',
    'TARGET_LABEL': 'CHURN',
    'TYPE' : 'BINARY',
    'EXAMPLE_FEATURE' : 'Est Income',
    'POS_CLASS': 'T'
     }
CONFIG2 = { 
    'NAME' : 'continuous-churn', 
    'DATA_SET': 'data/churn_prob_out_35.csv',
    'DATA_TYPES': {'Children': float, 'RatePlan': str},
    'DROP': ['Id', 'CHURN', '3_Class', '5_Class', 'is_test_set'],
    'MODE': 'PREDICTIVE',
    'TRAIN_TEST_SPLIT': 0.3, # 'FIXED' for using 'is_test_set'
    'BINARIZER': 'QUANTILE',
    'ALGO': 'GLRM',
    'TARGET_LABEL': 'pChurn',
    'TYPE' : 'CONTINUOUS',
    'EXAMPLE_FEATURE' : 'Est Income',
    'POS_CLASS': None
     }
CONFIG3 = { 
    'NAME' : 'bike-demand', 
    'DATA_SET': '../data/SeoulBikeData.csv',
    'DATA_TYPES': {'Rented Bike Count': float, 'Hour': float, 'Humidity': float, 'Visibility (10m)': float, 'RatePlan': str},
    'DROP': ['Date'],
    'MODE': 'PREDICTIVE',
    'TRAIN_TEST_SPLIT': 0.3,
    'BINARIZER': 'QUANTILE',
    'ALGO': 'GLRM',
    'TARGET_LABEL': 'Rented Bike Count',
    'TYPE' : 'CONTINUOUS',
    'EXAMPLE_FEATURE' : 'Dew point temperature(C)',
    'POS_CLASS': None
     }
CONFIG4 = { 
    'NAME' : 'heloc', 
    'DATA_SET': '../data/heloc.csv',
    'DATA_TYPES': {},
    'DROP': ['RiskPerformance'],
    'MODE': 'PREDICTIVE',
    'TRAIN_TEST_SPLIT': 0.3,
    'BINARIZER': 'QUANTILE',
    'ALGO': 'GLRM',
    'TARGET_LABEL': 'Probabilities',
    'TYPE' : 'CONTINUOUS',
    'EXAMPLE_FEATURE' : 'ExternalRiskEstimate',
    'POS_CLASS': None
     }
CONFIG5 = { 
    'NAME' : 'taiwan-credit', 
    'DATA_SET': '../data/TaiwanCreditData.csv',
    'DATA_TYPES': {},
    'DROP': ['DefaultNextMonth'],
    'MODE': 'PREDICTIVE',
    'TRAIN_TEST_SPLIT': 0.3,
    'BINARIZER': 'QUANTILE',
    'ALGO': 'GLRM',
    'TARGET_LABEL': 'Probabilities',
    'TYPE' : 'CONTINUOUS',
    'EXAMPLE_FEATURE' : 'Amount',
    'POS_CLASS': None
     }

CONFIG = CONFIG5
print('Proceed with configuration:', CONFIG['NAME'])

Proceed with configuration: taiwan-credit


In [175]:
import pandas as pd
import numpy as np
# import os
from sklearn.model_selection import train_test_split #, GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, balanced_accuracy_score, confusion_matrix, r2_score, explained_variance_score, mean_absolute_error, max_error
from sklearn.ensemble import GradientBoostingRegressor
# from ripper.Ripper import Ripper
import time
import warnings

from aix360.algorithms.rbm import BRCGExplainer, BooleanRuleCG, GLRMExplainer, LinearRuleRegression
from aix360.algorithms.rbm import FeatureBinarizer, FeatureBinarizerFromTrees

# from explainer import Explainer

# TODO make imports dependent on CONFIG
# TODO create reference for performance using boosted trees

# import wittgenstein as lw
# from clf_utils import make_tree_dataset, make_forest, score_forest
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder


### Data

In [176]:
def convert(char):
    if char == CONFIG['POS_CLASS']:
        return 1
    else:
        return 0

df = pd.read_csv(CONFIG['DATA_SET'],dtype=CONFIG['DATA_TYPES'])
df = df.drop(columns=CONFIG['DROP'])
if CONFIG['ALGO'] == 'BRCG':
    df[CONFIG['TARGET_LABEL']] = df[CONFIG['TARGET_LABEL']].map(convert) 
    # maybe this could also be achieved through explicit binarization of target vector
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Amount         30000 non-null  int64  
 1   Sex            30000 non-null  object 
 2   Education      30000 non-null  object 
 3   MaritalStatus  30000 non-null  object 
 4   Age            30000 non-null  int64  
 5   PayStatus_Sep  30000 non-null  object 
 6   PayStatus_Aug  30000 non-null  object 
 7   PayStatus_Jul  30000 non-null  object 
 8   PayStatus_Jun  30000 non-null  object 
 9   PayStatus_May  30000 non-null  object 
 10  PayStatus_Apr  30000 non-null  object 
 11  Bill_Sep       30000 non-null  int64  
 12  Bill_Aug       30000 non-null  int64  
 13  Bill_Jul       30000 non-null  int64  
 14  Bill_Jun       30000 non-null  int64  
 15  Bill_May       30000 non-null  int64  
 16  Bill_Apr       30000 non-null  int64  
 17  PayAmount_Sep  30000 non-null  int64  
 18  PayAmo

Unnamed: 0,Amount,Sex,Education,MaritalStatus,Age,PayStatus_Sep,PayStatus_Aug,PayStatus_Jul,PayStatus_Jun,PayStatus_May,...,Bill_Jun,Bill_May,Bill_Apr,PayAmount_Sep,PayAmount_Aug,PayAmount_Jul,PayAmount_Jun,PayAmount_May,PayAmount_Apr,Probabilities
0,20000,F,Univ,Married,24,Two,Two,Clear,Clear,Unk,...,0,0,0,0,689,0,0,0,0,0.786945
1,120000,F,Univ,Single,26,Clear,Two,Zero,Zero,Zero,...,3272,3455,3261,0,1000,1000,1000,0,2000,0.420861
2,90000,F,Univ,Single,34,Zero,Zero,Zero,Zero,Zero,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0.134458
3,50000,F,Univ,Married,37,Zero,Zero,Zero,Zero,Zero,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0.128739
4,50000,M,Univ,Married,57,Clear,Zero,Clear,Zero,Zero,...,20940,19146,19131,2000,36681,10000,9000,689,679,0.095658


In [177]:
df[CONFIG['TARGET_LABEL']].describe()

count    30000.000000
mean         0.221216
std          0.214078
min          0.003548
25%          0.075355
50%          0.136818
75%          0.280786
max          0.997406
Name: Probabilities, dtype: float64

### Train, Test Split

In [178]:
if CONFIG['TRAIN_TEST_SPLIT'] == 'FIXED':
    if CONFIG['MODE'] == 'PREDICTIVE':
        train = df[df['is_test_set'] == False]
        test = df[df['is_test_set'] == True]
    elif CONFIG['MODE'] == 'DESCRIPTIVE':
        train = df
        test = df

    train = train.drop(columns=['is_test_set'])
    test = test.drop(columns=['is_test_set'])

    y_train = train[CONFIG['TARGET_LABEL']]
    x_train = train.drop(columns=[CONFIG['TARGET_LABEL']])

    y_test = test[CONFIG['TARGET_LABEL']]
    x_test = test.drop(columns=[CONFIG['TARGET_LABEL']])
else:
    x_train, x_test, y_train, y_test = train_test_split(df.drop(columns=[CONFIG['TARGET_LABEL']]), df[CONFIG['TARGET_LABEL']], test_size=CONFIG['TRAIN_TEST_SPLIT'], random_state=42)

print('Training:', x_train.shape, y_train.shape)
print('Test:', x_test.shape, y_test.shape)

Training: (21000, 23) (21000,)
Test: (9000, 23) (9000,)


### Reference Performance

In [179]:
if CONFIG['TYPE'] == 'CONTINUOUS':
    print('needs prior encoding of categoricals')
    # gbr = GradientBoostingRegressor(n_estimators=500, random_state=0)
    # gbr.fit(x_train, y_train)
    # # print('Training R^2:', r2_score(yTrain, gbr.predict(dfTrain)))
    # print('Test R^2:', r2_score(y_test, gbr.predict(x_test)))

needs prior encoding of categoricals


### Binarization

In [180]:
if CONFIG['BINARIZER'] == 'TREES':
    binarizer =  FeatureBinarizerFromTrees(negations=True, randomState=42) # FeatureBinarizer(negations=False), FeatureBinarizerFromTrees(negations=True, randomState=42)
    binarizer = binarizer.fit(x_train, y_train)
    x_train_bin = binarizer.transform(x_train) #  x_train_bin = binarizer.fit_transform(x_train)
    x_test_bin = binarizer.transform(x_test) #  X_fb = self.fb.fit_transform(X_train)
elif CONFIG['BINARIZER'] == 'QUANTILE':
    binarizer =  FeatureBinarizer(numThresh=9,negations=True) # FeatureBinarizer(negations=False), FeatureBinarizerFromTrees(negations=True, randomState=42)
    binarizer = binarizer.fit(x_train)
    x_train_bin = binarizer.transform(x_train) #  x_train_bin = binarizer.fit_transform(x_train)
    x_test_bin = binarizer.transform(x_test) #  X_fb = self.fb.fit_transform(X_train)  
else:
    print('UNRECOGNIZED BINARIZER')

x_train_bin.info() # verbose=True
x_train_bin.head()
x_train_bin[CONFIG['EXAMPLE_FEATURE']][:10]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 21000 entries, 28465 to 23654
Columns: 396 entries, ('Amount', '<=', 30000.0) to ('PayAmount_Apr', '>', 9500.0)
dtypes: int64(396)
memory usage: 63.6 MB


operation,<=,<=,<=,<=,<=,<=,<=,<=,<=,>,>,>,>,>,>,>,>,>
value,30000.0,50000.0,70000.0,100000.0,140000.0,180000.0,210000.0,270000.0,360000.0,30000.0,50000.0,70000.0,100000.0,140000.0,180000.0,210000.0,270000.0,360000.0
28465,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,0,0
27622,0,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0
28376,0,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0
10917,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,0,0,0
27234,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,0,0
15171,0,0,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
962,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,0,0,0
26240,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,0,0
13175,0,0,0,0,1,1,1,1,1,1,1,1,1,0,0,0,0,0
27087,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0


### Rule Induction

In [181]:
start_time = time.time()

if CONFIG['ALGO'] == 'BRCG':
    estimator = BooleanRuleCG() # Explainer()
    # estimator.train(x_train, y_train)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        estimator.fit(x_train_bin, y_train)
elif CONFIG['ALGO'] == 'RIPPER':
    print('TODO plugin RIPPER here')
    # Ripper(d=2, k=2, pruning_threshold=50, mode='python')
elif CONFIG['ALGO'] == 'GLRM':
    linear_model = LinearRuleRegression() # lambda0=0.0005,lambda1=0.0001
    explainer = GLRMExplainer(linear_model)
    explainer.fit(x_train_bin, y_train)
else:
    print('Unrecognized algorithm:', CONFIG['ALGO'])

end_time = time.time()
print('training time: ' + str(end_time - start_time))

training time: 48.53972887992859


### Evaluation

In [182]:
if CONFIG['TYPE'] == 'BINARY':
    # compute predictive accuracy
    # print(estimator.dnf_rules.items())
    y_pred = estimator.predict(x_test_bin)
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy:', accuracy)
    # print("Accuracy: %.2f%%" % (accuracy * 100.0))
    # print('Accuracy 2:', accuracy_score(y_test, estimator.predict(x_test)))
    # print('Accuracy 3', estimator.score(x_test, y_test))
    # print('Accuracy 4', estimator.score(x_test, y_test))

    # res = estimator.score(x_test, y_test) # this score function is defect
    # print('acc: ' + str(res))
    # print('rule count: ' + str(sum([len(rules) for rules in estimator.rule_map.values()])))
    model = estimator.explain()
    if not model['isCNF']:
        print('Number of rules:', len(model['rules']))
        print(model['rules'])
    # estimator.rule_map
    # print(estimator.rule_list_to_pretty_string())
elif CONFIG['TYPE'] == 'CONTINUOUS':
    y_pred = explainer.predict(x_test_bin)
    print(f'R2 Score = {r2_score(y_test, y_pred)}')
    print(f'Explained Variance = {explained_variance_score(y_test, y_pred)}')
    print(f'Mean abs. error = {mean_absolute_error(y_test, y_pred)}')
    print(f'Max error = {max_error(y_test, y_pred)}')


R2 Score = 0.8122197549644137
Explained Variance = 0.8122223915580278
Mean abs. error = 0.05835766159744946
Max error = 0.7620149044431979


In [183]:
if CONFIG['TYPE'] == 'CONTINUOUS':
    explanation = explainer.explain()
    print(explanation)

# uncomment the following line for a full optimized view of the model as data frame
# explanation.style

                                                 rule coefficient
0                                         (intercept)    0.673464
1   PayStatus_Sep != Three AND PayStatus_Sep != Tw...   -0.161679
2       PayStatus_Sep != Two AND PayStatus_Jul != Two   -0.114607
3   PayStatus_Sep != Three AND PayStatus_Sep != Tw...   -0.110643
4   PayStatus_Sep != Three AND PayStatus_Sep != Tw...   -0.108837
5       PayStatus_Aug != Two AND PayStatus_Jun != Two  -0.0808469
6   Amount > 30000.00 AND PayStatus_Sep != Three A...  -0.0664476
7   Education != Unk1 AND PayStatus_Sep != Unk AND...   0.0655708
8   PayStatus_Sep != Three AND PayStatus_Sep != Tw...  -0.0643697
9   PayStatus_Sep != Three AND PayStatus_Sep != Tw...  -0.0530466
10  PayStatus_Sep != Three AND PayStatus_Sep != Tw...  -0.0426462
11                                Amount <= 140000.00   0.0390639
12  MaritalStatus != Married AND PayStatus_May != Two  -0.0312586
13                           PayAmount_Sep <= 4317.00   0.0300223
14  PaySta