# Generic Rule Induction Notebook

Continuously refined.

## README

- GLRM needs to run in the proper (conda) `aix360` environment.
- BRCG runs in the proper (conda) `aix360` environment.
- Use the `aix360i` environment for RIPPER.

### Configuration


In [1]:
# Experiment configurations. Each CONFIGn is a dict with the same keys:
#   NAME             label printed when the configuration is selected
#   DATA_SET         path of the CSV file to load
#   DATA_TYPES       column -> dtype mapping handed to pd.read_csv
#   DROP             columns removed right after loading
#   MODE             'PREDICTIVE' or 'DESCRIPTIVE' (only consulted for a 'FIXED' split)
#   TRAIN_TEST_SPLIT test fraction for a random split, or 'FIXED' to split on the
#                    'is_test_set' column (which must then NOT be listed in DROP)
#   BINARIZER        'QUANTILE', 'NATIVE' (no binarization) or a tree-based binarizer
#   ALGO             'BRCG', 'RIPPER' or 'GLRM'
#   TARGET_LABEL     name of the target column
#   TYPE             'BINARY' or 'CONTINUOUS'
#   EXAMPLE_FEATURE  feature previewed after binarization
#   POS_CLASS        value of the positive class (None for continuous targets)
CONFIG1 = {
    'NAME': 'binary-churn-quantile-brcg',
    'DATA_SET': '../data/churn_prob_out_35.csv',
    'DATA_TYPES': {'Children': float, 'RatePlan': str},
    'DROP': ['Id', 'pChurn', '3_Class', '5_Class', 'is_test_set'],
    'MODE': 'PREDICTIVE',
    'TRAIN_TEST_SPLIT': 0.3,  # 'FIXED' for using 'is_test_set' (remove it from 'DROP' first)
    'BINARIZER': 'QUANTILE',
    'ALGO': 'BRCG',
    'TARGET_LABEL': 'CHURN',
    'TYPE': 'BINARY',
    'EXAMPLE_FEATURE': 'Est Income',
    'POS_CLASS': 'T',
}
CONFIG2 = {
    'NAME': 'continuous-churn',
    # Fixed: was 'data/...', inconsistent with every other configuration
    # (CONFIG1 and CONFIG8 read the very same file from '../data/').
    'DATA_SET': '../data/churn_prob_out_35.csv',
    'DATA_TYPES': {'Children': float, 'RatePlan': str},
    'DROP': ['Id', 'CHURN', '3_Class', '5_Class', 'is_test_set'],
    'MODE': 'PREDICTIVE',
    'TRAIN_TEST_SPLIT': 0.3,  # 'FIXED' for using 'is_test_set' (remove it from 'DROP' first)
    'BINARIZER': 'QUANTILE',
    'ALGO': 'GLRM',
    'TARGET_LABEL': 'pChurn',
    'TYPE': 'CONTINUOUS',
    'EXAMPLE_FEATURE': 'Est Income',
    'POS_CLASS': None,
}
CONFIG3 = {
    'NAME': 'bike-demand',
    'DATA_SET': '../data/SeoulBikeData.csv',
    # NOTE(review): 'RatePlan' looks copied over from the churn configurations - confirm.
    'DATA_TYPES': {'Rented Bike Count': float, 'Hour': float, 'Humidity': float, 'Visibility (10m)': float, 'RatePlan': str},
    'DROP': ['Date'],
    'MODE': 'PREDICTIVE',
    'TRAIN_TEST_SPLIT': 0.3,
    'BINARIZER': 'QUANTILE',
    'ALGO': 'GLRM',
    'TARGET_LABEL': 'Rented Bike Count',
    'TYPE': 'CONTINUOUS',
    'EXAMPLE_FEATURE': 'Dew point temperature(C)',
    'POS_CLASS': None,
}
CONFIG4 = {
    'NAME': 'heloc',
    'DATA_SET': '../data/heloc.csv',
    'DATA_TYPES': {},
    'DROP': ['RiskPerformance'],
    'MODE': 'PREDICTIVE',
    'TRAIN_TEST_SPLIT': 0.3,
    'BINARIZER': 'QUANTILE',
    'ALGO': 'GLRM',
    'TARGET_LABEL': 'Probabilities',
    'TYPE': 'CONTINUOUS',
    'EXAMPLE_FEATURE': 'ExternalRiskEstimate',
    'POS_CLASS': None,
}
CONFIG5 = {
    'NAME': 'taiwan-credit',
    'DATA_SET': '../data/TaiwanCreditData.csv',
    'DATA_TYPES': {},
    'DROP': ['DefaultNextMonth'],
    'MODE': 'PREDICTIVE',
    'TRAIN_TEST_SPLIT': 0.3,
    'BINARIZER': 'QUANTILE',
    'ALGO': 'GLRM',
    'TARGET_LABEL': 'Probabilities',
    'TYPE': 'CONTINUOUS',
    'EXAMPLE_FEATURE': 'Amount',
    'POS_CLASS': None,
}
CONFIG6 = {
    'NAME': 'german-credit-brcg',
    'DATA_SET': '../data/german_credit_codiert.csv',
    'DATA_TYPES': {},
    'DROP': ['Index'],
    'MODE': 'PREDICTIVE',
    'TRAIN_TEST_SPLIT': 0.3,
    'BINARIZER': 'QUANTILE',
    'ALGO': 'BRCG',
    'TARGET_LABEL': 'Target',
    'TYPE': 'BINARY',
    'EXAMPLE_FEATURE': 'Credit Amount',
    'POS_CLASS': 1,
}
CONFIG7 = {
    'NAME': 'german-credit-ripper',
    'DATA_SET': '../data/german_credit_codiert.csv',
    'DATA_TYPES': {},
    'DROP': ['Index'],
    'MODE': 'PREDICTIVE',
    'TRAIN_TEST_SPLIT': 0.3,
    'BINARIZER': 'NATIVE',
    'ALGO': 'RIPPER',
    'TARGET_LABEL': 'Target',
    'TYPE': 'BINARY',
    'EXAMPLE_FEATURE': 'Credit Amount',
    'POS_CLASS': 1,
}
CONFIG8 = {
    'NAME': 'binary-churn-ripper',
    'DATA_SET': '../data/churn_prob_out_35.csv',
    'DATA_TYPES': {'Children': float, 'RatePlan': str},
    'DROP': ['Id', 'pChurn', '3_Class', '5_Class', 'is_test_set'],
    'MODE': 'PREDICTIVE',
    'TRAIN_TEST_SPLIT': 0.3,  # 'FIXED' for using 'is_test_set' (remove it from 'DROP' first)
    'BINARIZER': 'NATIVE',
    'ALGO': 'RIPPER',
    'TARGET_LABEL': 'CHURN',
    'TYPE': 'BINARY',
    'EXAMPLE_FEATURE': 'Est Income',
    'POS_CLASS': 'T',
}
CONFIG9 = {
    'NAME': 'compas-ripper',
    'DATA_SET': '../data/compas.csv',
    'DATA_TYPES': {},
    'DROP': [],
    'MODE': 'PREDICTIVE',
    'TRAIN_TEST_SPLIT': 0.3,
    'BINARIZER': 'NATIVE',
    'ALGO': 'RIPPER',
    'TARGET_LABEL': 'recidivate-within-two-years',
    'TYPE': 'BINARY',
    'EXAMPLE_FEATURE': 'current-charge-degree',
    'POS_CLASS': 0,
}

# Active configuration: every later cell reads its settings from CONFIG.
# NOTE(review): the outputs saved with this notebook appear to stem from CONFIG8
# ('binary-churn-ripper'); restart the kernel and re-run all cells after changing the selection.
CONFIG = CONFIG9

print('Proceed with configuration:', CONFIG['NAME'])

Proceed with configuration: binary-churn-ripper


In [2]:
import pandas as pd
import numpy as np
# import os
from sklearn.model_selection import train_test_split #, GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, balanced_accuracy_score, confusion_matrix, r2_score, explained_variance_score, mean_absolute_error, max_error
from sklearn.ensemble import GradientBoostingRegressor
import time
import warnings

from aix360.algorithms.rbm import BRCGExplainer, BooleanRuleCG, GLRMExplainer, LinearRuleRegression
# Both binarizers come from the aix360 module imported on the line above, so they are
# imported unconditionally. The former guard tested CONFIG['BINARIZER'] == 'TREE',
# which could leave FeatureBinarizerFromTrees undefined for a tree-based configuration.
from aix360.algorithms.rbm import FeatureBinarizer, FeatureBinarizerFromTrees
# RIPPER lives in the separate aix360i package, so it is only imported when selected.
if CONFIG['ALGO'] == 'RIPPER':
    from aix360i.algorithms.rule_induction.ripper import Ripper



# from explainer import Explainer

# TODO create reference for performance using boosted trees

# import wittgenstein as lw
# from clf_utils import make_tree_dataset, make_forest, score_forest
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder


Importing dev version v0.981 of RIPPER


### Data

In [3]:
def convert(char):
    """Return 1 if `char` equals CONFIG['POS_CLASS'], otherwise 0."""
    return 1 if char == CONFIG['POS_CLASS'] else 0

# Load the configured CSV with the configured dtypes and discard the unused columns.
target_label = CONFIG['TARGET_LABEL']
df = pd.read_csv(CONFIG['DATA_SET'], dtype=CONFIG['DATA_TYPES']).drop(columns=CONFIG['DROP'])

# Only for BRCG the target column is re-encoded to 1 (positive class) / 0 (anything else).
# maybe this could also be achieved through explicit binarization of target vector
if CONFIG['ALGO'] == 'BRCG':
    df[target_label] = df[target_label].map(convert)

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1799 entries, 0 to 1798
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CHURN       1799 non-null   object 
 1   Gender      1799 non-null   object 
 2   Status      1799 non-null   object 
 3   Children    1799 non-null   float64
 4   Est Income  1799 non-null   float64
 5   Car Owner   1799 non-null   object 
 6   Age         1799 non-null   float64
 7   Paymethod   1799 non-null   object 
 8   Usage       1799 non-null   float64
 9   RatePlan    1799 non-null   object 
dtypes: float64(4), object(6)
memory usage: 140.7+ KB


Unnamed: 0,CHURN,Gender,Status,Children,Est Income,Car Owner,Age,Paymethod,Usage,RatePlan
0,T,F,S,1.0,38000.0,N,24.393333,CC,229.64,3
1,F,M,M,2.0,29616.0,N,49.426667,CH,75.29,2
2,F,M,M,0.0,19732.8,N,50.673333,CC,47.25,3
3,F,M,S,2.0,96.33,N,56.473333,CC,59.01,1
4,F,F,M,2.0,52004.8,N,25.14,CH,28.14,1


In [4]:
# Summarise the target column: class counts for binary problems,
# descriptive statistics for continuous ones.
if CONFIG['TYPE'] == 'BINARY':
    print(df[CONFIG['TARGET_LABEL']].value_counts())
elif CONFIG['TYPE'] == 'CONTINUOUS':
    # An expression nested inside an if-block is not auto-displayed by the notebook
    # (only a top-level last expression is), so the summary has to be printed explicitly.
    print(df[CONFIG['TARGET_LABEL']].describe())
else:
    print('Unrecognized problem type')

F    1076
T     723
Name: CHURN, dtype: int64


### Train, Test Split

In [5]:
# Build the train/test partitions.
# - TRAIN_TEST_SPLIT == 'FIXED': split on the precomputed 'is_test_set' column
#   (MODE 'PREDICTIVE') or train and test on the full data (MODE 'DESCRIPTIVE').
# - otherwise: TRAIN_TEST_SPLIT is used as test_size of a seeded random split.
if CONFIG['TRAIN_TEST_SPLIT'] == 'FIXED':
    # Fail early with a readable message instead of an opaque KeyError further down.
    if 'is_test_set' not in df.columns:
        raise KeyError(
            "TRAIN_TEST_SPLIT == 'FIXED' requires an 'is_test_set' column in the data "
            "(check that it is not listed in CONFIG['DROP'])"
        )

    if CONFIG['MODE'] == 'PREDICTIVE':
        train = df[df['is_test_set'] == False]
        test = df[df['is_test_set'] == True]
    elif CONFIG['MODE'] == 'DESCRIPTIVE':
        train = df
        test = df
    else:
        # Previously an unknown mode fell through to a NameError on `train`.
        raise ValueError('Unrecognized mode: ' + repr(CONFIG['MODE']))

    train = train.drop(columns=['is_test_set'])
    test = test.drop(columns=['is_test_set'])

    y_train = train[CONFIG['TARGET_LABEL']]
    x_train = train.drop(columns=[CONFIG['TARGET_LABEL']])

    y_test = test[CONFIG['TARGET_LABEL']]
    x_test = test.drop(columns=[CONFIG['TARGET_LABEL']])
else:
    x_train, x_test, y_train, y_test = train_test_split(
        df.drop(columns=[CONFIG['TARGET_LABEL']]),
        df[CONFIG['TARGET_LABEL']],
        test_size=CONFIG['TRAIN_TEST_SPLIT'],
        random_state=42,  # fixed seed keeps the split reproducible across runs
    )

print('Training:', x_train.shape, y_train.shape)
print('Test:', x_test.shape, y_test.shape)

Training: (1259, 9) (1259,)
Test: (540, 9) (540,)


### Reference Performance

In [6]:
# Placeholder for a gradient-boosting reference score on continuous targets.
# The model is not fitted yet; the cell only prints a reminder of what is missing.
is_continuous_target = CONFIG['TYPE'] == 'CONTINUOUS'
if is_continuous_target:
    print('needs prior encoding of categoricals')
    # Sketch of the intended reference model (kept disabled):
    # gbr = GradientBoostingRegressor(n_estimators=500, random_state=0)
    # gbr.fit(x_train, y_train)
    # # print('Training R^2:', r2_score(yTrain, gbr.predict(dfTrain)))
    # print('Test R^2:', r2_score(y_test, gbr.predict(x_test)))

### Binarization

In [7]:
# Turn the raw feature frames into the representation the selected learner is fed with.
# The binarizers are fitted on the training data only and then applied to both partitions.
if CONFIG['BINARIZER'] in ('TREE', 'TREES'):
    # Both spellings of the tree-based option are accepted.
    binarizer = FeatureBinarizerFromTrees(negations=True, randomState=42).fit(x_train, y_train)
    x_train_bin = binarizer.transform(x_train)
    x_test_bin = binarizer.transform(x_test)
elif CONFIG['BINARIZER'] == 'QUANTILE':
    binarizer = FeatureBinarizer(numThresh=9, negations=True).fit(x_train)
    x_train_bin = binarizer.transform(x_train)
    x_test_bin = binarizer.transform(x_test)
elif CONFIG['BINARIZER'] == 'NATIVE':
    # No binarization: the learner receives the original columns.
    x_train_bin = x_train
    x_test_bin = x_test
else:
    # Stop here: printing a message and carrying on would only crash below on the
    # undefined x_train_bin (or silently reuse a stale one from an earlier run).
    raise ValueError('Unrecognized binarizer: ' + repr(CONFIG['BINARIZER']))

x_train_bin.info() # verbose=True
# A bare expression in the middle of a cell is not rendered, so display it explicitly.
display(x_train_bin.head())
x_train_bin[CONFIG['EXAMPLE_FEATURE']][:10]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1259 entries, 513 to 1126
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      1259 non-null   object 
 1   Status      1259 non-null   object 
 2   Children    1259 non-null   float64
 3   Est Income  1259 non-null   float64
 4   Car Owner   1259 non-null   object 
 5   Age         1259 non-null   float64
 6   Paymethod   1259 non-null   object 
 7   Usage       1259 non-null   float64
 8   RatePlan    1259 non-null   object 
dtypes: float64(4), object(5)
memory usage: 98.4+ KB


513     98217.10
81      12686.00
596      8204.72
380     27498.10
1523     8073.11
84      60102.70
1666     8073.11
1497    74530.80
1200    28267.00
1381    16938.50
Name: Est Income, dtype: float64

### Rule Induction

In [8]:
# Fit the configured rule learner on the (binarized) training data and report the wall time.
# BRCG and RIPPER leave the fitted model in `estimator`, GLRM in `explainer`.
start_time = time.time()
print('Starting training for', CONFIG['ALGO'])

if CONFIG['ALGO'] == 'BRCG':
    estimator = BooleanRuleCG()
    # Warnings raised while fitting are suppressed to keep the cell output readable.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        estimator.fit(x_train_bin, y_train)
elif CONFIG['ALGO'] == 'RIPPER':
    estimator = Ripper()
    estimator.fit(x_train_bin, y_train, pos_value=CONFIG['POS_CLASS'])
elif CONFIG['ALGO'] == 'GLRM':
    linear_model = LinearRuleRegression() # lambda0=0.0005,lambda1=0.0001
    explainer = GLRMExplainer(linear_model)
    explainer.fit(x_train_bin, y_train)
else:
    # Stop here: merely printing would let the following cells fail on an undefined
    # model, or evaluate a stale one left over from an earlier run.
    raise ValueError('Unrecognized algorithm: ' + repr(CONFIG['ALGO']))

end_time = time.time()
print('Training time: ' + str(end_time - start_time))

Starting training for RIPPER
Training time: 0.34529685974121094


### Evaluation

In [9]:
# Score the fitted model on the held-out test partition.
if CONFIG['TYPE'] == 'BINARY':
    y_pred = estimator.predict(x_test_bin)
    # For BRCG the target column was re-encoded to 0/1 when the data was loaded, so the
    # positive label is 1 there; passing the raw POS_CLASS (e.g. 'T') would make
    # precision_score / recall_score reject it as an unknown label.
    pos_label = 1 if CONFIG['ALGO'] == 'BRCG' else CONFIG['POS_CLASS']
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred, pos_label=pos_label))
    print('Recall:', recall_score(y_test, y_pred, pos_label=pos_label))
elif CONFIG['TYPE'] == 'CONTINUOUS':
    y_pred = explainer.predict(x_test_bin)
    print(f'R2 Score = {r2_score(y_test, y_pred)}')
    print(f'Explained Variance = {explained_variance_score(y_test, y_pred)}')
    print(f'Mean abs. error = {mean_absolute_error(y_test, y_pred)}')
    print(f'Max error = {max_error(y_test, y_pred)}')


Accuracy: 0.924074074074074
Balanced accuracy: 0.9174715909090909
Precision: 0.9282296650717703
Recall: 0.8818181818181818


In [10]:
# Print a human-readable view of the induced model; the format depends on the learner.
if CONFIG['TYPE'] == 'CONTINUOUS':
    # GLRM: explain() returns the model description that is printed below.
    explanation = explainer.explain()
    print(explanation)
elif CONFIG['ALGO'] == 'BRCG':
    model = estimator.explain()
    is_cnf = model['isCNF']
    # The rule listing is only printed when the model is not flagged as CNF.
    if not is_cnf:
        print('Number of rules:', len(model['rules']))
        print(model['rules'])
elif CONFIG['ALGO'] == 'RIPPER':
    # rule_map holds one rule collection per entry; the total is the sum of their sizes.
    rule_count = sum(map(len, estimator.rule_map.values()))
    print('Rule count: ' + str(rule_count))
    print('Rule set:')
    print(estimator.rule_list_to_pretty_string())

# uncomment the following line for a full optimized view of the model as data frame for GLRM rules
# explanation.style

Rule count: 14
Rule set:

if {
	(Usage >= 74.77 and Usage <= 74.77 and RatePlan == 4) or
	(Status == S and Est Income <= 25285.5 and Paymethod == CC and Gender == F) or
	(Gender == M and Est Income >= 63234.5 and Age >= 39.093333 and Age <= 53.28) or
	(Age <= 25.0 and Usage >= 82.08 and Est Income <= 43634.5) or
	(Status == S and Est Income <= 42000.0 and Gender == F and Paymethod == Auto) or
	(Age >= 53.693333 and Gender == M and Est Income >= 75799.9) or
	(Age <= 25.0 and Children <= 1.0 and Gender == F and Usage <= 98.76) or
	(Paymethod == CH and RatePlan == 1 and Age <= 40.313333 and Status == S) or
	(RatePlan == 4 and Age >= 48.373333 and Children <= 0.0 and Est Income >= 77129.1) or
	(Usage >= 46.32 and Est Income >= 35000.0 and Gender == F and Usage <= 55.32 and Est Income <= 35976.5) or
	(Age >= 53.013333 and Usage <= 20.74 and Usage >= 14.98 and Est Income >= 55860.0) or
	(Age <= 40.18 and Gender == F and Status == S and Est Income <= 43000.0) or
	(Children <= 0.0 and Gender =