# Generic Rule Induction Notebook

Continuously refined.

In [1]:
#%store -z acc_list 
#%store -z prec_list 

## README

- GLRM needs to run in proper (conda) aix360 environment
- BRCG runs with proper (conda) aix360 environment
- Use aix360i environment for RIPPER

### Configuration


In [2]:
# Select the experiment configuration that drives every cell below.
from config import config_dict, config_dict_imbalanced

# Alternative (imbalanced data sets):
# CONFIG = config_dict_imbalanced['CONFIG-I4R']
CONFIG = config_dict['CONFIG8']

# Echo the choice so the captured output records which experiment was run.
print('Proceed with configuration:', CONFIG['NAME'])
print(CONFIG)

Proceed with configuration: binary-churn-ripper
{'NAME': 'binary-churn-ripper', 'DATA_SET': '../data/churn_prob_out_35.csv', 'DATA_TYPES': {'Children': <class 'float'>, 'RatePlan': <class 'str'>}, 'DROP': ['Id', 'pChurn', '3_Class', '5_Class', 'is_test_set'], 'MODE': 'PREDICTIVE', 'TRAIN_TEST_SPLIT': 0.3, 'BINARIZER': 'NATIVE', 'ALGO': 'RIPPER', 'TARGET_LABEL': 'CHURN', 'TYPE': 'BINARY', 'EXAMPLE_FEATURE': 'Est Income', 'POS_CLASS': 'T', 'BASELINE': True, 'USECASE': None}


In [3]:
import pandas as pd
import numpy as np
# import os
from sklearn.model_selection import train_test_split #, GridSearchCV
from sklearn.metrics import matthews_corrcoef,fbeta_score,confusion_matrix,f1_score,precision_score, recall_score, accuracy_score, balanced_accuracy_score, confusion_matrix, r2_score, explained_variance_score, mean_absolute_error, max_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from xgboost import XGBClassifier
import time
import warnings
import re

# Import only the binarizer and the rule-induction implementation selected in
# CONFIG. The README above notes that the algorithms live in different conda
# environments (aix360 vs. aix360i), so the unselected packages are never imported.

# Binarizer: the 'NATIVE' setting needs no import (the raw features are used as-is).
if CONFIG['BINARIZER'] == 'QUANTILE':
    from aix360.algorithms.rbm import FeatureBinarizer
elif CONFIG['BINARIZER'] == 'TREES':
    from aix360.algorithms.rbm import FeatureBinarizerFromTrees
# Rule-induction algorithm.
if CONFIG['ALGO'] == 'RIPPER':
    from aix360i.algorithms.rule_induction.ripper import Ripper
elif CONFIG['ALGO'] == 'BRCG':
    from aix360.algorithms.rbm import BooleanRuleCG # BRCGExplainer
elif CONFIG['ALGO'] == 'CORELS':
    # NOTE(review): wildcard import; the training cell relies on it for CorelsClassifier.
    from corels import *
elif CONFIG['ALGO'] == 'R2N':
    import aix360i.algorithms.rule_induction.r2n.r2n_algo as algo
    from aix360i.algorithms.rule_induction.r2n.training import train as train_R2N
elif CONFIG['ALGO'] == 'Witt_RIPPER':
    import wittgenstein as lw
elif CONFIG['ALGO'] == 'GLRM':
    from aix360.algorithms.rbm import GLRMExplainer, LinearRuleRegression


Importing dev version v0.981 of RIPPER


### Data

In [4]:
def convert(char, pos_class=None):
    """Encode a target label as 1 (positive class) or 0 (anything else).

    Args:
        char: A single label value, e.g. one element of the target column.
        pos_class: Label to treat as positive. When omitted (None), the
            configured ``CONFIG['POS_CLASS']`` is used, which is the
            behaviour of the original single-argument helper.

    Returns:
        int: 1 if ``char`` equals the positive label, otherwise 0.
    """
    if pos_class is None:
        # Resolve the global lazily so callers that pass an explicit label
        # (e.g. already-normalised 0/1 targets) do not depend on CONFIG.
        pos_class = CONFIG['POS_CLASS']
    return 1 if char == pos_class else 0

# Read the configured data set, forcing the dtypes listed in the configuration.
df = pd.read_csv(CONFIG['DATA_SET'], dtype=CONFIG['DATA_TYPES'])
print('Read', len(df), 'rows from', CONFIG['DATA_SET'])

# Remove the columns the configuration marks as not to be used.
df = df.drop(columns=CONFIG['DROP'])

if CONFIG['ALGO'] != 'BRCG':  # or CONFIG['ALGO'] == 'CORELS'
    POS_CLASS = CONFIG['POS_CLASS']
else:
    # BRCG trains for value 1 as POS_CLASS, so the target is re-encoded to 0/1.
    print('Normalising target label for BRCG')
    df[CONFIG['TARGET_LABEL']] = df[CONFIG['TARGET_LABEL']].map(convert)
    POS_CLASS = 1

df.info()
df.head()

Read 1799 rows from ../data/churn_prob_out_35.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1799 entries, 0 to 1798
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CHURN       1799 non-null   object 
 1   Gender      1799 non-null   object 
 2   Status      1799 non-null   object 
 3   Children    1799 non-null   float64
 4   Est Income  1799 non-null   float64
 5   Car Owner   1799 non-null   object 
 6   Age         1799 non-null   float64
 7   Paymethod   1799 non-null   object 
 8   Usage       1799 non-null   float64
 9   RatePlan    1799 non-null   object 
dtypes: float64(4), object(6)
memory usage: 140.7+ KB


Unnamed: 0,CHURN,Gender,Status,Children,Est Income,Car Owner,Age,Paymethod,Usage,RatePlan
0,T,F,S,1.0,38000.0,N,24.393333,CC,229.64,3
1,F,M,M,2.0,29616.0,N,49.426667,CH,75.29,2
2,F,M,M,0.0,19732.8,N,50.673333,CC,47.25,3
3,F,M,S,2.0,96.33,N,56.473333,CC,59.01,1
4,F,F,M,2.0,52004.8,N,25.14,CH,28.14,1


In [5]:
# Summarise the target column: class counts for a binary problem,
# descriptive statistics for a continuous one.
if CONFIG['TYPE'] == 'BINARY':
    target_dist = df[CONFIG['TARGET_LABEL']].value_counts()
    print(target_dist)
    # .get() reports 0 instead of raising KeyError when the positive label
    # does not occur in the data at all.
    print('Pos label occurrence:', target_dist.get(POS_CLASS, 0))
elif CONFIG['TYPE'] == 'CONTINUOUS':
    # An expression nested inside a branch is not auto-displayed by the
    # notebook, so the summary has to be printed explicitly.
    print(df[CONFIG['TARGET_LABEL']].describe())
else:
    print('Unrecognized problem type')

F    1076
T     723
Name: CHURN, dtype: int64
Pos label occurrence: 723


### Train, Test Split

In [6]:
# Build x_train / x_test / y_train / y_test either from a fixed, pre-assigned
# split (column 'is_test_set') or from a seeded random split.
if CONFIG['TRAIN_TEST_SPLIT'] == 'FIXED':
    if CONFIG['MODE'] == 'PREDICTIVE':
        # Rows are assigned to train/test by the 'is_test_set' flag.
        train = df[df['is_test_set'] == False]
        test = df[df['is_test_set'] == True]
    elif CONFIG['MODE'] == 'DESCRIPTIVE':
        # Descriptive mode trains and evaluates on the full data set.
        train = df
        test = df
    else:
        # Fail loudly: otherwise 'train'/'test' are undefined (NameError) or,
        # worse, silently reused from an earlier run of this cell.
        raise ValueError(f"Unrecognized MODE for a FIXED split: {CONFIG['MODE']!r}")

    train = train.drop(columns=['is_test_set'])
    test = test.drop(columns=['is_test_set'])

    y_train = train[CONFIG['TARGET_LABEL']]
    x_train = train.drop(columns=[CONFIG['TARGET_LABEL']])

    y_test = test[CONFIG['TARGET_LABEL']]
    x_test = test.drop(columns=[CONFIG['TARGET_LABEL']])
else:
    # TRAIN_TEST_SPLIT is passed to scikit-learn as test_size; the fixed
    # random_state keeps the split reproducible across runs.
    x_train, x_test, y_train, y_test = train_test_split(
        df.drop(columns=[CONFIG['TARGET_LABEL']]),
        df[CONFIG['TARGET_LABEL']],
        test_size=CONFIG['TRAIN_TEST_SPLIT'],
        random_state=42,
    )

print('Training:', x_train.shape, y_train.shape)
print('Test:', x_test.shape, y_test.shape)

Training: (1259, 9) (1259,)
Test: (540, 9) (540,)


### Reference Performance

In [7]:
# Baseline: train an XGBoost classifier on label-encoded features so the rule
# learner's scores can be compared against a black-box reference.
if CONFIG['TYPE'] == 'CONTINUOUS':
    print('needs prior encoding of categoricals')
    # gbr = GradientBoostingRegressor(n_estimators=500, random_state=0)
    # gbr.fit(x_train, y_train)
    # # print('Training R^2:', r2_score(yTrain, gbr.predict(dfTrain)))
    # print('Test R^2:', r2_score(y_test, gbr.predict(x_test)))
elif CONFIG['TYPE'] == 'BINARY':
    x_train_cp = x_train.copy()
    x_test_cp = x_test.copy()
    # Encode the target so that the positive class is 1. Compare against
    # POS_CLASS rather than CONFIG['POS_CLASS']: for BRCG the target column has
    # already been normalised to 0/1, and comparing those values with the
    # configured label would turn every sample into the negative class.
    y_train_cp = (y_train == POS_CLASS).astype(int)
    y_test_cp = (y_test == POS_CLASS).astype(int)
    categorical_features = x_train_cp.select_dtypes(include=['object']).columns
    print(categorical_features)
    for col in categorical_features:
        # The encoder is fitted on the full frame, so every category that
        # occurs in either split is known when transforming.
        label_encoder = LabelEncoder().fit(df[col])
        x_train_cp[col] = label_encoder.transform(x_train_cp[col])
        x_test_cp[col] = label_encoder.transform(x_test_cp[col])
    xgb_model = XGBClassifier(use_label_encoder=False)
    xgb_model.fit(x_train_cp, y_train_cp)
    y_pred = xgb_model.predict(x_test_cp)

    print('Accuracy:', accuracy_score(y_test_cp, y_pred))
    print('Balanced accuracy:', balanced_accuracy_score(y_test_cp, y_pred))
    print('Precision:', precision_score(y_test_cp, y_pred, pos_label=1))
    print('Recall:', recall_score(y_test_cp, y_pred, pos_label=1))



Index(['Gender', 'Status', 'Car Owner', 'Paymethod', 'RatePlan'], dtype='object')
Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Accuracy: 0.9611111111111111
Balanced accuracy: 0.9586647727272728
Precision: 0.9585253456221198
Recall: 0.9454545454545454


### Binarization

In [8]:
# Turn the raw features into the representation the rule learner consumes.
# The binarizer is always fitted on the training split only and then applied
# to both splits.
if CONFIG['BINARIZER'] == 'TREES':
    # Tree-based thresholds: fitting uses the labels as well as the features.
    binarizer = FeatureBinarizerFromTrees(negations=True, randomState=42)
    binarizer = binarizer.fit(x_train, y_train)
    x_train_bin = binarizer.transform(x_train)
    x_test_bin = binarizer.transform(x_test)
elif CONFIG['BINARIZER'] == 'QUANTILE':
    # Unsupervised binarization with numThresh=9 thresholds per feature.
    binarizer = FeatureBinarizer(numThresh=9, negations=True)
    binarizer = binarizer.fit(x_train)
    x_train_bin = binarizer.transform(x_train)
    x_test_bin = binarizer.transform(x_test)
elif CONFIG['BINARIZER'] == 'NATIVE':
    # The algorithm handles raw features itself; pass them through unchanged.
    x_train_bin = x_train
    x_test_bin = x_test
else:
    # Stop here: continuing would either fail on an undefined x_train_bin or
    # silently reuse a stale one left in the kernel by an earlier run.
    raise ValueError(f"Unrecognized binarizer: {CONFIG['BINARIZER']!r}")

x_train_bin.info() # verbose=True
x_train_bin.head()
#x_train_bin[CONFIG['EXAMPLE_FEATURE']][:10]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1259 entries, 513 to 1126
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      1259 non-null   object 
 1   Status      1259 non-null   object 
 2   Children    1259 non-null   float64
 3   Est Income  1259 non-null   float64
 4   Car Owner   1259 non-null   object 
 5   Age         1259 non-null   float64
 6   Paymethod   1259 non-null   object 
 7   Usage       1259 non-null   float64
 8   RatePlan    1259 non-null   object 
dtypes: float64(4), object(5)
memory usage: 98.4+ KB


Unnamed: 0,Gender,Status,Children,Est Income,Car Owner,Age,Paymethod,Usage,RatePlan
513,F,M,2.0,98217.1,N,44.98,CC,42.74,4
81,F,S,1.0,12686.0,N,64.8,Auto,103.88,2
596,F,M,2.0,8204.72,N,52.973333,CH,39.79,2
380,F,S,0.0,27498.1,N,52.646667,CC,12.08,4
1523,F,M,1.0,8073.11,N,52.646667,Auto,89.05,3


### Rule Induction

In [9]:
# Fit the configured rule learner on the (binarized) training data and report
# the wall-clock training time.
start_time = time.time()
print('Starting training for', CONFIG['ALGO'])

if CONFIG['ALGO'] == 'BRCG':
    estimator = BooleanRuleCG() # Explainer()
    # Warnings raised during fitting are suppressed to keep the output readable.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        estimator.fit(x_train_bin, y_train)
elif CONFIG['ALGO'] == 'RIPPER':
    estimator = Ripper()
    estimator.fit(x_train_bin, y_train, pos_value=POS_CLASS)
elif CONFIG['ALGO'] == 'Witt_RIPPER':
    estimator = lw.RIPPER()
    # wittgenstein names the positive-label argument 'pos_class' (not
    # 'pos_value' as the aix360i Ripper does).
    estimator.fit(x_train_bin, y_train, class_feat=CONFIG["TARGET_LABEL"], pos_class=POS_CLASS)
elif CONFIG['ALGO'] == 'GLRM':
    linear_model = LinearRuleRegression() # lambda0=0.0005,lambda1=0.0001
    # Note: this branch defines 'explainer' (not 'estimator'); the evaluation
    # cell uses 'explainer' for the CONTINUOUS problem type.
    explainer = GLRMExplainer(linear_model)
    explainer.fit(x_train_bin, y_train)
elif CONFIG['ALGO'] == 'CORELS':
    estimator = CorelsClassifier(n_iter=10000,
                     max_card=2, # features per statement
                     c = 0.0001 # Higher values penalise longer rulelists
                    )
    estimator.fit(x_train_bin, y_train , prediction_name = CONFIG["TARGET_LABEL"])
elif CONFIG['ALGO'] == 'R2N':
    estimator = algo.R2Nalgo(n_seeds=3, max_epochs=100, decay_rate=0.998, coef = 10**-3, normalize_num=False)
    estimator.fit(x_train_bin, y_train)
else:
    # Fail loudly instead of reporting a training time for nothing and letting
    # later cells run against a missing or stale estimator.
    raise ValueError(f"Unrecognized algorithm: {CONFIG['ALGO']!r}")

end_time = time.time()
print('Training time: ' + str(end_time - start_time))

Starting training for RIPPER
Training time: 0.3723928928375244


### Evaluation

In [11]:
acc_list = []
prec_list = []
%store -r acc_list
%store -r prec_list

if CONFIG['TYPE'] == 'BINARY':
    y_pred = estimator.predict(x_test_bin)
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred, pos_label=POS_CLASS))
    print('Recall:', recall_score(y_test, y_pred, pos_label=POS_CLASS))
    print('F1', f1_score(y_test, y_pred, pos_label=POS_CLASS))
    print('ConfusionMatrix', confusion_matrix(y_test, y_pred))
    print('F-2', fbeta_score(y_test, y_pred, pos_label=POS_CLASS, beta= 2))
    print('Mathhews', matthews_corrcoef(y_test, y_pred))


    acc_list.append(recall_score(y_test, y_pred, pos_label=POS_CLASS))
    prec_list.append(precision_score(y_test, y_pred, pos_label=POS_CLASS))
    %store acc_list
    %store prec_list
   
elif CONFIG['TYPE'] == 'CONTINUOUS':
    y_pred = explainer.predict(x_test_bin)
    print(f'R2 Score = {r2_score(y_test, y_pred)}')
    print(f'Explained Variance = {explained_variance_score(y_test, y_pred)}')
    print(f'Mean abs. error = {mean_absolute_error(y_test, y_pred)}')
    print(f'Max error = {max_error(y_test, y_pred)}')
    


Accuracy: 0.924074074074074
Balanced accuracy: 0.9174715909090909
Precision: 0.9282296650717703
Recall: 0.8818181818181818
F1 0.9044289044289043
ConfusionMatrix [[305  15]
 [ 26 194]]
F-2 0.8907254361799816
Mathhews 0.8422792703169054
Stored 'acc_list' (list)
Stored 'prec_list' (list)


In [12]:
# Print the learned model in a form that suits the algorithm that produced it.
if CONFIG['TYPE'] == 'CONTINUOUS':
    explanation = explainer.explain()
    print(explanation)
elif CONFIG['ALGO'] == 'BRCG':
    model = estimator.explain()
    # Rules are only printed when the model is not flagged as CNF.
    if not model['isCNF']:
        print('Number of rules:', len(model['rules']))
        print(model['rules'])
elif CONFIG['ALGO'] == 'RIPPER':
    # print('Rule count: ' + str(sum([len(rules) for rules in estimator.rule_map.values()])))
    # print('Rule set:')
    # print(estimator.rule_list_to_pretty_string())

    # Flatten the exported rule set into one dict per conjunction, keyed by
    # '<feature><relation>' with the predicate's value.
    rule_set = estimator.export_rules_to_trxf_dnf_ruleset(POS_CLASS)
    conjunctions = rule_set.conjunctions
    rule_set_list = [
        {str(p.feature) + str(p.relation): p.value for p in c.predicates}
        for c in conjunctions
    ]
    print(rule_set_list)
    rule_set_df = pd.DataFrame(rule_set_list)
    print(rule_set_df.head())
elif CONFIG['ALGO'] == 'CORELS':
    corels_rules = estimator.rl().rules
    r_length = len(corels_rules)
    print("Rule Length:", r_length)
    # Report every rule. The loop used to be bounded by the number of
    # antecedents of the first rule while indexing into the rule list, so it
    # covered the wrong rules and could run past the end of the list.
    for i, rule in enumerate(corels_rules):
        print(f"Antecedents Length Rule {i}:", len(rule["antecedents"]))
elif CONFIG['ALGO'] == 'Witt_RIPPER':
    print("Rule Length:", len(estimator.ruleset_))

# uncomment the following line for a full optimized view of the model as data frame for GLRM rules
# explanation.style

[{'UsageRelation.GE': 74.77, 'UsageRelation.LE': 74.77, 'RatePlanRelation.EQ': '4'}, {'StatusRelation.EQ': 'S', 'Est IncomeRelation.LE': 25285.5, 'PaymethodRelation.EQ': 'CC', 'GenderRelation.EQ': 'F'}, {'GenderRelation.EQ': 'M', 'Est IncomeRelation.GE': 63234.5, 'AgeRelation.GE': 39.093333, 'AgeRelation.LE': 53.28}, {'AgeRelation.LE': 25.0, 'UsageRelation.GE': 82.08, 'Est IncomeRelation.LE': 43634.5}, {'StatusRelation.EQ': 'S', 'Est IncomeRelation.LE': 42000.0, 'GenderRelation.EQ': 'F', 'PaymethodRelation.EQ': 'Auto'}, {'AgeRelation.GE': 53.693333, 'GenderRelation.EQ': 'M', 'Est IncomeRelation.GE': 75799.9}, {'AgeRelation.LE': 25.0, 'ChildrenRelation.LE': 1.0, 'GenderRelation.EQ': 'F', 'UsageRelation.LE': 98.76}, {'PaymethodRelation.EQ': 'CH', 'RatePlanRelation.EQ': '1', 'AgeRelation.LE': 40.313333, 'StatusRelation.EQ': 'S'}, {'RatePlanRelation.EQ': '4', 'AgeRelation.GE': 48.373333, 'ChildrenRelation.LE': 0.0, 'Est IncomeRelation.GE': 77129.1}, {'UsageRelation.GE': 46.32, 'Est IncomeR