Rules and Rules-Based Systems (RBS) are used extensively in many areas, from fraud detection to predicting customer churn. Despite having many advantages, there are also numerous disadvantages associated with deploying and maintaining an RBS.

In [1]:
!pip install iguanas

Collecting iguanas
  Downloading iguanas-0.1.4-py3-none-any.whl (107 kB)
[K     |████████████████████████████████| 107 kB 5.6 MB/s 
[?25hCollecting category-encoders>=2.0.0
  Downloading category_encoders-2.4.0-py2.py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 2.4 MB/s 
Collecting hyperopt==0.2.5
  Downloading hyperopt-0.2.5-py2.py3-none-any.whl (965 kB)
[K     |████████████████████████████████| 965 kB 33.8 MB/s 
Installing collected packages: hyperopt, category-encoders, iguanas
  Attempting uninstall: hyperopt
    Found existing installation: hyperopt 0.1.2
    Uninstalling hyperopt-0.1.2:
      Successfully uninstalled hyperopt-0.1.2
Successfully installed category-encoders-2.4.0 hyperopt-0.2.5 iguanas-0.1.4


In [3]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 34 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 42.2 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=6d0ee1b3d9d0a74534e51b63215cff6cc039ef2b36d07b75e06c4096751555ca
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [4]:
from iguanas.rule_generation import RuleGeneratorDT
from iguanas.rule_selection import SimpleFilter, CorrelatedFilter, BayesSearchCV
from iguanas.metrics import FScore, JaccardSimilarity
from iguanas.rbs import RBSOptimiser, RBSPipeline
from iguanas.correlation_reduction import AgglomerativeClusteringReducer
from iguanas.pipeline import LinearPipeline
from iguanas.pipeline.class_accessor import ClassAccessor
from iguanas.space import UniformFloat, UniformInteger, Choice

import pandas as pd
from sklearn.model_selection import train_test_split
from category_encoders.one_hot import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

  import pandas.util.testing as tm


In [5]:
# Read in data
df = pd.read_csv(
    '/content/titanic.csv', 
    index_col='PassengerId'
)

# Create feature set and target.
target_col = 'Survived'
X = df.drop(
    target_col, 
    axis=1
)
y = df[target_col]

# Drop unnecessary columns
cols_to_drop = ['Name', 'Ticket', 'Cabin']
X.drop(
    cols_to_drop, 
    axis=1, 
    inplace=True
)

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42
)

In [6]:
# One hot encode
encoder = OneHotEncoder(
    use_cat_names=True
)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

# Impute
X_train.fillna(-1, inplace=True)
X_test.fillna(-1, inplace=True)

In [7]:
# Instantiate F1 score
f1 = FScore(beta=1)
# Instantiate rule generator class
generator = RuleGeneratorDT(
    metric=f1.fit,
    n_total_conditions=4,
    tree_ensemble=RandomForestClassifier(
        n_estimators=10,
        random_state=0
    )
)
# Generate rules using X_train, y_train
X_rules = generator.fit(X_train, y_train)
# Instantiate RBS Pipeline class
rbs_pipeline = RBSPipeline(
    config=[], # Empty list means that the RBSOptimiser will generate a new rule config
    final_decision=0
)
# Instantiate RBS Optimiser class, using the RBS Pipeline above
rbs_optimiser = RBSOptimiser(
    pipeline=rbs_pipeline,
    metric=f1.fit, 
    pos_pred_rules=ClassAccessor(
        class_tag='generator', 
        class_attribute='rule_names'
    ),
    n_iter=10
)
# Optimise the RBS Pipeline using the generated rules
rbs_optimiser.fit(X_rules, y_train)

In [8]:
# Instantiate F1 score
f1 = FScore(beta=1)
# Instantiate rule generator class
generator = RuleGeneratorDT(
    metric=f1.fit,
    n_total_conditions=4,
    tree_ensemble=RandomForestClassifier(
        n_estimators=10,
        random_state=0
    )
)
# Instantiate RBS Pipeline class
rbs_pipeline = RBSPipeline(
    config=[], # Empty list means that the RBSOptimiser will generate a new rule config
    final_decision=0
)
# Instantiate RBS Optimiser class, using the RBS Pipeline above
rbs_optimiser = RBSOptimiser(
    pipeline=rbs_pipeline,
    metric=f1.fit, 
    pos_pred_rules=ClassAccessor(
        class_tag='generator', 
        class_attribute='rule_names'
    ),
    n_iter=10
)
# Create the steps for our linear pipeline
steps = [
    ('generator', generator),
    ('rbs_optimiser', rbs_optimiser)
]
# Instantiate the linear pipeline class
lp = LinearPipeline(steps=steps)

In [9]:
# Define search space for linear pipeline
search_spaces = {
    'generator': {
        'n_total_conditions': UniformInteger(1, 5),
        'target_feat_corr_types': Choice([
            'Infer',
            None
        ])
    },
    'rbs_optimiser': {
        'n_iter': UniformInteger(5, 20)
    }
}

In [10]:
# Instantiate Bayes Search CV class
bs = BayesSearchCV(
    pipeline=lp, 
    search_spaces=search_spaces, 
    metric=f1.fit, 
    cv=3, 
    n_iter=15,
    num_cores=3,
    error_score=0,
    verbose=1
)
# Fit on training data
bs.fit(X_train, y_train)

--- Optimising pipeline parameters ---
100%|██████████| 15/15 [06:30<00:00, 26.07s/trial, best loss: -0.6447698369569125]
--- Refitting on entire dataset with best pipeline ---


In [12]:
# Fit our original pipeline to the training set
lp.fit(X_train, y_train)

In [13]:
# Predict on the test set using our fitted, original pipeline
y_pred_test_init = lp.predict(X_test)
# Predict on the test set using our optimised pipeline
y_pred_test_opt = bs.predict(X_test)
# Calculate F1 score of original pipeline prediction
f1_init = f1.fit(y_pred_test_init, y_test)
# Calculate F1 score of optimised pipeline prediction
f1_opt = f1.fit(y_pred_test_opt, y_test)
# Print results
print(f'F1 score of original pipeline: {round(f1_init, 2)}')
print(f'F1 score of optimised pipeline: {round(f1_opt, 2)}')
print(f'Percentage improvement in F1 score: {round(100*(f1_opt-f1_init)/f1_init, 2)}%')

F1 score of original pipeline: 0.58
F1 score of optimised pipeline: 0.68
Percentage improvement in F1 score: 17.49%


In [14]:
bs.cv_results.head()

Unnamed: 0,Params,generator__n_total_conditions,generator__target_feat_corr_types,rbs_optimiser__n_iter,FoldIdx,Scores,MeanScore,StdDevScore
0,"{'generator': {'n_total_conditions': 2.0, 'tar...",2.0,Infer,12.0,"[0, 1, 2]","[0.6160714285714286, 0.6994535519125683, 0.618...",0.64477,0.038683
8,"{'generator': {'n_total_conditions': 2.0, 'tar...",2.0,Infer,13.0,"[0, 1, 2]","[0.6160714285714286, 0.6994535519125683, 0.618...",0.64477,0.038683
10,"{'generator': {'n_total_conditions': 2.0, 'tar...",2.0,Infer,11.0,"[0, 1, 2]","[0.6160714285714286, 0.6994535519125683, 0.618...",0.64477,0.038683
13,"{'generator': {'n_total_conditions': 2.0, 'tar...",2.0,Infer,11.0,"[0, 1, 2]","[0.6160714285714286, 0.6994535519125683, 0.618...",0.64477,0.038683
6,"{'generator': {'n_total_conditions': 3.0, 'tar...",3.0,Infer,18.0,"[0, 1, 2]","[0.5670498084291188, 0.5795918367346938, 0.603...",0.583415,0.015166


In [16]:
bs.best_score

0.6447698369569125

In [17]:
bs.best_index

0

In [18]:
bs.best_params

{'generator': {'n_total_conditions': 2.0, 'target_feat_corr_types': 'Infer'},
 'rbs_optimiser': {'n_iter': 12.0}}

In [21]:
# Access the rule_strings attribute from the generator step in the optimised pipeline
rule_strings = bs.pipeline_.get_params()
# Access the rules_to_keep attribute from the rbs_optimiser step in the optimised pipeline
rules_to_keep = bs.pipeline_.get_params()
# Filter the generated rules to those remaining after the rbs_optimiser step
rule_strings_remaining = {
    rule_name: rule_string for rule_name, rule_string in rule_strings.items() if rule_name in rules_to_keep
}
# Show the string representation of the rules remaining after the rbs_optimiser step
rule_strings_remaining

{'generator': {'_rule_name_counter': 19,
  '_today': '20220411',
  'infer_dtypes': True,
  'lambda_args': {'RGDT_Rule_20220411_0': [],
   'RGDT_Rule_20220411_1': [],
   'RGDT_Rule_20220411_10': [],
   'RGDT_Rule_20220411_11': [],
   'RGDT_Rule_20220411_12': [],
   'RGDT_Rule_20220411_13': [],
   'RGDT_Rule_20220411_14': [],
   'RGDT_Rule_20220411_15': [],
   'RGDT_Rule_20220411_16': [],
   'RGDT_Rule_20220411_17': [],
   'RGDT_Rule_20220411_18': [],
   'RGDT_Rule_20220411_2': [],
   'RGDT_Rule_20220411_3': [],
   'RGDT_Rule_20220411_4': [],
   'RGDT_Rule_20220411_5': [],
   'RGDT_Rule_20220411_6': [],
   'RGDT_Rule_20220411_7': [],
   'RGDT_Rule_20220411_8': [],
   'RGDT_Rule_20220411_9': []},
  'lambda_kwargs': {'RGDT_Rule_20220411_0': {'Age': -0.125},
   'RGDT_Rule_20220411_1': {'Age': 35.5},
   'RGDT_Rule_20220411_10': {'Fare': 79.025},
   'RGDT_Rule_20220411_11': {'Fare': 79.025},
   'RGDT_Rule_20220411_12': {'Parch': 1},
   'RGDT_Rule_20220411_13': {'Parch': 2},
   'RGDT_Rule_2022