## Example

In [None]:
import pandas as pd, numpy as np, sys
from sklearn.ensemble import (RandomForestClassifier, 
                              ExtraTreesClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import OneHotEncoder
from sklearn.datasets import fetch_openml
from itertools import product
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.ticker import PercentFormatter
from AssoruleMining import *

In [None]:
# Global matplotlib style for every figure in this notebook:
# thicker lines, no right/top spines, no background grid.
mpl.rcParams.update({
    'lines.linewidth': 2,
    'axes.spines.right': False,
    'axes.spines.top': False,
    'axes.grid': False,
})

In [None]:
def plot_results(train, test, show_markers=False):
    """Plot train/test precision and recall against the number of rules.

    Parameters
    ----------
    train, test : mapping of str to array-like (e.g. pandas.DataFrame)
        Must provide "precision" and "recall" entries. Both inputs are
        expected to have the same length, one entry per number of rules.
    show_markers : bool, default=False
        If True, draw a point marker on every line ("o" for precision,
        "s" for recall). The default keeps the original marker-free look.

    Returns
    -------
    ax : matplotlib.axes.Axes
        Axes holding the four curves and the two shaded train/test gaps.
    """
    fig, ax = plt.subplots(figsize=(7,5))

    # 1-based rule count. It is used for BOTH the curves and the shaded
    # gaps so that they always line up, whatever index `train`/`test` carry.
    x = np.arange(len(train)) + 1

    # Common plotting style
    base_style = dict(lw=2, solid_capstyle='round', ms=7)
    metrics = [("Precision", "#1B9CFC", "o"),
               ("Recall", "#FC427B", "s")]

    for metric, color, marker in metrics:
        key = metric.lower()
        y_train = np.asarray(train[key])
        y_test = np.asarray(test[key])

        # Train (solid) first, then test (dashed), then the gap, so the
        # legend keeps one column per metric with ncol=2.
        for values, which, ls in ((y_train, "Train", "-"),
                                  (y_test, "Test", "--")):
            ax.plot(x, values, color=color, ls=ls,
                    marker=marker if show_markers else None,
                    label="{} ({})".format(metric, which), **base_style)
        ax.fill_between(x, y_train, y_test, color=color, alpha=0.2,
                        label="{} Gap".format(metric))

    ax.set_ylabel("Precision & Recall", fontsize=13, fontweight=1000)
    ax.set_xlabel("Number of Rules", fontsize=13, fontweight=1000)
    ax.legend(loc="best", fontsize=12, framealpha=0, ncol=2,
              columnspacing=0.5, handletextpad=0.8,
              labelspacing=0.4, handlelength=1.5)
    # Values are fractions in [0, 1]; show them as percentages.
    ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))
    ax.tick_params(axis='both', labelsize=13)
    fig.tight_layout()

    return ax

Target: class (">50K" vs "<=50K")

In [None]:
# Load the OpenML "adult" dataset (version 2) as a DataFrame and drop the
# "fnlwgt" and "education-num" columns.
X = (
    fetch_openml("adult", version=2, as_frame=True)
    .frame
    .drop(columns=["fnlwgt", "education-num"])
)
# Remove the "class" column from X and binarise it: 1 for ">50K", else 0.
y = np.where(X.pop("class").values == ">50K", 1, 0)

In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
# Summary of the columns: dtypes and non-null counts.
X.info()

Convert the categorical variables to numerical ones via one-hot encoding; missing values get their own `missing` category.

In [None]:
# Names of all categorical columns; these are the ones to one-hot encode.
cat = list(X.select_dtypes(include=["category"]))

# Replace NaN by an explicit 'missing' level in every categorical column.
# X is modified in place, so the membership guard is what keeps this cell
# re-runnable: add_categories raises ValueError if the level already exists.
for c in cat:
    if 'missing' not in X[c].cat.categories:
        X[c] = X[c].cat.add_categories('missing')
    X[c] = X[c].fillna('missing')

enc = OneHotEncoder(handle_unknown='ignore').fit(X[cat])

# One "<column> (<category>)" label per encoded column, in the same order
# as the encoder's categories_.
columns = ["{} ({})".format(c, v)
           for c, values in zip(cat, enc.categories_)
           for v in values]

In [None]:
# Dense 0/1 indicator frame for the encoded categorical columns. It reuses
# X's index so that the index-based merge below pairs every row with its
# own encoding even when X is not indexed 0..n-1.
cat_df = pd.DataFrame(enc.transform(X[cat]).toarray().astype(int),
                      columns=columns, index=X.index)
# Non-categorical columns followed by the one-hot columns, aligned on index.
cat_X  = X.drop(columns=cat).merge(cat_df, left_index=True, right_index=True)
# Every row must survive the merge, otherwise `y` would no longer line up.
assert len(cat_X) == len(X), "row count changed while merging encoded columns"

In [None]:
# Hold out 30% of the rows as a test set; the shuffle is seeded so the
# split is reproducible.
Xt_train, Xt_test, yt_train, yt_test = tts(
    cat_X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=0,
)

## Creation of rules
- Extract the best path in tree that satisfies criteria.
- If **`exclude=True`**, the training samples under the previously selected leaf node (path) are removed from the training set after each iteration, before the next rule is determined. If **`exclude=False`**, it instead changes their target to non-target (i.e. 1 to 0), keeping the sample size the same.
- This approach stops when the evaluating metric is deemed satisfactory (`max_iter`).

In [None]:
# print(TreeRuleMining.__doc__)

### Example 1: **`DecisionTreeClassifier`**

In [None]:
# Single decision tree. Every feature is a split candidate (max_features
# equals the number of columns), each leaf must hold at least 1% of the
# samples, and classes are re-weighted to be balanced.
kwds = {
    "max_depth": None,
    "max_features": Xt_train.shape[1],
    "random_state": 0,
    "min_samples_leaf": 0.01,
    "class_weight": "balanced",
}
Tree1 = DecisionTreeClassifier(**kwds)

In [None]:
# Configure the rule miner around the single tree, then fit it on the
# training split. The value returned by fit() is what model1 ends up
# bound to, exactly as with a chained call.
model1 = TreeRuleMining(Tree1, exclude=True, metric="recall", max_iter=50)
model1 = model1.fit(Xt_train, yt_train)

Apply selected rules on `X` and evaluate rule performance against `y`.

In [None]:
# Score the mined rules on both splits. The results are indexed by
# "precision" and "recall" in plot_results.
# NOTE(review): cumulative=True presumably reports metrics after applying
# the first k rules together; confirm in TreeRuleMining.evaluate.
eval_train1 = model1.evaluate(Xt_train, yt_train, cumulative=True)
eval_test1  = model1.evaluate(Xt_test , yt_test , cumulative=True)

In [None]:
# Preview the first rows of the training evaluation.
eval_train1.head()

Example of visualizations using **`plot_results`**

In [None]:
# Assign the returned Axes to `_` so only the figure is shown as cell output.
_ = plot_results(eval_train1, eval_test1)

See all selected rules and their subrules using **`print_rule`**.

In [None]:
# Print the first `n_rules` rules, each under its "Rule_<k>" heading and
# followed by a blank line.
n_rules = 5
for n in range(1, n_rules + 1):
    key = f"Rule_{n}"
    print(key)
    print_rule(model1.rules[key])
    print()

Use **`transform`** to convert the selected rules into a feature array.

In [None]:
# Preview the rule-based features for the first `n_rules` rules.
# NOTE(review): presumably one column per rule; confirm in TreeRuleMining.transform.
model1.transform(Xt_train, n_rules).head()

Summary on `Xt_train` using **`print_stats`**

In [None]:
# A training sample is predicted positive when the row-wise sum of its
# rule features (first `n_rules` rules) is greater than zero.
yt_pred_train = model1.transform(Xt_train, n_rules).sum(axis=1) > 0
print_stats(yt_train, yt_pred_train)

In [None]:
# Same decision on the held-out split: positive when the row-wise sum of
# the rule features (first `n_rules` rules) is greater than zero.
yt_pred_test = model1.transform(Xt_test, n_rules).sum(axis=1) > 0
print_stats(yt_test, yt_pred_test)

### Example 2: **`RandomForestClassifier`**

In [None]:
# Random forest of 20 bootstrapped trees. Each split considers
# sqrt(n_features) candidates and each leaf must hold at least 1% of the
# samples.
kwds = {
    "n_estimators": 20,
    "max_depth": None,
    "min_samples_leaf": 0.01,
    "max_features": "sqrt",
    "random_state": 0,
    "bootstrap": True,
}
Tree2 = RandomForestClassifier(**kwds)

In [None]:
# Configure the rule miner around the random forest, then fit it on the
# training split. The value returned by fit() is what model2 ends up
# bound to.
model2 = TreeRuleMining(Tree2, exclude=True, metric="recall", max_iter=50)
model2 = model2.fit(Xt_train, yt_train)

In [None]:
# Score the random-forest rules on both splits, using the same call as for
# the single tree.
eval_train2 = model2.evaluate(Xt_train, yt_train, cumulative=True)
eval_test2  = model2.evaluate(Xt_test , yt_test , cumulative=True)

In [None]:
# Train vs. test precision/recall for the random-forest rules.
_ = plot_results(eval_train2, eval_test2)

### Example 3: **`ExtraTreesClassifier`** 

In [None]:
# Extra-trees ensemble of 20 bootstrapped trees with the Gini criterion.
# monotonic_cst holds one entry per column, all set to 1, i.e. the same
# constraint is requested for every feature.
kwds = {
    "n_estimators": 20,
    "criterion": "gini",
    "max_depth": None,
    "min_samples_leaf": 0.01,
    "max_features": "sqrt",
    "random_state": 0,
    "bootstrap": True,
    "monotonic_cst": [1] * Xt_train.shape[1],
}
Tree3 = ExtraTreesClassifier(**kwds)

In [None]:
# Configure the rule miner around the extra-trees ensemble, then fit it on
# the training split. The value returned by fit() is what model3 ends up
# bound to.
model3 = TreeRuleMining(Tree3, exclude=True, metric="recall", max_iter=50)
model3 = model3.fit(Xt_train, yt_train)

In [None]:
# Score the extra-trees rules on both splits, using the same call as for
# the other models.
eval_train3 = model3.evaluate(Xt_train, yt_train, cumulative=True)
eval_test3  = model3.evaluate(Xt_test , yt_test , cumulative=True)

In [None]:
# Train vs. test precision/recall for the extra-trees rules.
_ = plot_results(eval_train3, eval_test3)