 # Examples for Generating Data and Applying SPH+CPI

## Imports

In [1]:
# First, do the imports that we need for the examples
import random
import numpy as np

from anytree import RenderTree
import warnings
warnings.simplefilter("ignore")
from DataGenerator import ImbalanceGenerator
from Hierarchy import HardCodedHierarchy
from Utility import train_test_splitting, get_train_test_X_y, update_data_and_training_data
from SPH_CPI import SPHandCPI, RandomForestBorutaMethod

In [2]:
# Data Generation is based on randomness, so we should set random seeds so that the results are reproducible
np.random.seed(0)
random.seed(0)

## Data Generation

In [3]:
# Parameters for Data generation we used in the paper
n_features = 100
n_samples = 1050
n_classes = 84
root = HardCodedHierarchy().create_hardcoded_hierarchy()
imb_degree = "medium" # Either one of ["very_balanced", "balanced", "medium", "imbalanced", "very_imbalanced"]

In [5]:
# We instantiate an imbalance generator. We leave the parameters as "default" parameters.
generator = ImbalanceGenerator(n_features=n_features,
                               n_samples_total=n_samples,
                               root=root,
                               imbalance_degree=imb_degree,
                               total_n_classes=n_classes)

In [6]:
# Then we generate the data. The result is a dataframe. The actual features of the dataset are contained in the columns
# F0, F1, ..., F{n_features - 1}
# The classes are contained in the column "target" and we also have specific attributes for the different levels of the hierarchy
df = generator.generate_data_with_product_hierarchy()

# sort the columns to first have the hierarchy-specific features
hierarchy_features = ["level-0", "level-1", "level-2", "group"] 
numeric_features = [f"F{i}" for i in range(n_features)]
# class label is encoded as "target" column
class_label = ["target"]

df[[*hierarchy_features,  *class_label, *numeric_features]].head()

Unnamed: 0,level-0,level-1,level-2,group,target,F0,F1,F2,F3,F4,...,F90,F91,F92,F93,F94,F95,F96,F97,F98,F99
0,Engine,Diesel,DE-OM1,DE-OM1-2,4,,0.719088,0.678771,,,...,,1.0,0.0,0.793774,1.0,0.957549,,,0.412513,
1,Engine,Diesel,DE-OM1,DE-OM1-2,2,,0.739794,0.881237,,,...,0.429717,0.065778,0.406139,0.428853,0.224817,0.309506,,0.753438,0.0,
2,Engine,Diesel,DE-OM1,DE-OM1-2,1,,0.611839,0.291793,,,...,0.585255,0.330236,0.718082,0.3434,0.7845,0.227665,,0.548237,0.444239,
3,Engine,Diesel,DE-OM1,DE-OM1-2,3,,0.462704,,,,...,0.56485,0.0,0.447254,0.0,0.328744,0.103987,,0.285159,0.478487,
4,Engine,Diesel,DE-OM1,DE-OM1-2,2,,0.164762,0.582491,,,...,,0.030291,1.0,0.418368,0.0,0.598347,,0.0,0.079998,


In [8]:
# We can also access the generated hierarchy via the root attribute of the generator instance
hierarchy_root = generator.root
# Then we can also print the hierarchy
print(RenderTree(hierarchy_root))

Engine[n_samples=1050, n_classes=84, classes=(1, 84)]
├── Diesel[n_samples=277, n_classes=60, classes=(1, 60)]
│   ├── DE-OM1[n_samples=96, n_classes=28, classes=(1, 28)]
│   │   ├── DE-OM1-2[n_samples=10, n_classes=4, classes=(1, 4)]
│   │   ├── DE-OM1-3[n_samples=37, n_classes=10, classes=(5, 14)]
│   │   ├── DE-OM1-4[n_samples=15, n_classes=5, classes=(15, 19)]
│   │   ├── DE-OM1-5[n_samples=22, n_classes=5, classes=(20, 24)]
│   │   └── DE-OM1-6[n_samples=12, n_classes=4, classes=(25, 28)]
│   ├── DE-OM2[n_samples=130, n_classes=41, classes=(5, 45)]
│   │   ├── DE-OM2-1[n_samples=52, n_classes=20, classes=(5, 24)]
│   │   ├── DE-OM2-2[n_samples=12, n_classes=9, classes=(25, 33)]
│   │   ├── DE-OM2-3[n_samples=8, n_classes=2, classes=(33, 34)]
│   │   ├── DE-OM2-5[n_samples=43, n_classes=13, classes=(32, 44)]
│   │   └── DE-OM2-6[n_samples=15, n_classes=4, classes=(45, 48)]
│   └── DE-OM3[n_samples=51, n_classes=12, classes=(41, 52)]
│       ├── DE-OM3-1[n_samples=9, n_classes=3, cl

## Preparing the data

In [9]:
# Now we do the train/test split. We want 75% as training samples
df_train, df_test = train_test_splitting(df, n_train_samples=int(0.75*n_samples))

In [10]:
# Transform to X_train, X_test and y_train, y_test
X_train, X_test, y_train, y_test = get_train_test_X_y(df_train=df_train, df_test=df_test, n_features=n_features)

In [11]:
# Update the training data in our hierarchy model
root = update_data_and_training_data(root, df_train, n_features=n_features)

## Applying RandomForest and Boruta (RF+B) as Baseline

In [12]:
# Instantiating RF and Boruta. First, Boruta might take some time as it performs per default 100 iterations.
rf_b = RandomForestBorutaMethod()
rf_b.fit(X_train, y_train)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	100
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	100
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	100
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	100
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	100
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	100
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	100
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	19
Tentative: 	48
Rejected: 	33
Iteration: 	9 / 100
Confirmed: 	19
Tentative: 	48
Rejected: 	33
Iteration: 	10 / 100
Confirmed: 	19
Tentative: 	48
Rejected: 	33
Iteration: 	11 / 100
Confirmed: 	19
Tentative: 	48
Rejected: 	33
Iteration: 	12 / 100
Confirmed: 	22
Tentative: 	36
Rejected: 	42
Iteration: 	13 / 100
Confirmed: 	22
Tentative: 	36
Rejected: 	42
Iteration: 	14 / 100
Confirmed: 	22
Tentative: 	36
Rejected: 	42
Iteration: 	15 / 100
Confirmed: 	22
Tentative: 	36
Rejected: 	42
Iteration: 	16 / 100
Confirmed: 	22
Tenta

RandomForestClassifier(n_estimators=89,
                       random_state=RandomState(MT19937) at 0x1CD8638D540)

In [13]:
# Look at the results of the Random Forest Baseline for the top-k Accuracy
rf_b.predict_test_samples(df_test)
rf_b.get_accuracy_per_e_df()

Unnamed: 0,Method,R_e,A@e,RA@e,Run
0,RF+B,1,0.323194,1.0,1
1,RF+B,2,0.452471,1.285714,1
2,RF+B,3,0.555133,1.60274,1
3,RF+B,4,0.612167,1.826087,1
4,RF+B,5,0.680608,2.145251,1
5,RF+B,6,0.722433,2.368421,1
6,RF+B,7,0.741445,2.487179,1
7,RF+B,8,0.749049,2.543147,1
8,RF+B,9,0.760456,2.64,1
9,RF+B,10,0.771863,2.748768,1


## Applying SPH and CPI

In [14]:
# Now we instantiate SPH+CPI with the default parameters from Section 5 in the paper.
sph_cpi = SPHandCPI(max_info_loss=0.25, gini_threshold=0.3, p_threshold=0.8, hierarchy=root)
# Fit our model.  We see in the output where we use surrogate models and when as well as how CPI partitions the data.
# Note that there might be surrogates with less than 25% info loss. This indicates that there is only one class in the relevant sample subset.
sph_cpi.fit(X_train, y_train)

Using surrogate for DE-OM2-2, which is DE-OM2 with info loss 0.6363636363636364 and 2 class(es)
Using surrogate for DE-OM3-1, which is DE-OM3 with info loss 0.2222222222222222 and 1 class(es)
Using surrogate for GE-OM1-5, which is GE-OM1 with info loss 1.0 and 0 class(es)
Using surrogate for GE-OM1-6, which is GE-OM1 with info loss 0.33333333333333337 and 3 class(es)
Using surrogate for GE-OM1-7, which is GE-OM1 with info loss 0.4 and 1 class(es)
Using surrogate for GE-OM3-1, which is GE-OM3 with info loss 0.16666666666666663 and 1 class(es)
Using surrogate for GE-OM3-5, which is GE-OM3 with info loss 0.2592592592592593 and 3 class(es)
Using surrogate for GE-OM3-10, which is GE-OM3 with info loss 0.7272727272727273 and 1 class(es)
Using surrogate for GE-OM3-11, which is GE-OM3 with info loss 0.19999999999999996 and 1 class(es)
Using surrogate for GE-OM3-12, which is GE-OM3 with info loss 0.26315789473684215 and 2 class(es)
partition min/majority
Counter({44: 11, 41: 7, 43: 5, 48: 5, 42

{'DE-OM1-2': Pipeline(steps=[('imputer', KNNImputer()),
                 ('forest',
                  RandomForestClassifier(n_estimators=200, random_state=1234))]),
 'DE-OM1-3': Pipeline(steps=[('imputer', KNNImputer()),
                 ('forest',
                  RandomForestClassifier(n_estimators=200, random_state=1234))]),
 'DE-OM1-4': Pipeline(steps=[('imputer', KNNImputer()),
                 ('forest',
                  RandomForestClassifier(n_estimators=200, random_state=1234))]),
 'DE-OM1-5': Pipeline(steps=[('imputer', KNNImputer()),
                 ('forest',
                  RandomForestClassifier(n_estimators=200, random_state=1234))]),
 'DE-OM1-6': Pipeline(steps=[('imputer', KNNImputer()),
                 ('forest',
                  RandomForestClassifier(n_estimators=200, random_state=1234))]),
 'DE-OM2-1': Pipeline(steps=[('imputer', KNNImputer()),
                 ('forest',
                  RandomForestClassifier(n_estimators=200, random_state=1234))]),
 'DE

In [15]:
# Predict test samples. We use df_test and not y_test. This is because for SPH we also need the info of the hierarchy,
# i.e., of the specific product group to search the model repository for the appropriate model.
sph_cpi.predict_test_samples(df_test=df_test)
acc_df = sph_cpi.get_accuracy_per_e_df()
acc_df

Unnamed: 0,Method,R_e,A@e,RA@e,Run,max info loss,gini,p value
0,SPH+CPI,1,0.346008,1.0,1,0.25,0.3,0.8
1,SPH+CPI,2,0.551331,1.372414,1,0.25,0.3,0.8
2,SPH+CPI,3,0.631179,1.578313,1,0.25,0.3,0.8
3,SPH+CPI,4,0.69962,1.815217,1,0.25,0.3,0.8
4,SPH+CPI,5,0.752852,2.040404,1,0.25,0.3,0.8
5,SPH+CPI,6,0.771863,2.137931,1,0.25,0.3,0.8
6,SPH+CPI,7,0.790875,2.254808,1,0.25,0.3,0.8
7,SPH+CPI,8,0.806084,2.363208,1,0.25,0.3,0.8
8,SPH+CPI,9,0.825095,2.516129,1,0.25,0.3,0.8
9,SPH+CPI,10,0.836502,2.618182,1,0.25,0.3,0.8
