 # Examples for Generating Data and Applying SPH+CPI

## Imports

In [1]:
# First, do the imports that we need for the examples
import random
import numpy as np

from anytree import RenderTree
import warnings
warnings.simplefilter("ignore")
from DataGenerator import ImbalanceGenerator
from Hierarchy import HardCodedHierarchy
from Utility import train_test_splitting, get_train_test_X_y, update_data_and_training_data
from SPH_CPI import SPHandCPI, RandomForestBorutaMethod

Level-0;Engine[n_samples=1050, n_classes=84, classes=(1, 84)]
Level-1;Gasoline[n_samples=773, n_classes=58, classes=(27, 84)]
Level-2;GE-OM3[n_samples=573, n_classes=54, classes=(31, 84)]
Level-3;GE-OM3-13[n_samples=309, n_classes=40, classes=(45, 84), class_occurences=[95, 48, 45, 35, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 33, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4]]
Level-3;GE-OM3-12[n_samples=27, n_classes=7, classes=(72, 78), class_occurences=[10, 9, 1, 1, 2, 2, 2]]
Level-3;GE-OM3-11[n_samples=8, n_classes=2, classes=(71, 72), class_occurences=[7, 1]]
Level-3;GE-OM3-10[n_samples=12, n_classes=9, classes=(65, 73), class_occurences=[4, 1, 1, 1, 1, 1, 1, 1, 1]]
Level-3;GE-OM3-9[n_samples=42, n_classes=10, classes=(63, 72), class_occurences=[20, 10, 3, 3, 1, 1, 1, 1, 1, 1]]
Level-3;GE-OM3-8[n_samples=16, n_classes=3, classes=(65, 67), class_occurences=[10, 5, 1]]
Level-3;GE-OM3-7[n_samples=22, n_classes=5, classes=(60, 64), class_occurences=[9, 6, 3, 2, 2]]
Le

In [2]:
# The data generation draws random numbers, so we pin the seeds of both random number
# generators (Python's `random` module and NumPy) to make the results reproducible.
random.seed(0)
np.random.seed(0)

## Data Generation

In [3]:
# Data generation settings that we used in the paper.
n_features, n_samples, n_classes = 100, 1050, 84

# Degree of class imbalance. Must be one of:
# "very_balanced", "balanced", "medium", "imbalanced", "very_imbalanced"
imb_degree = "medium"

# Root of the hard-coded hierarchy; it is handed to the generator as `root`.
root = HardCodedHierarchy().create_hardcoded_hierarchy()

In [4]:
# Create the imbalance generator from the settings above. Every argument that is not
# listed here keeps its default value.
generator = ImbalanceGenerator(
    n_features=n_features,
    n_samples_total=n_samples,
    total_n_classes=n_classes,
    imbalance_degree=imb_degree,
    root=root,
)

In [5]:
# Generate the data. The result is a dataframe in which
#   * the actual features are stored in the columns F0, F1, ..., F{n_features - 1},
#   * the classes are stored in the column "target", and
#   * additional attributes describe the different levels of the hierarchy.
df = generator.generate_data_with_product_hierarchy()

# Column groups used for the preview below.
hierarchy_features = ["level-0", "level-1", "level-2", "group"]
class_label = ["target"]  # the class label is encoded as the "target" column
numeric_features = [f"F{i}" for i in range(n_features)]

# Show the first rows with the hierarchy-specific columns first, then the class label,
# then the numeric features. This only affects the preview; `df` itself is not reordered.
preview_columns = hierarchy_features + class_label + numeric_features
df[preview_columns].head()

Level-0;Engine[n_samples=1050, n_classes=84, classes=(1, 84)]
Level-1;Diesel[n_samples=277, n_classes=60, classes=(1, 60)]
Level-1;Gasoline[n_samples=773, n_classes=58, classes=(27, 84)]
Level-2;DE-OM1[n_samples=96, n_classes=28, classes=(1, 28)]
Level-2;DE-OM2[n_samples=130, n_classes=41, classes=(5, 45)]
Level-2;DE-OM3[n_samples=51, n_classes=12, classes=(41, 52)]
Level-2;GE-OM1[n_samples=200, n_classes=43, classes=(27, 69)]
Level-2;GE-OM3[n_samples=573, n_classes=54, classes=(31, 84)]
features that are currently not used: []


Unnamed: 0,level-0,level-1,level-2,group,target,F0,F1,F2,F3,F4,...,F90,F91,F92,F93,F94,F95,F96,F97,F98,F99
0,Engine,Diesel,DE-OM1,DE-OM1-2,4,,0.719088,0.678771,,,...,,1.0,0.0,0.793774,1.0,0.957549,,,0.412513,
1,Engine,Diesel,DE-OM1,DE-OM1-2,2,,0.739794,0.881237,,,...,0.429717,0.065778,0.406139,0.428853,0.224817,0.309506,,0.753438,0.0,
2,Engine,Diesel,DE-OM1,DE-OM1-2,1,,0.611839,0.291793,,,...,0.585255,0.330236,0.718082,0.3434,0.7845,0.227665,,0.548237,0.444239,
3,Engine,Diesel,DE-OM1,DE-OM1-2,3,,0.462704,,,,...,0.56485,0.0,0.447254,0.0,0.328744,0.103987,,0.285159,0.478487,
4,Engine,Diesel,DE-OM1,DE-OM1-2,2,,0.164762,0.582491,,,...,,0.030291,1.0,0.418368,0.0,0.598347,,0.0,0.079998,


In [6]:
# The generated hierarchy is available through the `root` attribute of the generator.
hierarchy_root = generator.root

# Render the whole tree below that root and print it.
rendered_hierarchy = RenderTree(hierarchy_root)
print(rendered_hierarchy)

Level-0;Engine[n_samples=1050, n_classes=84, classes=(1, 84)]
├── Level-1;Diesel[n_samples=277, n_classes=60, classes=(1, 60)]
│   ├── Level-2;DE-OM1[n_samples=96, n_classes=28, classes=(1, 28)]
│   │   ├── Level-3;DE-OM1-2[n_samples=10, n_classes=4, classes=(1, 4), class_occurences=[2, 3, 2, 3]]
│   │   ├── Level-3;DE-OM1-3[n_samples=37, n_classes=10, classes=(5, 14), class_occurences=[8, 5, 3, 3, 3, 3, 3, 3, 3, 3]]
│   │   ├── Level-3;DE-OM1-4[n_samples=15, n_classes=5, classes=(15, 19), class_occurences=[6, 4, 3, 1, 1]]
│   │   ├── Level-3;DE-OM1-5[n_samples=22, n_classes=5, classes=(20, 24), class_occurences=[10, 5, 5, 1, 1]]
│   │   └── Level-3;DE-OM1-6[n_samples=12, n_classes=4, classes=(25, 28), class_occurences=[1, 1, 7, 3]]
│   ├── Level-2;DE-OM2[n_samples=130, n_classes=41, classes=(5, 45)]
│   │   ├── Level-3;DE-OM2-1[n_samples=52, n_classes=20, classes=(5, 24), class_occurences=[6, 5, 4, 3, 3, 3, 4, 3, 3, 4, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1]]
│   │   ├── Level-3;DE-OM2-2[n_samp

## Preparing the data

In [7]:
# Train/test split: 75% of all samples are used for training, the rest for testing.
train_fraction = 0.75
n_train_samples = int(train_fraction * n_samples)
df_train, df_test = train_test_splitting(df, n_train_samples=n_train_samples)

number of classes that have only one sample: {}
Test data contains: Counter({(45, 'GE-OM3-13'): 24, (48, 'GE-OM3-13'): 11, (47, 'GE-OM3-13'): 11, (46, 'GE-OM3-13'): 10, (52, 'GE-OM1-3'): 9, (60, 'GE-OM3-13'): 6, (63, 'GE-OM3-9'): 5, (50, 'GE-OM3-6'): 5, (54, 'GE-OM1-4'): 4, (56, 'GE-OM1-4'): 4, (31, 'GE-OM3-4'): 4, (32, 'DE-OM2-5'): 3, (57, 'GE-OM1-4'): 3, (60, 'GE-OM3-7'): 3, (48, 'GE-OM3-5'): 3, (71, 'GE-OM3-11'): 3, (20, 'DE-OM1-5'): 3, (45, 'DE-OM2-6'): 3, (72, 'GE-OM3-12'): 3, (43, 'DE-OM3-3'): 3, (65, 'GE-OM3-8'): 3, (44, 'DE-OM3-3'): 3, (5, 'DE-OM2-1'): 3, (69, 'GE-OM1-7'): 2, (55, 'GE-OM1-4'): 2, (61, 'GE-OM3-7'): 2, (64, 'GE-OM3-9'): 2, (15, 'DE-OM1-4'): 2, (6, 'DE-OM1-3'): 2, (31, 'GE-OM3-1'): 2, (36, 'DE-OM2-5'): 2, (73, 'GE-OM3-12'): 2, (58, 'GE-OM1-4'): 2, (41, 'GE-OM3-4'): 2, (64, 'GE-OM1-6'): 2, (35, 'DE-OM2-5'): 2, (51, 'GE-OM3-5'): 2, (37, 'GE-OM1-3'): 2, (47, 'DE-OM3-3'): 2, (34, 'GE-OM1-2'): 2, (44, 'GE-OM1-3'): 2, (33, 'GE-OM3-4'): 2, (46, 'DE-OM3-3'): 2, (33, 'DE-O

In [8]:
# Derive X_train / X_test and y_train / y_test from the two dataframes.
X_train, X_test, y_train, y_test = get_train_test_X_y(
    df_train=df_train,
    df_test=df_test,
    n_features=n_features,
)

In [9]:
# Update the training data in our hierarchy model, using only the training split (df_train).
# `root` is rebound to the returned value, so the later cells that read `root`
# (e.g., the SPH+CPI instantiation) work with the updated hierarchy.
root = update_data_and_training_data(root, df_train, n_features=n_features)

## Applying RandomForest and Boruta (RF+B) as Baseline

In [None]:
# Instantiate the baseline (Random Forest + Boruta) and fit it on the training data.
# Note: fitting might take some time, because Boruta performs 100 iterations by default
# (see the "Iteration: k / 100" progress output below).
rf_b = RandomForestBorutaMethod()
rf_b.fit(X_train, y_train)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	100
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	100
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	100
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	100
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	100
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	100
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	100
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	19
Tentative: 	48
Rejected: 	33
Iteration: 	9 / 100
Confirmed: 	19
Tentative: 	48
Rejected: 	33
Iteration: 	10 / 100
Confirmed: 	19
Tentative: 	48
Rejected: 	33
Iteration: 	11 / 100
Confirmed: 	19
Tentative: 	48
Rejected: 	33
Iteration: 	12 / 100
Confirmed: 	22
Tentative: 	36
Rejected: 	42
Iteration: 	13 / 100
Confirmed: 	22
Tentative: 	36
Rejected: 	42
Iteration: 	14 / 100
Confirmed: 	22
Tentative: 	36
Rejected: 	42
Iteration: 	15 / 100
Confirmed: 	22
Tentative: 	36
Rejected: 	42
Iteration: 	16 / 100
Confirmed: 	22
Tenta

In [None]:
# Look at the top-k accuracy of the Random Forest baseline on the test data.
# First predict the test samples, then fetch the accuracy table; the bare call on the
# last line makes the table the displayed result of this cell.
# NOTE(review): presumably the accuracy table is built from the predictions of the
# previous call, so keep this call order.
rf_b.predict_test_samples(df_test)
rf_b.get_accuracy_per_e_df()

## Applying SPH and CPI

In [None]:
# Instantiate SPH+CPI with the default parameters from Section 5 of the paper and hand it
# the hierarchy (`root`).
sph_cpi = SPHandCPI(
    max_info_loss=0.25,
    gini_threshold=0.3,
    p_threshold=0.8,
    hierarchy=root,
)

# Fit the model. The output shows where and when surrogate models are used, and how CPI
# partitions the data.
# Note: some surrogates may be reported with less than 25% info loss. This indicates that
# the relevant sample subset contains only one class.
sph_cpi.fit(X_train, y_train)

In [None]:
# Predict the test samples. We pass df_test and not y_test, because SPH also needs the
# hierarchy information, i.e., the specific product group of each sample, in order to
# search the model repository for the appropriate model.
sph_cpi.predict_test_samples(df_test=df_test)
# Fetch the accuracy table of SPH+CPI; the bare `acc_df` on the last line displays it
# as the result of this cell.
acc_df = sph_cpi.get_accuracy_per_e_df()
acc_df