# Data Generation Example notebook

In this notebook, we show how to use our data generator, what the generated data looks like, and how to change the parameters as well as the taxonomy.

In [1]:
import sys
# Location of the data generator project. The value looks like a placeholder;
# presumably it has to be replaced with the local project directory.
project_path = "\\Path\\to\\Datagenerator\\Project"
# Register the project directory on the module search path, but only if it is
# not listed yet, so that re-running this cell adds no duplicate entries.
if sys.path.count(project_path) == 0:
    sys.path.append(project_path)

In [2]:
from DataGenerator import Generator 
from Taxonomy import EngineTaxonomy
from anytree import RenderTree

## Basic Data Generation

In [3]:
# Build the taxonomy from the paper, i.e., a taxonomy of vehicle engines.
taxonomy = EngineTaxonomy().create_taxonomy()
# Render the taxonomy as a text tree. The node attributes are not defined yet,
# because they are only set during the generation procedure.
# The taxonomy has 3 levels below the root and 25 leaf nodes at the bottom level.
print(str(RenderTree(taxonomy)))

Level-0;Engine
├── Level-1;Diesel
│   ├── Level-2;DE-OM1
│   │   ├── Level-3;DE-OM1-2
│   │   ├── Level-3;DE-OM1-3
│   │   └── Level-3;DE-OM1-4
│   ├── Level-2;DE-OM2
│   │   ├── Level-3;DE-OM2-1
│   │   ├── Level-3;DE-OM2-2
│   │   ├── Level-3;DE-OM2-3
│   │   ├── Level-3;DE-OM2-5
│   │   └── Level-3;DE-OM2-6
│   └── Level-2;DE-OM3
│       ├── Level-3;DE-OM3-1
│       └── Level-3;DE-OM3-3
└── Level-1;Gasoline
    ├── Level-2;GE-OM1
    │   ├── Level-3;GE-OM1-2
    │   ├── Level-3;GE-OM1-3
    │   ├── Level-3;GE-OM1-4
    │   ├── Level-3;GE-OM1-5
    │   ├── Level-3;GE-OM1-6
    │   └── Level-3;GE-OM1-7
    └── Level-2;GE-OM3
        ├── Level-3;GE-OM3-1
        ├── Level-3;GE-OM3-4
        ├── Level-3;GE-OM3-6
        ├── Level-3;GE-OM3-7
        ├── Level-3;GE-OM3-8
        ├── Level-3;GE-OM3-9
        ├── Level-3;GE-OM3-10
        ├── Level-3;GE-OM3-11
        └── Level-3;GE-OM3-12


In [4]:
# Basic parameters of the data set to generate:
# number of instances (rows), number of features, and number of classes.
n_instances, n_features, n_classes = 1000, 20, 30

In [5]:
# Generating data takes two simple steps.
# Step 1: create a Generator object for the taxonomy and the basic parameters.
# NOTE(review): no random seed is set in the visible cells, so the concrete values
# presumably differ between runs — confirm whether Generator supports seeding.
generator = Generator(
    root=taxonomy,
    n=n_instances,
    c=n_classes,
    n_features=n_features,
)
# Step 2: generate the data; the result is a pandas DataFrame.
df = generator.generate_data_from_taxonomy()

The generated data is a pandas DataFrame. It has 'n_features' columns of the form F0, F1, ..., F(n_features - 1). Further, it has one column for each of the upper levels of the taxonomy, i.e., the three columns level-0, level-1, and level-2, and a column "group" that specifies the node at the bottom level of the taxonomy.
Further, the class labels are contained in the column "target". Thus, we have 20 + 3 + 1 + 1 = 25 columns in total.

In [6]:
# Show the generated DataFrame (the notebook renders a truncated preview of it)
df

Unnamed: 0,F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F15,F16,F17,F18,F19,target,group,level-0,level-1,level-2
0,0.812367,0.275718,0.765518,0.992493,1.000000,0.000000,0.723275,0.780657,0.152783,0.756609,...,0.000000,1.000000,0.653087,0.259895,0.080974,0,DE-OM1-2,Engine,Diesel,DE-OM1
1,0.000000,0.435499,0.145054,0.546468,0.412984,0.381898,0.000000,0.677615,1.000000,0.542731,...,0.684861,0.834129,0.343963,0.273741,0.141978,0,DE-OM1-2,Engine,Diesel,DE-OM1
2,0.735516,0.846704,0.571184,1.000000,0.753720,0.508478,0.414665,0.806455,0.816904,1.000000,...,0.416989,0.668614,0.263325,1.000000,0.823119,0,DE-OM1-2,Engine,Diesel,DE-OM1
3,0.534358,0.864196,0.808252,0.801693,0.315922,0.492080,0.708226,0.306688,0.738537,0.429269,...,0.336129,0.510750,0.523287,0.200966,1.000000,0,DE-OM1-2,Engine,Diesel,DE-OM1
4,0.944185,1.000000,0.240285,0.394997,0.159795,0.352899,0.578775,0.317828,0.299892,0.000000,...,0.102840,0.709852,0.575585,0.531915,0.156881,0,DE-OM1-2,Engine,Diesel,DE-OM1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,4.842104,2.612413,2.552446,0.615911,1.249556,1.374147,0.315141,0.408268,1.964119,2.800882,...,0.000000,3.557158,1.639147,0.602726,0.549525,17,GE-OM3-12,Engine,Gasoline,GE-OM3
996,4.225165,2.527960,2.757127,0.627447,1.279827,1.541723,0.235616,0.627328,1.670386,2.483974,...,0.089077,3.204804,1.817256,0.773362,0.482220,22,GE-OM3-12,Engine,Gasoline,GE-OM3
997,4.278069,2.427003,2.814733,0.491698,1.433499,1.318641,0.360069,0.527077,1.515603,2.605730,...,0.442315,3.652745,1.537139,0.563288,0.353962,22,GE-OM3-12,Engine,Gasoline,GE-OM3
998,4.634554,2.406823,2.444012,0.392038,1.295436,1.269595,0.227595,1.000000,1.462916,2.819238,...,0.470972,3.617995,1.189529,0.368504,0.391689,16,GE-OM3-12,Engine,Gasoline,GE-OM3


In [7]:
# Rendering the taxonomy again after the generation shows that each node now
# carries the number of instances and the number of classes as attributes.
print(str(RenderTree(taxonomy)))

Level-0;Engine[n_instances=1000, n_classes=30]
├── Level-1;Diesel[n_instances=346, n_classes=11]
│   ├── Level-2;DE-OM1[n_instances=59, n_classes=4]
│   │   ├── Level-3;DE-OM1-2[n_instances=9, n_classes=2]
│   │   ├── Level-3;DE-OM1-3[n_instances=19, n_classes=2]
│   │   └── Level-3;DE-OM1-4[n_instances=31, n_classes=3]
│   ├── Level-2;DE-OM2[n_instances=107, n_classes=4]
│   │   ├── Level-3;DE-OM2-1[n_instances=8, n_classes=2]
│   │   ├── Level-3;DE-OM2-2[n_instances=13, n_classes=2]
│   │   ├── Level-3;DE-OM2-3[n_instances=14, n_classes=2]
│   │   ├── Level-3;DE-OM2-5[n_instances=27, n_classes=2]
│   │   └── Level-3;DE-OM2-6[n_instances=45, n_classes=4]
│   └── Level-2;DE-OM3[n_instances=180, n_classes=8]
│       ├── Level-3;DE-OM3-1[n_instances=64, n_classes=2]
│       └── Level-3;DE-OM3-3[n_instances=116, n_classes=8]
└── Level-1;Gasoline[n_instances=654, n_classes=30]
    ├── Level-2;GE-OM1[n_instances=207, n_classes=10]
    │   ├── Level-3;GE-OM1-2[n_instances=18, n_classes=2]
  

## Changing Parameters

In the following, we vary the basic parameters and the parameters that we used in our paper. You can also try and change the parameters on your own.

In [8]:
# Basic parameters: more instances, more features, and more classes than before
n_instances, n_features, n_classes = 10000, 50, 50
# Create a new engine taxonomy for this run
taxonomy = EngineTaxonomy().create_taxonomy()

In [9]:
# Parameters that influence the characteristics of the generated data
# (in order: s_C, s_G, cf, gs)
s_C, s_G, cf, gs = 1, 0, 10, 0.5

In [10]:
# Step 1: create a Generator object.
# NOTE(review): the parameters s_C, s_G, cf, and gs defined in the previous cell are not
# passed to this call, so they presumably have no effect here — confirm the matching
# keyword arguments of Generator and pass them if that is intended.
generator = Generator(
    root=taxonomy,
    n=n_instances,
    c=n_classes,
    n_features=n_features,
)
# Step 2: generate the data; the result is a pandas DataFrame.
df = generator.generate_data_from_taxonomy()

In [11]:
# Look at the generated data
# As expected, we have 10000 rows and 55 columns
# (50 feature columns plus "target", "group", and the three level columns)
df

Unnamed: 0,F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F45,F46,F47,F48,F49,target,group,level-0,level-1,level-2
0,0.426214,0.693712,0.580154,0.187844,0.487096,0.293131,0.580435,0.420567,0.832548,0.121053,...,1.612467,0.673739,0.190118,0.681520,0.464186,1,DE-OM1-2,Engine,Diesel,DE-OM1
1,0.160402,0.296788,0.686382,0.390674,0.384022,0.690608,0.102998,0.000000,0.340144,0.539406,...,1.248154,0.549844,0.535073,0.715249,0.282258,0,DE-OM1-2,Engine,Diesel,DE-OM1
2,0.465744,0.547179,0.385961,0.440008,0.492200,0.773925,0.456381,0.569933,0.495814,0.302540,...,1.174385,0.889072,0.662610,0.594579,0.167629,0,DE-OM1-2,Engine,Diesel,DE-OM1
3,0.323060,0.333969,0.337132,0.451348,0.447151,0.686228,0.388783,0.412239,0.759483,0.582549,...,1.693281,0.504161,0.491272,0.422744,0.388607,0,DE-OM1-2,Engine,Diesel,DE-OM1
4,0.411728,0.302272,0.489606,0.230053,0.495352,0.682546,0.642704,0.481362,0.784768,0.795468,...,1.385773,0.693195,0.669271,0.334200,0.369380,0,DE-OM1-2,Engine,Diesel,DE-OM1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.447509,0.598869,0.413995,1.638032,0.532661,0.313223,0.358809,1.437327,0.356673,0.466662,...,2.656612,0.632695,0.466003,0.329913,0.482889,25,GE-OM3-12,Engine,Gasoline,GE-OM3
9996,1.592127,0.775521,0.387977,1.835071,0.615827,0.621083,0.760044,1.289678,0.498132,0.538624,...,2.313382,0.323585,0.152433,0.523434,0.408081,44,GE-OM3-12,Engine,Gasoline,GE-OM3
9997,1.787420,0.641352,0.714267,1.539860,0.425449,0.466943,0.711903,1.470071,0.538166,0.401772,...,2.593091,0.710696,0.578247,0.608358,0.592799,25,GE-OM3-12,Engine,Gasoline,GE-OM3
9998,1.637756,0.759401,0.491937,1.490258,0.537720,0.614198,0.380675,1.561331,0.471795,0.487045,...,2.312102,0.711826,0.536702,0.463894,0.652028,25,GE-OM3-12,Engine,Gasoline,GE-OM3


## Using a custom Taxonomy

In the following, we show how to create a custom Taxonomy with anytree and how to use it with our data generator.

In [12]:
# Build a simple taxonomy with 2 levels below the root, where each inner node has two children.
# Note that our own Node class has to be used, because it contains minor changes
# compared to the Node class of anytree.
from Taxonomy import Node

# Level-0: the root of the taxonomy
taxonomy_root = Node("Root Node")

# Level-1: the two children of the root
child_1, child_2 = (
    Node(label, parent=taxonomy_root) for label in ("child_1", "child_2")
)

# Level-2: two nodes below each Level-1 node
child_1_1, child_1_2 = (
    Node(label, parent=child_1) for label in ("child_1_1", "child_1_2")
)
child_2_1, child_2_2 = (
    Node(label, parent=child_2) for label in ("child_2_1", "child_2_2")
)

In [13]:
# Render the custom taxonomy as a text tree
print(str(RenderTree(taxonomy_root)))

Level-0;Root Node
├── Level-1;child_1
│   ├── Level-2;child_1_1
│   └── Level-2;child_1_2
└── Level-1;child_2
    ├── Level-2;child_2_1
    └── Level-2;child_2_2


In [14]:
# Basic parameters
n_instances, n_features, n_classes = 1000, 20, 30

# Create the generator for the custom taxonomy and generate the data
generator = Generator(
    root=taxonomy_root,
    n=n_instances,
    c=n_classes,
    n_features=n_features,
)
df = generator.generate_data_from_taxonomy()

In [15]:
# Look at the data generated for the custom taxonomy
df

Unnamed: 0,F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F14,F15,F16,F17,F18,F19,target,group,level-0,level-1
0,0.669011,1.736182,0.669506,0.134823,0.585426,0.638191,0.568471,0.293096,0.762893,0.324469,...,0.456309,0.619138,0.845513,0.613617,0.772344,0.658844,0,child_1_1,Root Node,child_1
1,0.349178,1.440564,0.336313,0.731095,0.142914,0.904905,0.484733,0.742978,0.520511,0.634801,...,0.343387,0.494628,0.291768,0.000000,0.488135,0.438983,5,child_1_1,Root Node,child_1
2,0.611500,1.548336,0.543177,0.902788,0.775290,0.677185,0.480253,0.521960,0.270361,0.567221,...,0.538353,0.344294,0.266219,0.376593,0.641975,0.768272,0,child_1_1,Root Node,child_1
3,0.662515,1.560614,0.546489,0.382932,0.760992,0.804662,0.508401,0.555026,0.623145,0.599545,...,0.535013,0.601987,0.590577,0.218241,0.689672,0.533553,0,child_1_1,Root Node,child_1
4,0.458459,1.363106,0.526584,0.775227,0.630858,0.626455,0.570478,0.693738,0.395581,0.831443,...,0.701176,0.469014,0.303390,0.163513,0.485665,0.532340,0,child_1_1,Root Node,child_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1.607742,1.489299,0.941575,0.695913,0.502300,0.638520,0.469854,1.474976,0.436496,0.728864,...,0.027566,0.808425,0.451300,0.710349,0.465299,0.672711,1,child_2_2,Root Node,child_2
996,1.928110,1.128622,0.656211,0.417585,0.368183,0.616075,0.501198,1.886553,0.404094,0.417237,...,0.385750,0.588268,0.423932,0.635154,0.508058,0.668239,0,child_2_2,Root Node,child_2
997,1.605420,1.340211,0.459901,0.439645,0.112719,0.476519,0.439925,1.488432,0.297769,0.533433,...,0.432441,0.201417,0.607152,0.295764,0.208529,0.614100,13,child_2_2,Root Node,child_2
998,1.517065,1.382011,0.786660,0.716106,0.287312,0.473030,0.379345,1.654127,0.736579,0.464336,...,0.273211,0.659763,0.642485,0.267602,0.642814,0.735231,0,child_2_2,Root Node,child_2
