In [55]:
import pandas as pd
import numpy as np
import os
from etnn.data.prepare_ferris_wheel import prepare_1_ferris
from etnn.data.ferris_score_helpers import build_wheel_happyness
from tqdm import tqdm

In [2]:
# Load the person-level table through the project helper. All paths are
# relative to the notebook's working directory.
# NOTE(review): `try_pregen=True` presumably reuses an already generated
# output CSV when one exists — confirm against prepare_1_ferris.
df = prepare_1_ferris(
    df_name_input="Sleep_health_and_lifestyle_dataset.csv",  # raw input file
    df_name_output="health_dataset_preprocessed-1.csv",  # preprocessed file
    dataset_path=".",
    try_pregen=True,
)

In [3]:
df.head()

Unnamed: 0,id,age,occupation,sleep_duration,sleep_quality,physical_activity,stress_level,bmi,heart_rate,daily_steps,sleep_disorder,blood_pressure1,blood_pressure2,gender_male,gender_female,gender_other
0,1,27,1,6.1,6,42,6,2,77,4200,0,126,83,True,False,False
1,2,28,2,6.2,6,60,8,0,75,10000,0,125,80,True,False,False
2,3,28,2,6.2,6,60,8,0,75,10000,0,125,80,True,False,False
3,4,28,3,5.9,4,30,8,3,85,3000,1,140,90,True,False,False
4,5,28,3,5.9,4,30,8,3,85,3000,1,140,90,True,False,False


In [4]:
# Take a group of person ids and define how the label for this group is calculated

In [5]:
# Ferris wheel layout: how many gondolas the wheel has and how many persons
# are seated in each gondola.
num_gondolas, num_part_pg = 10, 5

In [6]:
# generate sample element
# A locally seeded Generator makes this example (and the score computed from
# it further down) reproducible after Restart Kernel -> Run All; the global
# np.random state used before was never seeded.
example_rng = np.random.default_rng(0)

# Candidate person ids 1..len(df).
# NOTE(review): assumes the `id` column runs contiguously from 1, as in the
# df.head() preview — confirm for the full dataset.
random_order = np.arange(len(df)) + 1
example_rng.shuffle(random_order)  # in-place permutation of the ids

In [7]:
# Cut the shuffled ids into consecutive chunks of `num_part_pg` entries, one
# chunk (a NumPy slice of `random_order`) per gondola.
example = [
    random_order[start:start + num_part_pg]
    for start in (gondola * num_part_pg for gondola in range(num_gondolas))
]

In [8]:
example

[array([ 53, 331, 367, 229, 248]),
 array([192, 179,  92, 199, 258]),
 array([213, 197, 174, 158,  72]),
 array([ 38, 147, 290, 165,  41]),
 array([339, 146, 134, 283,  95]),
 array([190, 171, 113, 268, 247]),
 array([266,  37,  78, 310,  75]),
 array([334, 116, 284, 340,  97]),
 array([250, 177, 256, 299,  33]),
 array([161, 245, 209, 234, 132])]

In [23]:
# Same grouping as `example`, expressed as a single 2-D array with one row per
# gondola and one column per seat.
np.reshape(random_order[: num_gondolas * num_part_pg], (num_gondolas, num_part_pg))

array([[ 53, 331, 367, 229, 248],
       [192, 179,  92, 199, 258],
       [213, 197, 174, 158,  72],
       [ 38, 147, 290, 165,  41],
       [339, 146, 134, 283,  95],
       [190, 171, 113, 268, 247],
       [266,  37,  78, 310,  75],
       [334, 116, 284, 340,  97],
       [250, 177, 256, 299,  33],
       [161, 245, 209, 234, 132]])

Rules:
- People being happy with other people in same gondola
    + An age composition that is too spread out is bad
    + A strongly skewed gender ratio is bad; a 50-50 split or a single-gender group is good
    + same with age composition
    + Sleep-deprived persons (sleep duration with sleep quality as a multiplier) get a deduction and 'good sleepers' get a bonus (a sleep disorder counts as a stronger deduction)
    + higher heart rate and pressure = joy or fear
    + Composition of persons with regard to BMI: extreme values make others uncomfortable (no exception for a group consisting entirely of such persons, as too many underweight or overweight persons may be awkward as well)
- People being happy with neighboring gondolas composition
    + Neighbors of the same age give a bonus; having none gives a penalty, as the group has potentially been separated
    + A gap between a gondola's own happiness index and that of its neighbors causes its value to become the mean of the neighbors only

In [26]:
# Score the example wheel, passed as a list with one id array per gondola.
build_wheel_happyness(df, example)

69.25677333333333

In [28]:
# Score the same ids passed as one 2-D array (row = gondola); the displayed
# value matches the list-based call, so both input layouts are accepted.
build_wheel_happyness(df, random_order[:num_gondolas*num_part_pg].reshape(num_gondolas, num_part_pg))

69.25677333333333

In [52]:
# Ferris wheel layout: number of gondolas and persons seated per gondola.
num_gondolas, num_part_pg = 10, 5
# Number of labelled wheel configurations to generate.
num_to_generate = 10_000

In [56]:
# Generate `num_to_generate` random wheel occupations and label each one with
# its happiness score. Every row of the resulting frame holds the flattened
# person ids (gondola-major order) followed by the label.

# initialize dataset storage
dataset_storage = []

# A dedicated, seeded generator keeps the generated dataset reproducible across
# kernel restarts; the global np.random state used before was never seeded.
dataset_seed = 42
dataset_rng = np.random.default_rng(dataset_seed)

# Number of persons needed to fill the whole wheel (loop invariant).
num_riders = num_gondolas * num_part_pg

# init ordering for later shuffling
# NOTE(review): assumes person ids run contiguously from 1 to len(df), as in
# the df.head() preview — confirm for the full dataset.
random_order = np.arange(len(df)) + 1

# Fail early with a readable message instead of an opaque reshape error.
if num_riders > len(random_order):
    raise ValueError(
        f"need {num_riders} persons to fill the wheel but the dataset only "
        f"contains {len(random_order)}"
    )

for _ in tqdm(range(num_to_generate)):
    # generate sample element: fresh in-place permutation of all ids
    dataset_rng.shuffle(random_order)

    # shape element: the first `num_riders` ids, one row per gondola
    sample = random_order[:num_riders].reshape(num_gondolas, num_part_pg)

    # calc label
    label = build_wheel_happyness(df, sample)

    # add to storage; tolist() copies the ids, so the next in-place shuffle
    # cannot alter rows that were already stored
    dataset_storage.append(sample.ravel().tolist() + [label])

# Column names encode gondola index i and seat index j, matching the
# gondola-major flattening used above.
df_generated = pd.DataFrame(dataset_storage, columns=[
    f"g-{i}_p-{j}"
    for i in range(num_gondolas)
    for j in range(num_part_pg)
] + ['label'])

100%|██████████| 10000/10000 [03:46<00:00, 44.07it/s]


In [46]:
df_generated.head()

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
0,284,276,51,230,246,271,27,257,183,89,...,143,1,124,78,94,107,73,324,363,61.212
1,277,45,12,297,36,302,325,133,254,229,...,266,177,258,370,214,243,280,232,110,54.046173
2,204,98,21,368,184,203,89,46,37,316,...,91,5,198,364,369,293,217,237,23,58.062787
3,214,248,236,192,322,118,326,265,257,258,...,174,304,352,147,52,4,287,83,34,70.6636
4,275,193,9,95,204,310,25,28,243,209,...,166,105,298,158,338,268,179,62,260,67.075373


In [57]:
# Persist the generated dataset next to the notebook; the row index is written
# as the first CSV column (pandas default).
df_generated.to_csv(path_or_buf="test.csv")

In [50]:
# Read the dataset back, using the first CSV column as the row index so the
# loaded frame has the same columns as the one that was written.
df_load = pd.read_csv(filepath_or_buffer="test.csv", index_col=0)

In [51]:
df_load.head()

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
0,284,276,51,230,246,271,27,257,183,89,...,143,1,124,78,94,107,73,324,363,61.212
1,277,45,12,297,36,302,325,133,254,229,...,266,177,258,370,214,243,280,232,110,54.046173
2,204,98,21,368,184,203,89,46,37,316,...,91,5,198,364,369,293,217,237,23,58.062787
3,214,248,236,192,322,118,326,265,257,258,...,174,304,352,147,52,4,287,83,34,70.6636
4,275,193,9,95,204,310,25,28,243,209,...,166,105,298,158,338,268,179,62,260,67.075373
