# Preparation of Dataset (development notebook)

## Import section

In [1]:
import numpy as np
import pandas as pd

from etnn.data.prepare_ferris_wheel import prepare_1_ferris, generate_ferris_dataset
from etnn.data.ferris_score_helpers import build_wheel_happyness
from etnn.data.ferris_wheel import load_pure_ferris_wheel_dataset
import torch
from torch.utils.data import random_split, DataLoader
from tqdm import tqdm

In [2]:
DATASET_PATH = "../datasets"

## Person dataset

In [3]:
df_health = prepare_1_ferris(
    dataset_path="../datasets",
    df_name_output="health_dataset_preprocessed-1.csv",
    df_name_input="Sleep_health_and_lifestyle_dataset.csv",
    try_pregen=True
)

In [4]:
df_health.head()

Unnamed: 0,id,age,occupation,sleep_duration,sleep_quality,physical_activity,stress_level,bmi,heart_rate,daily_steps,sleep_disorder,blood_pressure1,blood_pressure2,gender_male,gender_female,gender_other
0,1,27,1,6.1,6,42,6,2,77,4200,0,126,83,True,False,False
1,2,28,2,6.2,6,60,8,0,75,10000,0,125,80,True,False,False
2,3,28,2,6.2,6,60,8,0,75,10000,0,125,80,True,False,False
3,4,28,3,5.9,4,30,8,3,85,3000,1,140,90,True,False,False
4,5,28,3,5.9,4,30,8,3,85,3000,1,140,90,True,False,False


## Gondola dataset (indexing)

In [5]:
# take group of ids from persons and state how the label for this group shall be calculated

In [6]:
# define ferris wheel
num_gondolas = 10
num_part_pg = 5

In [7]:
# generate sample element
random_order = np.arange(len(df_health)) + 1
np.random.shuffle(random_order)

In [8]:
example = [
    random_order[i*num_part_pg:(i+1)*num_part_pg]
    for i in range(num_gondolas)
]

In [9]:
example

[array([325, 373,  25,   3, 317]),
 array([180, 203,  54,   5,  57]),
 array([355, 332, 115, 234, 100]),
 array([341,  43, 104, 295, 255]),
 array([371,  66, 175, 194, 140]),
 array([309, 286, 259, 167, 273]),
 array([128, 132,  42, 274, 213]),
 array([311, 257,   2, 210, 342]),
 array([131, 300,  94, 171, 189]),
 array([ 71,  45,  84, 237, 374])]

In [10]:
random_order[:num_gondolas*num_part_pg].reshape(num_gondolas, num_part_pg)

array([[325, 373,  25,   3, 317],
       [180, 203,  54,   5,  57],
       [355, 332, 115, 234, 100],
       [341,  43, 104, 295, 255],
       [371,  66, 175, 194, 140],
       [309, 286, 259, 167, 273],
       [128, 132,  42, 274, 213],
       [311, 257,   2, 210, 342],
       [131, 300,  94, 171, 189],
       [ 71,  45,  84, 237, 374]])

Rules:
- People being happy with other people in same gondola
    + Age composition too seperated is bad
    + shift in gender is bad if too much, 50-50 is good or all one gender
    + same with age composition
    + sleep derived(multiplier with quality) persons get a subtraction and 'good sleepers' get bonus (sleep disorder counts as stronger subtraction)
    + higher heart rate and pressure = joy or fear
    + composition of persons in regards to bmi : extreme values make others (no exception for group all those as to many underweight or overweight persons may be awquard as well)
- People being happy with neighboring gondolas composition
    + same age gets bonus, none gets penalty as potentially group is separated
    + gap between happyness index between self and neighbors causes it to produce a mean of only the neighbors

In [11]:
build_wheel_happyness(df_health, example)

54.92653333333333

In [12]:
build_wheel_happyness(df_health, random_order[:num_gondolas * num_part_pg].reshape(num_gondolas, num_part_pg))

54.92653333333333

In [13]:
# define dataset size
num_to_generate = 10_000
# define ferris wheel
num_gondolas = 10
num_part_pg = 5

In [14]:
df_index, df_health = generate_ferris_dataset(
    num_gondolas=num_gondolas,
    num_part_pg=num_part_pg,
    num_to_generate=num_to_generate,
    dataset_path="../datasets",
    df_intermediate_output_name="health_dataset_preprocessed-1.csv",
    df_name_input="Sleep_health_and_lifestyle_dataset.csv",
    try_pregen=True
)

In [15]:
df_index.head()

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
0,264,191,98,235,35,356,253,9,147,311,...,315,265,92,23,287,17,234,223,117,68.01148
1,338,176,95,11,161,248,364,53,64,71,...,181,365,96,187,165,38,114,98,56,55.463373
2,290,99,75,21,237,287,215,213,172,214,...,76,188,274,77,264,117,254,168,65,66.108493
3,15,277,93,106,290,354,60,226,75,318,...,243,288,74,131,344,86,266,218,83,58.642507
4,190,260,34,286,360,167,251,298,256,15,...,48,258,107,5,307,121,162,7,37,62.584213


In [16]:
df_health.head()

Unnamed: 0,id,age,occupation,sleep_duration,sleep_quality,physical_activity,stress_level,bmi,heart_rate,daily_steps,sleep_disorder,blood_pressure1,blood_pressure2,gender_male,gender_female,gender_other
0,1,27,1,6.1,6,42,6,2,77,4200,0,126,83,True,False,False
1,2,28,2,6.2,6,60,8,0,75,10000,0,125,80,True,False,False
2,3,28,2,6.2,6,60,8,0,75,10000,0,125,80,True,False,False
3,4,28,3,5.9,4,30,8,3,85,3000,1,140,90,True,False,False
4,5,28,3,5.9,4,30,8,3,85,3000,1,140,90,True,False,False


In [17]:
df_index.iloc[:, :-1]

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-0,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4
0,264,191,98,235,35,356,253,9,147,311,...,299,315,265,92,23,287,17,234,223,117
1,338,176,95,11,161,248,364,53,64,71,...,100,181,365,96,187,165,38,114,98,56
2,290,99,75,21,237,287,215,213,172,214,...,122,76,188,274,77,264,117,254,168,65
3,15,277,93,106,290,354,60,226,75,318,...,235,243,288,74,131,344,86,266,218,83
4,190,260,34,286,360,167,251,298,256,15,...,28,48,258,107,5,307,121,162,7,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,127,84,110,152,265,90,318,72,331,237,...,346,7,341,126,70,208,67,248,291,246
9996,286,97,191,288,32,268,41,347,272,169,...,194,178,269,310,79,370,108,189,340,24
9997,372,168,198,199,239,369,323,277,174,281,...,191,112,75,331,177,163,155,7,188,290
9998,154,310,224,59,87,207,97,78,290,241,...,12,338,254,183,145,22,95,247,158,231


## Total dataset creation

In [18]:
dataset = load_pure_ferris_wheel_dataset(
    num_gondolas=num_gondolas,
    num_part_pg=num_part_pg,
    num_to_generate=num_to_generate,
    dataset_path="../datasets",
    df_intermediate_output_name="health_dataset_preprocessed-1.csv",
    df_name_input="Sleep_health_and_lifestyle_dataset.csv",
    try_pregen=True
)

In [19]:
dataset[0]

(tensor([[4.5000e+01, 1.1000e+01, 6.9000e+00, 7.0000e+00, 5.5000e+01, 5.0000e+00,
          2.0000e+00, 7.5000e+01, 5.5000e+03, 0.0000e+00, 1.2500e+02, 8.2000e+01,
          0.0000e+00, 1.0000e+00, 0.0000e+00],
         [4.3000e+01, 4.0000e+00, 6.7000e+00, 7.0000e+00, 4.5000e+01, 4.0000e+00,
          2.0000e+00, 6.5000e+01, 6.0000e+03, 2.0000e+00, 1.3500e+02, 9.0000e+01,
          0.0000e+00, 1.0000e+00, 0.0000e+00],
         [3.6000e+01, 7.0000e+00, 7.1000e+00, 8.0000e+00, 6.0000e+01, 4.0000e+00,
          0.0000e+00, 6.8000e+01, 7.0000e+03, 0.0000e+00, 1.1500e+02, 7.5000e+01,
          0.0000e+00, 1.0000e+00, 0.0000e+00],
         [4.4000e+01, 4.0000e+00, 6.6000e+00, 7.0000e+00, 4.5000e+01, 4.0000e+00,
          2.0000e+00, 6.5000e+01, 6.0000e+03, 2.0000e+00, 1.3500e+02, 9.0000e+01,
          0.0000e+00, 1.0000e+00, 0.0000e+00],
         [3.1000e+01, 2.0000e+00, 7.7000e+00, 7.0000e+00, 7.5000e+01, 6.0000e+00,
          0.0000e+00, 7.0000e+01, 8.0000e+03, 0.0000e+00, 1.2000e+02, 8.00

## DataLoaders

In [20]:
generator = torch.Generator().manual_seed(420)
train_ds, val_ds, test_ds = random_split(dataset, [0.7, 0.1, 0.2], generator=generator)

In [21]:
train_ds

<torch.utils.data.dataset.Subset at 0x2021173a040>

In [22]:
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

In [23]:
for x, y in train_loader:
    print(f"x-shape:{x.shape}, y-shape:{y.shape}")
    break

x-shape:torch.Size([32, 50, 15]), y-shape:torch.Size([32])


## Specialize pure datasets

In [24]:
# create viable permutations of input and add them to dataset

In [25]:
num_add_equal_elem = 12
seed = 4645883

In [26]:
df_sampled = df_index.sample(num_add_equal_elem, replace=True, random_state=seed)

In [27]:
df_sampled

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
8137,186,265,291,148,360,346,29,184,233,350,...,145,102,73,192,185,116,340,1,257,60.267387
8463,180,189,61,274,15,310,222,362,81,14,...,122,17,335,186,132,104,179,278,53,64.2122
1343,88,64,113,75,67,245,70,200,309,362,...,239,210,304,211,296,77,81,86,169,58.237173
7714,144,118,168,106,34,278,213,352,53,107,...,228,5,220,310,79,45,139,110,30,72.370747
5818,30,371,39,42,243,135,112,96,331,1,...,273,140,101,192,290,60,193,346,338,67.01644
877,95,162,300,76,47,92,360,327,226,237,...,178,188,98,73,87,140,151,45,370,66.941373
7679,64,172,213,208,113,344,15,202,368,158,...,68,336,282,72,153,354,197,171,181,62.17652
8766,292,239,372,153,350,248,133,308,238,230,...,140,357,71,15,114,224,39,258,144,62.134387
503,220,27,324,201,141,260,148,215,22,129,...,162,342,214,212,274,320,125,346,372,63.34304
1407,348,111,343,80,182,373,282,338,142,217,...,210,197,126,209,70,26,42,307,223,59.733467


In [28]:
# for each randomly one of two things:
# - group order change
#   + shift groups
#   + invert group order
# - permutate gondola people

In [29]:
df_t = df_sampled.to_numpy()[:, :-1]
df_t.shape

(12, 50)

In [30]:
df_g = df_t.reshape(df_t.shape[0], num_gondolas, -1)
df_g.shape

(12, 10, 5)

In [31]:
shifts = np.random.randint(0,num_gondolas, df_g.shape[0])
idx = np.arange(num_gondolas)

In [32]:
df_g

array([[[186., 265., 291., 148., 360.],
        [346.,  29., 184., 233., 350.],
        [ 75., 121.,  81.,  72.,  71.],
        [ 31., 348., 220., 198.,  97.],
        [206.,  43.,  98., 318., 147.],
        [311., 125., 193., 343., 369.],
        [260., 143., 345., 190., 231.],
        [ 24.,  52., 327., 168., 252.],
        [ 47., 145., 102.,  73., 192.],
        [185., 116., 340.,   1., 257.]],

       [[180., 189.,  61., 274.,  15.],
        [310., 222., 362.,  81.,  14.],
        [250., 118.,   5., 203., 244.],
        [352., 348.,  99., 217., 287.],
        [ 33., 313., 345., 117., 139.],
        [107.,  82.,  31.,  71., 181.],
        [225., 237., 212., 323., 151.],
        [  3., 145., 185., 343., 226.],
        [184., 122.,  17., 335., 186.],
        [132., 104., 179., 278.,  53.]],

       [[ 88.,  64., 113.,  75.,  67.],
        [245.,  70., 200., 309., 362.],
        [226., 314., 161., 202.,  93.],
        [336., 193., 270.,  32., 306.],
        [ 14.,  95.,  26., 364., 353

In [37]:
for i in tqdm(range(num_add_equal_elem)):
    df_g[i] = df_g[i][(idx + shifts[i]) % num_gondolas]
    for j in range(num_gondolas):
        np.random.shuffle(df_g[i][j])

100%|██████████| 12/12 [00:00<?, ?it/s]


In [34]:
print(shifts)

[4 8 4 5 5 1 7 8 2 7 4 0]


In [35]:
df_g

array([[[147., 206.,  43., 318.,  98.],
        [193., 369., 125., 343., 311.],
        [231., 345., 190., 260., 143.],
        [252.,  52., 327.,  24., 168.],
        [ 73., 102., 192.,  47., 145.],
        [  1., 340., 116., 257., 185.],
        [186., 291., 265., 360., 148.],
        [346., 184., 233., 350.,  29.],
        [ 71.,  75.,  81., 121.,  72.],
        [ 97., 198., 220., 348.,  31.]],

       [[184.,  17., 122., 335., 186.],
        [278., 179., 132.,  53., 104.],
        [ 15., 189., 274., 180.,  61.],
        [ 81.,  14., 222., 362., 310.],
        [118., 250., 203.,   5., 244.],
        [287., 217., 348.,  99., 352.],
        [139., 313., 117.,  33., 345.],
        [181.,  82., 107.,  31.,  71.],
        [212., 225., 237., 151., 323.],
        [343., 145., 226.,   3., 185.]],

       [[ 26., 364.,  14.,  95., 353.],
        [143., 243., 160.,  94., 130.],
        [233., 347.,  63., 313.,  42.],
        [ 66., 371.,  54., 330.,  85.],
        [239., 211., 146., 304., 210

In [40]:
pd_new = pd.DataFrame(np.c_[df_t, df_sampled.label.to_numpy()], columns=df_sampled.columns)

In [41]:
pd_new

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
0,145.0,47.0,73.0,102.0,192.0,185.0,1.0,116.0,257.0,340.0,...,231.0,190.0,345.0,143.0,168.0,252.0,52.0,327.0,24.0,60.267387
1,237.0,212.0,323.0,225.0,151.0,145.0,185.0,226.0,3.0,343.0,...,313.0,33.0,345.0,117.0,31.0,82.0,181.0,107.0,71.0,64.2122
2,146.0,211.0,239.0,210.0,304.0,81.0,86.0,169.0,77.0,296.0,...,347.0,42.0,63.0,233.0,85.0,66.0,371.0,54.0,330.0,58.237173
3,168.0,118.0,106.0,144.0,34.0,213.0,278.0,107.0,352.0,53.0,...,5.0,220.0,127.0,228.0,45.0,110.0,79.0,30.0,139.0,72.370747
4,30.0,243.0,42.0,371.0,39.0,96.0,135.0,331.0,112.0,1.0,...,273.0,125.0,140.0,192.0,338.0,290.0,60.0,346.0,193.0,67.01644
5,311.0,44.0,202.0,295.0,279.0,70.0,329.0,330.0,75.0,243.0,...,76.0,162.0,47.0,300.0,92.0,360.0,226.0,327.0,237.0,66.941373
6,75.0,311.0,73.0,190.0,90.0,104.0,138.0,317.0,295.0,98.0,...,14.0,148.0,143.0,235.0,25.0,331.0,144.0,318.0,170.0,62.17652
7,216.0,198.0,250.0,244.0,356.0,100.0,36.0,163.0,143.0,193.0,...,298.0,131.0,180.0,10.0,336.0,270.0,366.0,158.0,122.0,62.134387
8,44.0,225.0,307.0,325.0,243.0,110.0,29.0,272.0,167.0,31.0,...,316.0,331.0,28.0,295.0,370.0,369.0,252.0,259.0,135.0,63.34304
9,84.0,183.0,361.0,73.0,129.0,214.0,162.0,240.0,57.0,266.0,...,131.0,258.0,286.0,292.0,105.0,299.0,82.0,190.0,135.0,59.733467


In [44]:
pd.concat([df_index, pd_new], ignore_index=True)

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
0,264.0,191.0,98.0,235.0,35.0,356.0,253.0,9.0,147.0,311.0,...,315.0,265.0,92.0,23.0,287.0,17.0,234.0,223.0,117.0,68.011480
1,338.0,176.0,95.0,11.0,161.0,248.0,364.0,53.0,64.0,71.0,...,181.0,365.0,96.0,187.0,165.0,38.0,114.0,98.0,56.0,55.463373
2,290.0,99.0,75.0,21.0,237.0,287.0,215.0,213.0,172.0,214.0,...,76.0,188.0,274.0,77.0,264.0,117.0,254.0,168.0,65.0,66.108493
3,15.0,277.0,93.0,106.0,290.0,354.0,60.0,226.0,75.0,318.0,...,243.0,288.0,74.0,131.0,344.0,86.0,266.0,218.0,83.0,58.642507
4,190.0,260.0,34.0,286.0,360.0,167.0,251.0,298.0,256.0,15.0,...,48.0,258.0,107.0,5.0,307.0,121.0,162.0,7.0,37.0,62.584213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10007,216.0,198.0,250.0,244.0,356.0,100.0,36.0,163.0,143.0,193.0,...,298.0,131.0,180.0,10.0,336.0,270.0,366.0,158.0,122.0,62.134387
10008,44.0,225.0,307.0,325.0,243.0,110.0,29.0,272.0,167.0,31.0,...,316.0,331.0,28.0,295.0,370.0,369.0,252.0,259.0,135.0,63.343040
10009,84.0,183.0,361.0,73.0,129.0,214.0,162.0,240.0,57.0,266.0,...,131.0,258.0,286.0,292.0,105.0,299.0,82.0,190.0,135.0,59.733467
10010,48.0,362.0,342.0,358.0,108.0,282.0,326.0,18.0,348.0,269.0,...,14.0,215.0,200.0,57.0,16.0,65.0,77.0,334.0,134.0,72.082213


In [36]:
a = np.array([[1,2,3,4,5], [6,7,8,9,10]])
np.random.shuffle(a[2:-1])
a

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])