# Preparation of Dataset (development notebook)

## Import section

In [1]:
import numpy as np
import pandas as pd

from etnn.data.prepare_ferris_wheel import prepare_1_ferris, generate_ferris_dataset, add_valid_permutations, \
    sample_new_permutations
from etnn.data.ferris_score_helpers import build_wheel_happyness
from etnn.data.ferris_wheel import load_pure_ferris_wheel_dataset, load_modified_ferris_wheel_dataset
import torch
from torch.utils.data import random_split, DataLoader
from tqdm import tqdm

In [2]:
# directory containing the dataset files, given relative to the notebook's working directory
DATASET_PATH = "../datasets"

## Person dataset

In [3]:
# Build the person table from the raw sleep/health CSV.
# The shared DATASET_PATH constant is used instead of repeating the path literal,
# so the dataset location is configured in exactly one place.
# NOTE(review): try_pregen=True presumably reuses the already preprocessed
# df_name_output file when it exists - confirm in etnn.data.prepare_ferris_wheel.
df_health = prepare_1_ferris(
    dataset_path=DATASET_PATH,
    df_name_output="health_dataset_preprocessed-1.csv",
    df_name_input="Sleep_health_and_lifestyle_dataset.csv",
    try_pregen=True
)

In [4]:
df_health.head()

Unnamed: 0,id,age,occupation,sleep_duration,sleep_quality,physical_activity,stress_level,bmi,heart_rate,daily_steps,sleep_disorder,blood_pressure1,blood_pressure2,gender_male,gender_female,gender_other
0,1,27,1,6.1,6,42,6,2,77,4200,0,126,83,True,False,False
1,2,28,2,6.2,6,60,8,0,75,10000,0,125,80,True,False,False
2,3,28,2,6.2,6,60,8,0,75,10000,0,125,80,True,False,False
3,4,28,3,5.9,4,30,8,3,85,3000,1,140,90,True,False,False
4,5,28,3,5.9,4,30,8,3,85,3000,1,140,90,True,False,False


## Gondola dataset (indexing)

In [5]:
# take group of ids from persons and state how the label for this group shall be calculated

In [6]:
# define ferris wheel
num_gondolas = 10  # number of gondolas that make up one wheel
num_part_pg = 5  # number of person ids placed in each gondola

In [7]:
# generate sample element: a random ordering of all person ids (ids are 1-based)
# A locally seeded Generator is used so that the example wheel and its happiness
# score are reproducible after a kernel restart, without touching numpy's global
# random state.
rng = np.random.default_rng(420)
random_order = rng.permutation(len(df_health)) + 1

In [8]:
# cut the first num_gondolas * num_part_pg ids of the random ordering into
# consecutive chunks, one chunk of num_part_pg ids per gondola
example = [
    random_order[offset:offset + num_part_pg]
    for offset in (gondola * num_part_pg for gondola in range(num_gondolas))
]

In [9]:
example

[array([317, 372, 152, 371,  62]),
 array([ 86, 250, 268, 238, 356]),
 array([187, 180,  68, 144,  57]),
 array([188, 200,  36, 199, 173]),
 array([347,  71, 206, 286, 230]),
 array([ 25, 275, 367, 337, 111]),
 array([273, 239,  40, 116, 139]),
 array([228, 189,  47, 348, 215]),
 array([ 72, 202,  91,  17, 324]),
 array([ 27, 270, 271, 127, 223])]

In [10]:
random_order[:num_gondolas*num_part_pg].reshape(num_gondolas, num_part_pg)

array([[317, 372, 152, 371,  62],
       [ 86, 250, 268, 238, 356],
       [187, 180,  68, 144,  57],
       [188, 200,  36, 199, 173],
       [347,  71, 206, 286, 230],
       [ 25, 275, 367, 337, 111],
       [273, 239,  40, 116, 139],
       [228, 189,  47, 348, 215],
       [ 72, 202,  91,  17, 324],
       [ 27, 270, 271, 127, 223]])

Rules:
- People being happy with other people in same gondola
    + An age composition that is too separated is bad
    + a strong shift in gender is bad; a 50-50 split or a single-gender gondola is good
    + same with age composition
    + sleep-deprived (multiplier with quality) persons get a deduction, and 'good sleepers' get a bonus (a sleep disorder counts as a stronger deduction)
    + higher heart rate and pressure = joy or fear
    + composition of persons with regard to BMI: extreme values make others uncomfortable (no exception for a group consisting entirely of such persons, as too many underweight or overweight persons may be awkward as well)
- People being happy with neighboring gondolas composition
    + neighbors of the same age give a bonus; having none gives a penalty, as the group has potentially been separated
    + a gap between the happiness index of a gondola and that of its neighbors causes it to take the mean of only the neighbors

In [11]:
build_wheel_happyness(df_health, example)

59.89417333333333

In [12]:
build_wheel_happyness(df_health, random_order[:num_gondolas * num_part_pg].reshape(num_gondolas, num_part_pg))

59.89417333333333

In [13]:
# define dataset size
num_to_generate = 10_000  # handed to the dataset generation/loading calls below as num_to_generate
# define ferris wheel (same values as used for the example wheel above)
num_gondolas = 10
num_part_pg = 5

In [14]:
# Generate the index dataset (one row per wheel) together with the person table.
# Note that this rebinds df_health, replacing the frame loaded in the person
# dataset section.
# The shared DATASET_PATH constant is used instead of repeating the path literal.
# NOTE(review): with try_pregen=True the helper presumably loads a previously
# generated file when available; otherwise generation is slow (progress bar) - confirm.
df_index, df_health = generate_ferris_dataset(
    num_gondolas=num_gondolas,
    num_part_pg=num_part_pg,
    num_to_generate=num_to_generate,
    dataset_path=DATASET_PATH,
    df_intermediate_output_name="health_dataset_preprocessed-1.csv",
    df_name_input="Sleep_health_and_lifestyle_dataset.csv",
    try_pregen=True
)

  1%|▏         | 140/10000 [00:03<03:47, 43.29it/s]


KeyboardInterrupt: 

In [None]:
df_index.head()

In [None]:
df_health.head()

In [None]:
df_index.iloc[:, :-1]

## Total dataset creation

In [None]:
# Load the unmodified ("pure") ferris wheel dataset as an indexable dataset object.
# The shared DATASET_PATH constant is used instead of repeating the path literal.
dataset = load_pure_ferris_wheel_dataset(
    num_gondolas=num_gondolas,
    num_part_pg=num_part_pg,
    num_to_generate=num_to_generate,
    dataset_path=DATASET_PATH,
    df_intermediate_output_name="health_dataset_preprocessed-1.csv",
    df_name_input="Sleep_health_and_lifestyle_dataset.csv",
    try_pregen=True
)

In [None]:
dataset[0]

## DataLoaders

In [None]:
# fixed seed so that the 70% / 10% / 20% train / validation / test partition
# is the same on every run
generator = torch.Generator()
generator.manual_seed(420)
train_ds, val_ds, test_ds = random_split(
    dataset,
    lengths=[0.7, 0.1, 0.2],
    generator=generator,
)

In [None]:
train_ds

In [None]:
# mini-batches of 32 samples, reshuffled on every pass over train_ds
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

In [None]:
# peek at the first batch only, to check the shapes of inputs and labels
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    x, y = first_batch
    print(f"x-shape:{x.shape}, y-shape:{y.shape}")

## Specialize pure datasets

In [None]:
# positional arguments: num_to_generate, the index frame and the gondola count
# NOTE(review): presumably the first argument is the number of valid permutations to add
# (cf. num_valid_to_add=num_to_generate in the load_modified_ferris_wheel_dataset call) - confirm
df_added_valid = add_valid_permutations(
    num_to_generate,
    df_index,
    num_gondolas
)

In [None]:
df_added_valid

In [None]:
# Load the modified dataset variant; note that this rebinds `dataset`, replacing
# the pure dataset loaded in the "Total dataset creation" section.
# The shared DATASET_PATH constant is used instead of repeating the path literal.
# NOTE(review): num_valid_to_add=num_to_generate presumably adds as many valid
# permutations as there are generated samples - confirm in etnn.data.ferris_wheel.
dataset = load_modified_ferris_wheel_dataset(
    num_gondolas=num_gondolas,
    num_part_pg=num_part_pg,
    num_to_generate=num_to_generate,
    num_valid_to_add=num_to_generate,
    dataset_path=DATASET_PATH,
    df_intermediate_output_name="health_dataset_preprocessed-1.csv",
    df_name_input="Sleep_health_and_lifestyle_dataset.csv",
    try_pregen=True
)

## Change regression task to classification

In [None]:
# rescale the regression label: divide by 10*num_gondolas, then multiply by 10
# NOTE(review): presumably 10*num_gondolas is the maximum attainable label, which would map
# the result onto 0..10 before it is rounded into classes - confirm
temp = df_index.label/(10*num_gondolas)*10

In [None]:
temp.round()

## Problem: dataset too centered around a specific value

In [None]:
df_index.label.describe()

In [None]:
df_downsample = df_index.copy(deep=True)
df_downsample

In [None]:
# bucket the labels: label/5 rounded to the nearest integer, i.e. categories about 5 label units wide
df_downsample['label_cat'] = (df_downsample.label/5).round()

In [None]:
df_downsample

In [None]:
# number of rows in the smallest label_cat group
n_samples = df_downsample.groupby('label_cat').size().min()
n_samples

In [None]:
# Cap every label_cat group at 10 rows to flatten the label distribution.
# random_state fixes the draw so that the downsampled frame (and the describe()
# below) is identical after a kernel restart.
# NOTE(review): n_samples (size of the smallest group) is computed above but not
# used here - confirm whether the cap of 10 or n_samples is intended.
downsampled_df = df_downsample.groupby('label_cat').apply(
    lambda x: x.sample(min(len(x), 10), random_state=420)
)
downsampled_df

In [None]:
downsampled_df.label.describe()

# Generate complete dataset

In [None]:
from itertools import permutations

In [None]:
list(permutations(range(3), 2))

In [None]:
num_gondolas = 10
num_part_pg = 5

In [None]:
# hard-coded person count; defines the id range 1..n_persons used for the permutations below
# NOTE(review): presumably equal to len(df_health) - confirm, and consider deriving it from the frame
n_persons = 374

In [None]:
# create all combinations
# itertools.permutations is lazy: this only builds an iterator over every ordered selection of
# num_gondolas*num_part_pg distinct ids out of 1..n_persons; nothing is enumerated yet
all_perms = permutations(range(1, n_persons + 1), num_gondolas*num_part_pg)

Wait... way too large! This has over 10^89 possibilities - scrap this.

# Generation of testset

In [None]:
# idea: want to generate elements that are valid permutations and are not in the training set

In [None]:
df_index

In [None]:
# request 10 (num_elem) new index rows based on df_index
# NOTE(review): merge_check=True presumably makes the helper verify via a merge that none of the
# sampled rows already occurs in df_index - confirm in etnn.data.prepare_ferris_wheel
df_test = sample_new_permutations(
    df_index=df_index,
    num_gondolas=num_gondolas,
    num_elem=10,
    merge_check=True
)

In [None]:
df_test

In [None]:
df_test2 = df_index.iloc[-10:]
df_test2

In [None]:
# left-merge on all shared columns (no `on` given); indicator=True adds a `_merge` column that
# tells whether each df_test2 row was also found in df_index ('both') or not ('left_only').
# df_test2 is a slice of df_index, so every row is expected to come back as 'both' -
# presumably a sanity check of the merge-based duplicate detection.
merged = df_test2.merge(df_index, how='left', indicator=True)

In [None]:
merged

In [None]:
# count the merged rows that were found in both frames
# (vectorised Series.sum instead of iterating the Series with the builtin sum)
(merged['_merge'] == 'both').sum()

In [None]:
merged._merge