# Preparation of Dataset (development notebook)

## Import section

In [1]:
import numpy as np
import pandas as pd

from etnn.data.prepare_ferris_wheel import prepare_1_ferris, generate_ferris_dataset, add_valid_permutations, \
    sample_new_permutations
from etnn.data.ferris_score_helpers import build_wheel_happyness
from etnn.data.ferris_wheel import load_pure_ferris_wheel_dataset, load_modified_ferris_wheel_dataset
import torch
from torch.utils.data import random_split, DataLoader
from tqdm import tqdm

In [2]:
# Root directory for the dataset files (relative path); the same value is what
# the loader calls further down receive as `dataset_path`.
DATASET_PATH = "../datasets"

## Person dataset

In [3]:
# Load the preprocessed person table. The dataset directory comes from the
# DATASET_PATH config constant instead of a repeated string literal, so the
# location only has to be changed in one place.
# NOTE(review): `try_pregen=True` presumably reuses an already generated
# `df_name_output` file if present -- confirm in etnn.data.prepare_ferris_wheel.
df_health = prepare_1_ferris(
    dataset_path=DATASET_PATH,
    df_name_output="health_dataset_preprocessed-1.csv",
    df_name_input="Sleep_health_and_lifestyle_dataset.csv",
    try_pregen=True
)

In [4]:
# Preview the first rows of the person table.
df_health.head()

Unnamed: 0,id,age,occupation,sleep_duration,sleep_quality,physical_activity,stress_level,bmi,heart_rate,daily_steps,sleep_disorder,blood_pressure1,blood_pressure2,gender_male,gender_female,gender_other
0,1,27,1,6.1,6,42,6,2,77,4200,0,126,83,True,False,False
1,2,28,2,6.2,6,60,8,0,75,10000,0,125,80,True,False,False
2,3,28,2,6.2,6,60,8,0,75,10000,0,125,80,True,False,False
3,4,28,3,5.9,4,30,8,3,85,3000,1,140,90,True,False,False
4,5,28,3,5.9,4,30,8,3,85,3000,1,140,90,True,False,False


## Gondola dataset (indexing)

In [5]:
# take group of ids from persons and state how the label for this group shall be calculated

In [6]:
# Ferris wheel layout: number of gondolas and number of persons per gondola
num_gondolas, num_part_pg = 10, 5

In [7]:
# generate sample element
# A locally seeded Generator makes the example wheel identical on every
# "Restart & Run All"; the previous unseeded in-place shuffle produced a
# different order (and different outputs below) on each run.
rng = np.random.default_rng(420)
# permutation(n) yields a shuffled 0..n-1; the +1 shifts it to 1..n, matching
# the 1-based `id` column shown in df_health.head().
random_order = rng.permutation(len(df_health)) + 1

In [8]:
# Cut the first num_gondolas * num_part_pg shuffled ids into consecutive
# chunks of num_part_pg ids -- one chunk (array slice) per gondola.
example = [
    random_order[start:start + num_part_pg]
    for start in range(0, num_gondolas * num_part_pg, num_part_pg)
]

In [9]:
# Show the per-gondola id chunks built above.
example

[array([ 13, 300, 334, 344,  82]),
 array([219, 202, 373,  96, 313]),
 array([355,  34,   7, 237, 356]),
 array([366, 129,  71,  77, 110]),
 array([132,  54, 220, 372,  64]),
 array([312, 248, 107, 217, 336]),
 array([ 14,  21, 223, 104, 109]),
 array([257, 188, 184, 359, 121]),
 array([ 94, 269, 114, 161, 256]),
 array([295,  47, 253,  86, 199])]

In [10]:
# Same grouping as `example`, expressed as a single 2-D array with one row per
# gondola and one column per seat.
np.reshape(random_order[:num_gondolas * num_part_pg], (num_gondolas, num_part_pg))

array([[ 13, 300, 334, 344,  82],
       [219, 202, 373,  96, 313],
       [355,  34,   7, 237, 356],
       [366, 129,  71,  77, 110],
       [132,  54, 220, 372,  64],
       [312, 248, 107, 217, 336],
       [ 14,  21, 223, 104, 109],
       [257, 188, 184, 359, 121],
       [ 94, 269, 114, 161, 256],
       [295,  47, 253,  86, 199]])

Rules:
- People being happy with other people in same gondola
    + An age composition that is too spread out is bad
    + shift in gender is bad if too much, 50-50 is good or all one gender
    + same with age composition
    + sleep-deprived persons (multiplier with quality) get a subtraction and 'good sleepers' get a bonus (a sleep disorder counts as a stronger subtraction)
    + higher heart rate and pressure = joy or fear
    + composition of persons with regard to BMI: extreme values make others uncomfortable (no exception for a group consisting only of such persons, as too many underweight or overweight persons may be awkward as well)
- People being happy with neighboring gondolas composition
    + same age gets bonus, none gets penalty as potentially group is separated
    + a gap in the happiness index between a gondola and its neighbors causes it to produce a mean of only the neighbors

In [11]:
# Score the example wheel, passed as a list of per-gondola id arrays.
build_wheel_happyness(df_health, example)

58.27232

In [12]:
# Same wheel passed as a 2-D array instead of a list of arrays; the score shown
# below matches the list-based call.
build_wheel_happyness(df_health, random_order[:num_gondolas * num_part_pg].reshape(num_gondolas, num_part_pg))

58.27232

In [13]:
# Size of the generated dataset (passed as `num_to_generate` to the loaders)
num_to_generate = 10 ** 4
# Wheel layout: gondola count and persons per gondola
num_gondolas, num_part_pg = 10, 5

In [14]:
# Build the index table of wheel configurations (one row per wheel, one column
# per seat plus `label`) together with the person table. The dataset directory
# comes from the DATASET_PATH config constant rather than a repeated literal.
# NOTE(review): `try_pregen=True` presumably loads previously generated files
# when available -- confirm in etnn.data.prepare_ferris_wheel.
df_index, df_health = generate_ferris_dataset(
    num_gondolas=num_gondolas,
    num_part_pg=num_part_pg,
    num_to_generate=num_to_generate,
    dataset_path=DATASET_PATH,
    df_intermediate_output_name="health_dataset_preprocessed-1.csv",
    df_name_input="Sleep_health_and_lifestyle_dataset.csv",
    try_pregen=True
)

In [15]:
# Preview the index table: seat columns named g-<gondola>_p-<seat> plus `label`.
df_index.head()

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
0,264,191,98,235,35,356,253,9,147,311,...,315,265,92,23,287,17,234,223,117,68.01148
1,338,176,95,11,161,248,364,53,64,71,...,181,365,96,187,165,38,114,98,56,55.463373
2,290,99,75,21,237,287,215,213,172,214,...,76,188,274,77,264,117,254,168,65,66.108493
3,15,277,93,106,290,354,60,226,75,318,...,243,288,74,131,344,86,266,218,83,58.642507
4,190,260,34,286,360,167,251,298,256,15,...,48,258,107,5,307,121,162,7,37,62.584213


In [16]:
# Preview the person table returned by generate_ferris_dataset.
df_health.head()

Unnamed: 0,id,age,occupation,sleep_duration,sleep_quality,physical_activity,stress_level,bmi,heart_rate,daily_steps,sleep_disorder,blood_pressure1,blood_pressure2,gender_male,gender_female,gender_other
0,1,27,1,6.1,6,42,6,2,77,4200,0,126,83,True,False,False
1,2,28,2,6.2,6,60,8,0,75,10000,0,125,80,True,False,False
2,3,28,2,6.2,6,60,8,0,75,10000,0,125,80,True,False,False
3,4,28,3,5.9,4,30,8,3,85,3000,1,140,90,True,False,False
4,5,28,3,5.9,4,30,8,3,85,3000,1,140,90,True,False,False


In [17]:
# All columns except the last one (`label` in the preview above), i.e. only the
# person-id seat columns.
df_index.iloc[:, :-1]

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-0,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4
0,264,191,98,235,35,356,253,9,147,311,...,299,315,265,92,23,287,17,234,223,117
1,338,176,95,11,161,248,364,53,64,71,...,100,181,365,96,187,165,38,114,98,56
2,290,99,75,21,237,287,215,213,172,214,...,122,76,188,274,77,264,117,254,168,65
3,15,277,93,106,290,354,60,226,75,318,...,235,243,288,74,131,344,86,266,218,83
4,190,260,34,286,360,167,251,298,256,15,...,28,48,258,107,5,307,121,162,7,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,127,84,110,152,265,90,318,72,331,237,...,346,7,341,126,70,208,67,248,291,246
9996,286,97,191,288,32,268,41,347,272,169,...,194,178,269,310,79,370,108,189,340,24
9997,372,168,198,199,239,369,323,277,174,281,...,191,112,75,331,177,163,155,7,188,290
9998,154,310,224,59,87,207,97,78,290,241,...,12,338,254,183,145,22,95,247,158,231


## Total dataset creation

In [18]:
# Load the unmodified ("pure") ferris wheel dataset as an indexable dataset
# object. The dataset directory comes from the DATASET_PATH config constant
# rather than a repeated string literal.
dataset = load_pure_ferris_wheel_dataset(
    num_gondolas=num_gondolas,
    num_part_pg=num_part_pg,
    num_to_generate=num_to_generate,
    dataset_path=DATASET_PATH,
    df_intermediate_output_name="health_dataset_preprocessed-1.csv",
    df_name_input="Sleep_health_and_lifestyle_dataset.csv",
    try_pregen=True
)

In [19]:
# Inspect the first sample; the output is a tuple whose first entry is the
# feature tensor.
dataset[0]

(tensor([[4.5000e+01, 1.1000e+01, 6.9000e+00, 7.0000e+00, 5.5000e+01, 5.0000e+00,
          2.0000e+00, 7.5000e+01, 5.5000e+03, 0.0000e+00, 1.2500e+02, 8.2000e+01,
          0.0000e+00, 1.0000e+00, 0.0000e+00],
         [4.3000e+01, 4.0000e+00, 6.7000e+00, 7.0000e+00, 4.5000e+01, 4.0000e+00,
          2.0000e+00, 6.5000e+01, 6.0000e+03, 2.0000e+00, 1.3500e+02, 9.0000e+01,
          0.0000e+00, 1.0000e+00, 0.0000e+00],
         [3.6000e+01, 7.0000e+00, 7.1000e+00, 8.0000e+00, 6.0000e+01, 4.0000e+00,
          0.0000e+00, 6.8000e+01, 7.0000e+03, 0.0000e+00, 1.1500e+02, 7.5000e+01,
          0.0000e+00, 1.0000e+00, 0.0000e+00],
         [4.4000e+01, 4.0000e+00, 6.6000e+00, 7.0000e+00, 4.5000e+01, 4.0000e+00,
          2.0000e+00, 6.5000e+01, 6.0000e+03, 2.0000e+00, 1.3500e+02, 9.0000e+01,
          0.0000e+00, 1.0000e+00, 0.0000e+00],
         [3.1000e+01, 2.0000e+00, 7.7000e+00, 7.0000e+00, 7.5000e+01, 6.0000e+00,
          0.0000e+00, 7.0000e+01, 8.0000e+03, 0.0000e+00, 1.2000e+02, 8.00

## DataLoaders

In [20]:
# Fixed seed so the train/val/test partition is the same on every run.
generator = torch.Generator().manual_seed(420)
# Fractional lengths: 70 % train, 10 % validation, 20 % test.
train_ds, val_ds, test_ds = random_split(dataset, [0.7, 0.1, 0.2], generator=generator)

In [21]:
# random_split hands back Subset objects (see the repr below).
train_ds

<torch.utils.data.dataset.Subset at 0x1e9acf701c0>

In [22]:
# Mini-batches of 32 training samples, reshuffled each epoch.
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

In [23]:
# Sanity check: fetch a single batch, print the feature/label shapes, and stop.
for x, y in train_loader:
    print(f"x-shape:{x.shape}, y-shape:{y.shape}")
    break

x-shape:torch.Size([32, 50, 15]), y-shape:torch.Size([32])


## Specialize pure datasets

In [24]:
# Extend the index table with additional valid permutations. Arguments are
# passed positionally: (num_to_generate, df_index, num_gondolas).
# NOTE(review): the displayed result is about twice as long as df_index, so
# the first argument presumably is the number of rows to add -- confirm
# against etnn.data.prepare_ferris_wheel.add_valid_permutations.
df_added_valid = add_valid_permutations(
    num_to_generate,
    df_index,
    num_gondolas
)

100%|██████████| 10000/10000 [00:00<00:00, 93543.30it/s]


In [25]:
# Inspect the extended index table.
df_added_valid

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
0,264,191,98,235,35,356,253,9,147,311,...,315,265,92,23,287,17,234,223,117,68.011480
1,338,176,95,11,161,248,364,53,64,71,...,181,365,96,187,165,38,114,98,56,55.463373
2,290,99,75,21,237,287,215,213,172,214,...,76,188,274,77,264,117,254,168,65,66.108493
3,15,277,93,106,290,354,60,226,75,318,...,243,288,74,131,344,86,266,218,83,58.642507
4,190,260,34,286,360,167,251,298,256,15,...,48,258,107,5,307,121,162,7,37,62.584213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,14,204,272,195,136,217,178,137,46,197,...,263,94,345,248,227,6,102,127,301,59.765280
19996,278,282,328,303,251,265,25,243,199,365,...,342,188,40,77,39,358,34,290,335,71.027800
19997,169,160,69,298,314,234,346,350,246,338,...,173,73,277,286,70,17,180,118,137,71.582987
19998,26,358,181,46,355,83,54,366,125,155,...,277,174,80,106,260,342,194,287,371,57.986000


In [26]:
# Load the modified dataset, requesting as many added valid permutations
# (`num_valid_to_add`) as generated samples. The dataset directory comes from
# the DATASET_PATH config constant rather than a repeated string literal.
# NOTE: this rebinds `dataset`, replacing the pure dataset loaded earlier.
dataset = load_modified_ferris_wheel_dataset(
    num_gondolas=num_gondolas,
    num_part_pg=num_part_pg,
    num_to_generate=num_to_generate,
    num_valid_to_add=num_to_generate,
    dataset_path=DATASET_PATH,
    df_intermediate_output_name="health_dataset_preprocessed-1.csv",
    df_name_input="Sleep_health_and_lifestyle_dataset.csv",
    try_pregen=True
)

## Change regression task to classification

In [27]:
# Rescale the label: divide by (10 * num_gondolas), then multiply by 10. With
# num_gondolas = 10 this amounts to label / 10.
# NOTE(review): 10 * num_gondolas is presumably the maximum attainable score,
# which would map labels onto a 0-10 scale -- confirm against
# build_wheel_happyness.
temp = df_index.label/(10*num_gondolas)*10

In [28]:
# Round the rescaled labels to whole numbers (displayed only, not stored).
temp.round()

0       7.0
1       6.0
2       7.0
3       6.0
4       6.0
       ... 
9995    6.0
9996    6.0
9997    7.0
9998    7.0
9999    5.0
Name: label, Length: 10000, dtype: float64

## Problem: dataset too centered around a specific value

In [29]:
# Summary statistics of the regression label.
df_index.label.describe()

count    10000.000000
mean        63.529537
std          5.500780
min         48.837187
25%         59.621903
50%         63.253673
75%         66.986460
max         90.799333
Name: label, dtype: float64

In [30]:
# Work on an independent copy so df_index itself stays untouched
# (DataFrame.copy() is deep by default).
df_downsample = df_index.copy()
df_downsample

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
0,264,191,98,235,35,356,253,9,147,311,...,315,265,92,23,287,17,234,223,117,68.011480
1,338,176,95,11,161,248,364,53,64,71,...,181,365,96,187,165,38,114,98,56,55.463373
2,290,99,75,21,237,287,215,213,172,214,...,76,188,274,77,264,117,254,168,65,66.108493
3,15,277,93,106,290,354,60,226,75,318,...,243,288,74,131,344,86,266,218,83,58.642507
4,190,260,34,286,360,167,251,298,256,15,...,48,258,107,5,307,121,162,7,37,62.584213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,127,84,110,152,265,90,318,72,331,237,...,7,341,126,70,208,67,248,291,246,63.678893
9996,286,97,191,288,32,268,41,347,272,169,...,178,269,310,79,370,108,189,340,24,55.686920
9997,372,168,198,199,239,369,323,277,174,281,...,112,75,331,177,163,155,7,188,290,66.045467
9998,154,310,224,59,87,207,97,78,290,241,...,338,254,183,145,22,95,247,158,231,72.811613


In [31]:
# Bucket the label into classes of width 5: label / 5, rounded.
df_downsample['label_cat'] = df_downsample['label'].div(5).round()

In [32]:
# Inspect the frame with the new `label_cat` column.
df_downsample

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label,label_cat
0,264,191,98,235,35,356,253,9,147,311,...,265,92,23,287,17,234,223,117,68.011480,14.0
1,338,176,95,11,161,248,364,53,64,71,...,365,96,187,165,38,114,98,56,55.463373,11.0
2,290,99,75,21,237,287,215,213,172,214,...,188,274,77,264,117,254,168,65,66.108493,13.0
3,15,277,93,106,290,354,60,226,75,318,...,288,74,131,344,86,266,218,83,58.642507,12.0
4,190,260,34,286,360,167,251,298,256,15,...,258,107,5,307,121,162,7,37,62.584213,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,127,84,110,152,265,90,318,72,331,237,...,341,126,70,208,67,248,291,246,63.678893,13.0
9996,286,97,191,288,32,268,41,347,272,169,...,269,310,79,370,108,189,340,24,55.686920,11.0
9997,372,168,198,199,239,369,323,277,174,281,...,75,331,177,163,155,7,188,290,66.045467,13.0
9998,154,310,224,59,87,207,97,78,290,241,...,254,183,145,22,95,247,158,231,72.811613,15.0


In [33]:
# Size of the smallest `label_cat` bucket (displayed below).
n_samples = df_downsample.groupby('label_cat').size().min()
n_samples

1

In [34]:
# Flatten the label distribution: keep at most `max_per_class` rows from every
# `label_cat` bucket; buckets smaller than the cap are kept whole.
max_per_class = 10
downsampled_df = df_downsample.groupby('label_cat').apply(
    # A fixed random_state makes the drawn subset identical on every re-run;
    # without it each execution sampled different rows.
    lambda x: x.sample(min(len(x), max_per_class), random_state=420)
)
downsampled_df

Unnamed: 0_level_0,Unnamed: 1_level_0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label,label_cat
label_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10.0,8158,176,74,113,170,94,66,187,283,152,116,...,339,362,111,234,242,123,214,148,51.405973,10.0
10.0,9804,364,19,44,335,190,242,146,63,132,143,...,110,119,215,168,136,348,333,250,52.429067,10.0
10.0,2343,277,13,224,306,307,295,293,3,333,111,...,292,118,215,145,231,348,161,49,51.966147,10.0
10.0,5570,374,265,151,60,367,54,229,277,162,4,...,341,31,146,272,245,369,43,140,50.780053,10.0
10.0,6361,351,125,3,191,71,67,6,214,246,20,...,36,238,341,335,111,61,94,151,50.790893,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17.0,3558,24,134,52,183,42,34,122,143,85,137,...,123,79,326,176,55,50,125,30,84.566170,17.0
17.0,1733,91,101,360,134,87,188,163,70,26,85,...,109,75,223,22,130,350,61,117,84.667533,17.0
17.0,5103,200,247,181,49,260,342,156,340,227,287,...,186,64,353,51,109,112,84,241,82.867680,17.0
17.0,2424,106,4,199,269,110,98,129,7,203,241,...,163,234,164,71,32,179,231,244,85.504837,17.0


In [35]:
# Label statistics after downsampling, for comparison with the describe()
# output of the full index table.
downsampled_df.label.describe()

count    76.000000
mean     66.639625
std      10.500593
min      50.596480
25%      57.242400
50%      66.172760
75%      74.905291
max      90.799333
Name: label, dtype: float64

# Generate complete dataset

In [36]:
from itertools import permutations

In [37]:
# Small demo of itertools.permutations: all ordered pairs drawn from {0, 1, 2}.
list(permutations(range(3), 2))

[(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]

In [38]:
# Wheel layout used for the exhaustive enumeration attempt
num_gondolas, num_part_pg = 10, 5

In [39]:
# Number of persons to draw from; hard-coded here.
# NOTE(review): presumably equal to len(df_health) -- confirm.
n_persons = 374

In [40]:
# create all combinations
# NOTE: itertools.permutations returns a lazy iterator, so this line only
# builds the iterator object; no permutation is materialised here.
all_perms = permutations(range(1, n_persons + 1), num_gondolas*num_part_pg)

Wait... way too large! This has over $10^{89}$ possibilities, so scrap this approach.

# Generation of testset

In [41]:
# idea: want to generate elements that are valid permutations and are not in the training set

In [42]:
# Training index table that the newly sampled test rows are compared against.
df_index

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
0,264,191,98,235,35,356,253,9,147,311,...,315,265,92,23,287,17,234,223,117,68.011480
1,338,176,95,11,161,248,364,53,64,71,...,181,365,96,187,165,38,114,98,56,55.463373
2,290,99,75,21,237,287,215,213,172,214,...,76,188,274,77,264,117,254,168,65,66.108493
3,15,277,93,106,290,354,60,226,75,318,...,243,288,74,131,344,86,266,218,83,58.642507
4,190,260,34,286,360,167,251,298,256,15,...,48,258,107,5,307,121,162,7,37,62.584213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,127,84,110,152,265,90,318,72,331,237,...,7,341,126,70,208,67,248,291,246,63.678893
9996,286,97,191,288,32,268,41,347,272,169,...,178,269,310,79,370,108,189,340,24,55.686920
9997,372,168,198,199,239,369,323,277,174,281,...,112,75,331,177,163,155,7,188,290,66.045467
9998,154,310,224,59,87,207,97,78,290,241,...,338,254,183,145,22,95,247,158,231,72.811613


In [43]:
# Draw 10 new wheel configurations (`num_elem=10`).
# NOTE(review): `merge_check=True` presumably makes the helper verify that the
# sampled rows do not already occur in df_index -- confirm against
# etnn.data.prepare_ferris_wheel.sample_new_permutations.
df_test = sample_new_permutations(
    df_index=df_index,
    num_gondolas=num_gondolas,
    num_elem=10,
    merge_check=True
)

100%|██████████| 10/10 [00:00<00:00, 10005.50it/s]


In [44]:
# Inspect the freshly sampled test rows.
df_test

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
0,171,90,157,176,135,173,204,142,262,213,...,231,292,6,366,240,372,104,108,245,68.67872
1,149,372,148,164,145,187,335,374,353,61,...,239,317,346,176,312,93,96,329,34,57.2614
2,242,62,333,192,12,203,303,245,266,44,...,293,14,106,342,196,56,42,78,294,62.591667
3,166,111,143,29,351,282,120,326,46,3,...,325,162,28,84,92,34,153,137,123,66.252413
4,160,16,79,94,132,65,326,121,45,243,...,78,267,25,246,138,351,194,107,269,63.542427
5,91,239,325,287,195,292,117,268,320,334,...,21,148,43,274,211,33,169,74,168,60.318107
6,54,370,29,248,53,122,31,131,157,278,...,204,371,186,343,350,283,12,130,216,70.58172
7,319,205,136,65,311,316,302,367,117,292,...,369,224,231,211,31,374,137,277,23,62.375333
8,146,17,352,53,39,125,183,22,367,133,...,227,251,67,6,64,124,49,271,91,64.633307
9,261,361,317,282,237,249,43,353,341,128,...,203,181,19,335,206,55,277,326,324,67.247347


In [45]:
# Control case for the overlap check: the last ten rows of df_index, which are
# part of df_index by construction.
df_test2 = df_index.tail(10)
df_test2

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
9990,106,45,228,126,234,246,89,105,243,344,...,130,87,167,310,217,146,21,54,317,61.771467
9991,324,53,301,257,20,48,316,73,319,184,...,138,195,237,11,7,291,221,249,272,62.655867
9992,284,346,238,179,80,22,190,123,17,242,...,71,265,347,186,191,37,78,234,320,64.899493
9993,123,261,272,361,15,8,346,89,281,242,...,99,371,32,144,55,178,60,17,319,57.06012
9994,146,13,58,171,90,4,24,31,187,185,...,86,205,244,196,222,29,295,199,137,59.29568
9995,127,84,110,152,265,90,318,72,331,237,...,7,341,126,70,208,67,248,291,246,63.678893
9996,286,97,191,288,32,268,41,347,272,169,...,178,269,310,79,370,108,189,340,24,55.68692
9997,372,168,198,199,239,369,323,277,174,281,...,112,75,331,177,163,155,7,188,290,66.045467
9998,154,310,224,59,87,207,97,78,290,241,...,338,254,183,145,22,95,247,158,231,72.811613
9999,351,31,277,304,357,35,111,254,40,69,...,71,65,163,296,123,363,180,148,297,53.81376


In [46]:
# Left-join the control rows onto df_index using all shared columns as keys.
# The added `_merge` indicator column reports, for each row of df_test2,
# whether a matching row exists in df_index ('both') or not ('left_only').
merged = pd.merge(df_test2, df_index, how='left', indicator=True)

In [47]:
# Inspect the join result including the `_merge` indicator column.
merged

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label,_merge
0,106,45,228,126,234,246,89,105,243,344,...,87,167,310,217,146,21,54,317,61.771467,both
1,324,53,301,257,20,48,316,73,319,184,...,195,237,11,7,291,221,249,272,62.655867,both
2,284,346,238,179,80,22,190,123,17,242,...,265,347,186,191,37,78,234,320,64.899493,both
3,123,261,272,361,15,8,346,89,281,242,...,371,32,144,55,178,60,17,319,57.06012,both
4,146,13,58,171,90,4,24,31,187,185,...,205,244,196,222,29,295,199,137,59.29568,both
5,127,84,110,152,265,90,318,72,331,237,...,341,126,70,208,67,248,291,246,63.678893,both
6,286,97,191,288,32,268,41,347,272,169,...,269,310,79,370,108,189,340,24,55.68692,both
7,372,168,198,199,239,369,323,277,174,281,...,75,331,177,163,155,7,188,290,66.045467,both
8,154,310,224,59,87,207,97,78,290,241,...,254,183,145,22,95,247,158,231,72.811613,both
9,351,31,277,304,357,35,111,254,40,69,...,65,163,296,123,363,180,148,297,53.81376,both


In [48]:
# Number of control rows that were found in df_index.
int(merged['_merge'].eq('both').sum())

10

In [49]:
# Per-row merge indicator (categorical: 'left_only', 'right_only', 'both').
merged._merge

0    both
1    both
2    both
3    both
4    both
5    both
6    both
7    both
8    both
9    both
Name: _merge, dtype: category
Categories (3, object): ['left_only', 'right_only', 'both']