# Preparation of Dataset (development notebook)

## Import section

In [1]:
import numpy as np
import pandas as pd

from etnn.data.prepare_ferris_wheel import prepare_1_ferris, generate_ferris_dataset, add_valid_permutations, \
    sample_new_permutations
from etnn.data.ferris_score_helpers import build_wheel_happyness
from etnn.data.ferris_wheel import load_pure_ferris_wheel_dataset, load_modified_ferris_wheel_dataset
import torch
from torch.utils.data import random_split, DataLoader
from tqdm import tqdm

In [2]:
# Directory (relative to this notebook) that holds the dataset CSV files.
DATASET_PATH = "../datasets"

## Person dataset

In [3]:
# Load the preprocessed person table via prepare_1_ferris.
# The data directory comes from the shared DATASET_PATH constant so the location
# is configured in exactly one place.
# NOTE(review): try_pregen=True presumably reuses an already generated
# `df_name_output` file if present — confirm in etnn.data.prepare_ferris_wheel.
df_health = prepare_1_ferris(
    dataset_path=DATASET_PATH,
    df_name_output="health_dataset_preprocessed-1.csv",
    df_name_input="Sleep_health_and_lifestyle_dataset.csv",
    try_pregen=True
)

In [4]:
# Preview the first rows of the person table returned by prepare_1_ferris.
df_health.head()

Unnamed: 0,id,age,occupation,sleep_duration,sleep_quality,physical_activity,stress_level,bmi,heart_rate,daily_steps,sleep_disorder,blood_pressure1,blood_pressure2,gender_male,gender_female,gender_other
0,1,27,1,6.1,6,42,6,2,77,4200,0,126,83,True,False,False
1,2,28,2,6.2,6,60,8,0,75,10000,0,125,80,True,False,False
2,3,28,2,6.2,6,60,8,0,75,10000,0,125,80,True,False,False
3,4,28,3,5.9,4,30,8,3,85,3000,1,140,90,True,False,False
4,5,28,3,5.9,4,30,8,3,85,3000,1,140,90,True,False,False


## Gondola dataset (indexing)

In [5]:
# take group of ids from persons and state how the label for this group shall be calculated

In [6]:
# Ferris wheel layout: (number of gondolas, number of persons seated per gondola).
num_gondolas, num_part_pg = 10, 5

In [7]:
# generate sample element
# Shuffle the person ids 1..len(df_health) with a dedicated, seeded generator so
# that Restart & Run All reproduces the same sample wheel. A local Generator is
# used (instead of the global np.random state) so no other code is affected.
rng = np.random.default_rng(420)
random_order = rng.permutation(np.arange(1, len(df_health) + 1))

In [8]:
# Cut the shuffled ids into consecutive chunks: one array of num_part_pg ids
# per gondola, num_gondolas chunks in total.
example = []
for gondola_idx in range(num_gondolas):
    first = gondola_idx * num_part_pg
    example.append(random_order[first:first + num_part_pg])

In [9]:
# Show the sample wheel: a list with one array of ids per gondola.
example

[array([136, 324, 185, 204,  46]),
 array([365, 217, 367, 360, 143]),
 array([214, 108, 352,  19,   4]),
 array([183, 196,  18,  31, 152]),
 array([221, 165, 304, 216,   5]),
 array([316, 318, 161, 268, 172]),
 array([125, 112, 334, 213, 323]),
 array([278, 332, 286, 191, 330]),
 array([281, 298, 211, 366,  80]),
 array([ 20, 160, 260, 232, 369])]

In [10]:
# Same grouping as `example`, built in one step: take the first
# num_gondolas*num_part_pg shuffled ids and reshape to (num_gondolas, num_part_pg).
random_order[:num_gondolas*num_part_pg].reshape(num_gondolas, num_part_pg)

array([[136, 324, 185, 204,  46],
       [365, 217, 367, 360, 143],
       [214, 108, 352,  19,   4],
       [183, 196,  18,  31, 152],
       [221, 165, 304, 216,   5],
       [316, 318, 161, 268, 172],
       [125, 112, 334, 213, 323],
       [278, 332, 286, 191, 330],
       [281, 298, 211, 366,  80],
       [ 20, 160, 260, 232, 369]])

Rules:
- People being happy with other people in same gondola
    + An age composition that is too widely separated is bad
    + shift in gender is bad if too much, 50-50 is good or all one gender
    + same with age composition
    + sleep-deprived persons (multiplier with quality) get a subtraction and 'good sleepers' get a bonus (a sleep disorder counts as a stronger subtraction)
    + higher heart rate and pressure = joy or fear
    + composition of persons with regard to BMI: extreme values make others uncomfortable (no exception for a group consisting entirely of such persons, as too many underweight or overweight persons may be awkward as well)
- People being happy with neighboring gondolas composition
    + same age gets bonus, none gets penalty as potentially group is separated
    + a gap in the happiness index between a gondola and its neighbors causes the mean of only the neighbors to be used

In [11]:
# Score the sample wheel, passing the person table and the list-of-arrays grouping.
build_wheel_happyness(df_health, example)

62.27214666666666

In [12]:
# Same score, but with the grouping passed as a single 2-D array of shape
# (num_gondolas, num_part_pg) instead of a list of arrays.
build_wheel_happyness(df_health, random_order[:num_gondolas * num_part_pg].reshape(num_gondolas, num_part_pg))

62.27214666666666

In [13]:
# Dataset size (number of wheels to generate), followed by the ferris wheel
# layout: number of gondolas and persons per gondola.
num_to_generate, num_gondolas, num_part_pg = 10_000, 10, 5

In [14]:
# Generate the index table (one row of person ids per wheel plus its label)
# together with the person table. Note that `df_health` is rebound here.
# The data directory comes from the shared DATASET_PATH constant so the location
# is configured in exactly one place.
df_index, df_health = generate_ferris_dataset(
    num_gondolas=num_gondolas,
    num_part_pg=num_part_pg,
    num_to_generate=num_to_generate,
    dataset_path=DATASET_PATH,
    df_intermediate_output_name="health_dataset_preprocessed-1.csv",
    df_name_input="Sleep_health_and_lifestyle_dataset.csv",
    try_pregen=True
)

100%|██████████| 10000/10000 [00:00<00:00, 139473.07it/s]
100%|██████████| 10000/10000 [00:25<00:00, 392.18it/s]


In [15]:
# Preview the index table: one `g-<gondola>_p-<seat>` id column per seat, then `label`.
df_index.head()

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
0,264,191,98,235,35,356,253,9,147,311,...,315,265,92,23,287,17,234,223,117,53.81376
1,338,176,95,11,161,248,364,53,64,71,...,181,365,96,187,165,38,114,98,56,53.81376
2,290,99,75,21,237,287,215,213,172,214,...,76,188,274,77,264,117,254,168,65,53.81376
3,15,277,93,106,290,354,60,226,75,318,...,243,288,74,131,344,86,266,218,83,53.81376
4,190,260,34,286,360,167,251,298,256,15,...,48,258,107,5,307,121,162,7,37,53.81376


In [16]:
# Preview the person table returned by generate_ferris_dataset.
df_health.head()

Unnamed: 0,id,age,occupation,sleep_duration,sleep_quality,physical_activity,stress_level,bmi,heart_rate,daily_steps,sleep_disorder,blood_pressure1,blood_pressure2,gender_male,gender_female,gender_other
0,1,27,1,6.1,6,42,6,2,77,4200,0,126,83,True,False,False
1,2,28,2,6.2,6,60,8,0,75,10000,0,125,80,True,False,False
2,3,28,2,6.2,6,60,8,0,75,10000,0,125,80,True,False,False
3,4,28,3,5.9,4,30,8,3,85,3000,1,140,90,True,False,False
4,5,28,3,5.9,4,30,8,3,85,3000,1,140,90,True,False,False


In [17]:
# Every column except the last one (`label`), i.e. only the person-id columns.
df_index.iloc[:, :-1]

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-0,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4
0,264,191,98,235,35,356,253,9,147,311,...,299,315,265,92,23,287,17,234,223,117
1,338,176,95,11,161,248,364,53,64,71,...,100,181,365,96,187,165,38,114,98,56
2,290,99,75,21,237,287,215,213,172,214,...,122,76,188,274,77,264,117,254,168,65
3,15,277,93,106,290,354,60,226,75,318,...,235,243,288,74,131,344,86,266,218,83
4,190,260,34,286,360,167,251,298,256,15,...,28,48,258,107,5,307,121,162,7,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,127,84,110,152,265,90,318,72,331,237,...,346,7,341,126,70,208,67,248,291,246
9996,286,97,191,288,32,268,41,347,272,169,...,194,178,269,310,79,370,108,189,340,24
9997,372,168,198,199,239,369,323,277,174,281,...,191,112,75,331,177,163,155,7,188,290
9998,154,310,224,59,87,207,97,78,290,241,...,12,338,254,183,145,22,95,247,158,231


## Total dataset creation

In [18]:
# Build the dataset object for the unmodified ("pure") ferris wheel data.
# The data directory comes from the shared DATASET_PATH constant so the location
# is configured in exactly one place.
dataset = load_pure_ferris_wheel_dataset(
    num_gondolas=num_gondolas,
    num_part_pg=num_part_pg,
    num_to_generate=num_to_generate,
    dataset_path=DATASET_PATH,
    df_intermediate_output_name="health_dataset_preprocessed-1.csv",
    df_name_input="Sleep_health_and_lifestyle_dataset.csv",
    try_pregen=True
)

In [19]:
# Inspect the first sample of the dataset (printed as a tuple starting with a feature tensor).
dataset[0]

(tensor([[4.5000e+01, 1.1000e+01, 6.9000e+00, 7.0000e+00, 5.5000e+01, 5.0000e+00,
          2.0000e+00, 7.5000e+01, 5.5000e+03, 0.0000e+00, 1.2500e+02, 8.2000e+01,
          0.0000e+00, 1.0000e+00, 0.0000e+00],
         [4.3000e+01, 4.0000e+00, 6.7000e+00, 7.0000e+00, 4.5000e+01, 4.0000e+00,
          2.0000e+00, 6.5000e+01, 6.0000e+03, 2.0000e+00, 1.3500e+02, 9.0000e+01,
          0.0000e+00, 1.0000e+00, 0.0000e+00],
         [3.6000e+01, 7.0000e+00, 7.1000e+00, 8.0000e+00, 6.0000e+01, 4.0000e+00,
          0.0000e+00, 6.8000e+01, 7.0000e+03, 0.0000e+00, 1.1500e+02, 7.5000e+01,
          0.0000e+00, 1.0000e+00, 0.0000e+00],
         [4.4000e+01, 4.0000e+00, 6.6000e+00, 7.0000e+00, 4.5000e+01, 4.0000e+00,
          2.0000e+00, 6.5000e+01, 6.0000e+03, 2.0000e+00, 1.3500e+02, 9.0000e+01,
          0.0000e+00, 1.0000e+00, 0.0000e+00],
         [3.1000e+01, 2.0000e+00, 7.7000e+00, 7.0000e+00, 7.5000e+01, 6.0000e+00,
          0.0000e+00, 7.0000e+01, 8.0000e+03, 0.0000e+00, 1.2000e+02, 8.00

## DataLoaders

In [20]:
# Reproducible 70 % / 10 % / 20 % train / validation / test split, driven by a
# dedicated generator with a fixed seed.
generator = torch.Generator()
generator.manual_seed(420)
split_fractions = [0.7, 0.1, 0.2]
train_ds, val_ds, test_ds = random_split(dataset, split_fractions, generator=generator)

In [21]:
# Show the training split object produced by random_split.
train_ds

<torch.utils.data.dataset.Subset at 0x1c2b3ce6a30>

In [22]:
# Loader over the training split: batches of 32 samples, with shuffling enabled.
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

In [23]:
# Peek at a single batch to verify the tensor shapes; nothing is printed when
# the loader yields no batches.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    x, y = first_batch
    print(f"x-shape:{x.shape}, y-shape:{y.shape}")

x-shape:torch.Size([32, 50, 15]), y-shape:torch.Size([32])


## Specialize pure datasets

In [24]:
# Extend the index table with additional rows via add_valid_permutations.
# NOTE(review): arguments are positional; presumably (number of rows to add,
# existing index table, number of gondolas) — confirm against
# etnn.data.prepare_ferris_wheel.
df_added_valid = add_valid_permutations(
    num_to_generate,
    df_index,
    num_gondolas
)

100%|██████████| 10000/10000 [00:00<00:00, 79958.97it/s]


In [25]:
# Display the extended index table returned by add_valid_permutations.
df_added_valid

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
0,264,191,98,235,35,356,253,9,147,311,...,315,265,92,23,287,17,234,223,117,53.81376
1,338,176,95,11,161,248,364,53,64,71,...,181,365,96,187,165,38,114,98,56,53.81376
2,290,99,75,21,237,287,215,213,172,214,...,76,188,274,77,264,117,254,168,65,53.81376
3,15,277,93,106,290,354,60,226,75,318,...,243,288,74,131,344,86,266,218,83,53.81376
4,190,260,34,286,360,167,251,298,256,15,...,48,258,107,5,307,121,162,7,37,53.81376
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,14,204,272,195,136,217,178,137,46,197,...,263,94,345,248,227,6,102,127,301,53.81376
19996,278,282,328,303,251,265,25,243,199,365,...,342,188,40,77,39,358,34,290,335,53.81376
19997,169,160,69,298,314,234,346,350,246,338,...,173,73,277,286,70,17,180,118,137,53.81376
19998,26,358,181,46,355,83,54,366,125,155,...,277,174,80,106,260,342,194,287,371,53.81376


In [26]:
# Build the modified dataset; `dataset` is rebound here. num_valid_to_add is set
# to the same value as num_to_generate.
# The data directory comes from the shared DATASET_PATH constant so the location
# is configured in exactly one place.
dataset = load_modified_ferris_wheel_dataset(
    num_gondolas=num_gondolas,
    num_part_pg=num_part_pg,
    num_to_generate=num_to_generate,
    num_valid_to_add=num_to_generate,
    dataset_path=DATASET_PATH,
    df_intermediate_output_name="health_dataset_preprocessed-1.csv",
    df_name_input="Sleep_health_and_lifestyle_dataset.csv",
    try_pregen=True
)

100%|██████████| 10000/10000 [00:00<00:00, 97282.01it/s]
100%|██████████| 1000/1000 [00:00<00:00, 94338.82it/s]


## Change regression task to classification

In [27]:
# Rescale the label: label / (10 * num_gondolas) * 10, algebraically label / num_gondolas.
# NOTE(review): presumably meant to map the score onto a 0-10 scale — confirm the label range.
temp = df_index.label/(10*num_gondolas)*10

In [28]:
# Round the rescaled labels to whole numbers as candidate class ids
# (display only; `temp` itself is not modified).
temp.round()

0       5.0
1       5.0
2       5.0
3       5.0
4       5.0
       ... 
9995    5.0
9996    5.0
9997    5.0
9998    5.0
9999    5.0
Name: label, Length: 10000, dtype: float64

## Problem: dataset too centered around a specific value

In [29]:
# Summary statistics of the label column, to check how widely the labels are spread.
df_index.label.describe()

count    1.000000e+04
mean     5.381376e+01
std      5.641991e-12
min      5.381376e+01
25%      5.381376e+01
50%      5.381376e+01
75%      5.381376e+01
max      5.381376e+01
Name: label, dtype: float64

In [30]:
# Work on an independent (deep) copy so that df_index stays untouched;
# DataFrame.copy() is deep by default.
df_downsample = df_index.copy()
df_downsample

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
0,264,191,98,235,35,356,253,9,147,311,...,315,265,92,23,287,17,234,223,117,53.81376
1,338,176,95,11,161,248,364,53,64,71,...,181,365,96,187,165,38,114,98,56,53.81376
2,290,99,75,21,237,287,215,213,172,214,...,76,188,274,77,264,117,254,168,65,53.81376
3,15,277,93,106,290,354,60,226,75,318,...,243,288,74,131,344,86,266,218,83,53.81376
4,190,260,34,286,360,167,251,298,256,15,...,48,258,107,5,307,121,162,7,37,53.81376
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,127,84,110,152,265,90,318,72,331,237,...,7,341,126,70,208,67,248,291,246,53.81376
9996,286,97,191,288,32,268,41,347,272,169,...,178,269,310,79,370,108,189,340,24,53.81376
9997,372,168,198,199,239,369,323,277,174,281,...,112,75,331,177,163,155,7,188,290,53.81376
9998,154,310,224,59,87,207,97,78,290,241,...,338,254,183,145,22,95,247,158,231,53.81376


In [31]:
# Bucket the labels: divide by 5 and round, storing the bucket id as `label_cat`.
df_downsample['label_cat'] = df_downsample['label'].div(5).round()

In [32]:
# Display the copy including the newly added `label_cat` column.
df_downsample

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label,label_cat
0,264,191,98,235,35,356,253,9,147,311,...,265,92,23,287,17,234,223,117,53.81376,11.0
1,338,176,95,11,161,248,364,53,64,71,...,365,96,187,165,38,114,98,56,53.81376,11.0
2,290,99,75,21,237,287,215,213,172,214,...,188,274,77,264,117,254,168,65,53.81376,11.0
3,15,277,93,106,290,354,60,226,75,318,...,288,74,131,344,86,266,218,83,53.81376,11.0
4,190,260,34,286,360,167,251,298,256,15,...,258,107,5,307,121,162,7,37,53.81376,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,127,84,110,152,265,90,318,72,331,237,...,341,126,70,208,67,248,291,246,53.81376,11.0
9996,286,97,191,288,32,268,41,347,272,169,...,269,310,79,370,108,189,340,24,53.81376,11.0
9997,372,168,198,199,239,369,323,277,174,281,...,75,331,177,163,155,7,188,290,53.81376,11.0
9998,154,310,224,59,87,207,97,78,290,241,...,254,183,145,22,95,247,158,231,53.81376,11.0


In [33]:
# Size of the smallest `label_cat` bucket.
bucket_sizes = df_downsample['label_cat'].value_counts()
n_samples = bucket_sizes.min()
n_samples

10000

In [34]:
# Draw at most SAMPLES_PER_CLASS rows from every `label_cat` bucket. A fixed
# random_state makes the drawn rows reproducible across kernel restarts.
# NOTE(review): the smallest-bucket size computed as `n_samples` is not used
# here; confirm whether the fixed cap of 10 is intended.
SAMPLES_PER_CLASS = 10
downsampled_df = df_downsample.groupby('label_cat').apply(
    lambda bucket: bucket.sample(min(len(bucket), SAMPLES_PER_CLASS), random_state=420)
)
downsampled_df

Unnamed: 0_level_0,Unnamed: 1_level_0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label,label_cat
label_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
11.0,4906,20,171,374,254,221,249,38,204,276,2,...,45,178,153,13,321,345,24,146,53.81376,11.0
11.0,5100,337,230,294,174,47,279,199,140,110,183,...,97,46,229,224,241,292,102,86,53.81376,11.0
11.0,8358,75,160,304,18,15,46,267,252,247,34,...,343,61,50,308,28,230,63,73,53.81376,11.0
11.0,6965,22,1,195,97,15,152,10,16,143,41,...,313,77,299,63,129,372,64,228,53.81376,11.0
11.0,4485,150,129,194,145,147,101,103,269,143,197,...,120,63,21,326,364,239,53,48,53.81376,11.0
11.0,1594,11,260,192,356,190,293,39,95,183,302,...,233,296,300,327,240,350,166,276,53.81376,11.0
11.0,4953,313,171,49,100,308,80,43,259,267,341,...,271,85,325,238,373,122,338,164,53.81376,11.0
11.0,3056,212,207,101,276,326,324,184,206,37,209,...,129,204,80,136,345,365,330,193,53.81376,11.0
11.0,4724,274,154,150,40,85,368,110,29,211,226,...,12,243,71,261,225,11,7,251,53.81376,11.0
11.0,9030,239,101,363,176,303,234,293,97,132,83,...,251,73,192,226,6,16,2,288,53.81376,11.0


In [35]:
# Summary statistics of the labels that remain after downsampling.
downsampled_df.label.describe()

count    10.00000
mean     53.81376
std       0.00000
min      53.81376
25%      53.81376
50%      53.81376
75%      53.81376
max      53.81376
Name: label, dtype: float64

# Generate complete dataset

In [36]:
from itertools import permutations

In [37]:
# Small demo of itertools.permutations: all ordered pairs of distinct elements from {0, 1, 2}.
list(permutations(range(3), 2))

[(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]

In [38]:
# Ferris wheel layout: (number of gondolas, number of persons seated per gondola).
num_gondolas, num_part_pg = 10, 5

In [39]:
# Number of persons available for seating, derived from the person table
# (the same len(df_health) the sample-wheel cell uses) rather than a
# hard-coded row count that silently goes stale when the input data changes.
n_persons = len(df_health)

In [40]:
# create all combinations
# itertools.permutations is lazy: this only creates the iterator over all ordered
# selections of num_gondolas*num_part_pg ids out of 1..n_persons; nothing is
# enumerated until the iterator is consumed.
all_perms = permutations(range(1, n_persons + 1), num_gondolas*num_part_pg)

Wait... way too large! This has over 10^89 possibilities (374!/324! is roughly 10^127 ordered selections) - scrap this approach.

# Generation of testset

In [41]:
# idea: want to generate elements that are valid permutations and are not in the training set

In [42]:
# Show the existing index table for reference before sampling new rows.
df_index

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
0,264,191,98,235,35,356,253,9,147,311,...,315,265,92,23,287,17,234,223,117,53.81376
1,338,176,95,11,161,248,364,53,64,71,...,181,365,96,187,165,38,114,98,56,53.81376
2,290,99,75,21,237,287,215,213,172,214,...,76,188,274,77,264,117,254,168,65,53.81376
3,15,277,93,106,290,354,60,226,75,318,...,243,288,74,131,344,86,266,218,83,53.81376
4,190,260,34,286,360,167,251,298,256,15,...,48,258,107,5,307,121,162,7,37,53.81376
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,127,84,110,152,265,90,318,72,331,237,...,7,341,126,70,208,67,248,291,246,53.81376
9996,286,97,191,288,32,268,41,347,272,169,...,178,269,310,79,370,108,189,340,24,53.81376
9997,372,168,198,199,239,369,323,277,174,281,...,112,75,331,177,163,155,7,188,290,53.81376
9998,154,310,224,59,87,207,97,78,290,241,...,338,254,183,145,22,95,247,158,231,53.81376


In [43]:
# Sample num_elem=10 rows via sample_new_permutations, passing the existing index table.
# NOTE(review): merge_check=True presumably rejects candidates that already
# occur in df_index — confirm against etnn.data.prepare_ferris_wheel.
df_test = sample_new_permutations(
    df_index=df_index,
    num_gondolas=num_gondolas,
    num_elem=10,
    merge_check=True
)

100%|██████████| 10/10 [00:00<?, ?it/s]


In [44]:
# Display the rows returned by sample_new_permutations.
df_test

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
0,371,265,363,162,182,195,94,359,97,206,...,101,237,296,142,226,368,198,38,331,53.81376
1,354,372,364,236,296,187,58,303,304,186,...,213,328,116,85,69,246,335,313,140,53.81376
2,146,33,147,58,195,271,328,157,69,80,...,167,105,24,374,362,36,371,357,369,53.81376
3,246,332,329,22,337,291,114,342,165,286,...,206,217,118,212,262,48,234,139,233,53.81376
4,355,13,22,275,56,131,76,320,313,198,...,260,53,122,93,187,276,219,153,263,53.81376
5,101,158,85,141,192,26,255,156,245,177,...,230,312,372,367,181,306,348,189,62,53.81376
6,129,345,279,230,354,25,128,237,311,4,...,166,321,198,127,304,1,153,23,262,53.81376
7,360,282,38,138,127,117,212,325,164,7,...,192,372,277,99,362,46,335,293,185,53.81376
8,202,8,25,228,128,136,90,299,220,310,...,205,161,147,238,36,113,66,81,305,53.81376
9,318,95,294,259,120,105,288,167,268,250,...,126,135,181,212,219,228,82,77,24,53.81376


In [45]:
# Take the last ten rows of the index table; they are part of df_index by
# construction and therefore serve as known duplicates.
df_test2 = df_index.tail(10)
df_test2

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-1,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label
9990,106,45,228,126,234,246,89,105,243,344,...,130,87,167,310,217,146,21,54,317,53.81376
9991,324,53,301,257,20,48,316,73,319,184,...,138,195,237,11,7,291,221,249,272,53.81376
9992,284,346,238,179,80,22,190,123,17,242,...,71,265,347,186,191,37,78,234,320,53.81376
9993,123,261,272,361,15,8,346,89,281,242,...,99,371,32,144,55,178,60,17,319,53.81376
9994,146,13,58,171,90,4,24,31,187,185,...,86,205,244,196,222,29,295,199,137,53.81376
9995,127,84,110,152,265,90,318,72,331,237,...,7,341,126,70,208,67,248,291,246,53.81376
9996,286,97,191,288,32,268,41,347,272,169,...,178,269,310,79,370,108,189,340,24,53.81376
9997,372,168,198,199,239,369,323,277,174,281,...,112,75,331,177,163,155,7,188,290,53.81376
9998,154,310,224,59,87,207,97,78,290,241,...,338,254,183,145,22,95,247,158,231,53.81376
9999,351,31,277,304,357,35,111,254,40,69,...,71,65,163,296,123,363,180,148,297,53.81376


In [46]:
# Left-join the candidate rows against the index table. With no `on` given,
# pandas joins on all shared columns; indicator=True adds a `_merge` column
# saying whether each row was found on the right side as well ('both').
merged = pd.merge(df_test2, df_index, how='left', indicator=True)

In [47]:
# Display the join result including the `_merge` indicator column.
merged

Unnamed: 0,g-0_p-0,g-0_p-1,g-0_p-2,g-0_p-3,g-0_p-4,g-1_p-0,g-1_p-1,g-1_p-2,g-1_p-3,g-1_p-4,...,g-8_p-2,g-8_p-3,g-8_p-4,g-9_p-0,g-9_p-1,g-9_p-2,g-9_p-3,g-9_p-4,label,_merge
0,106,45,228,126,234,246,89,105,243,344,...,87,167,310,217,146,21,54,317,53.81376,both
1,324,53,301,257,20,48,316,73,319,184,...,195,237,11,7,291,221,249,272,53.81376,both
2,284,346,238,179,80,22,190,123,17,242,...,265,347,186,191,37,78,234,320,53.81376,both
3,123,261,272,361,15,8,346,89,281,242,...,371,32,144,55,178,60,17,319,53.81376,both
4,146,13,58,171,90,4,24,31,187,185,...,205,244,196,222,29,295,199,137,53.81376,both
5,127,84,110,152,265,90,318,72,331,237,...,341,126,70,208,67,248,291,246,53.81376,both
6,286,97,191,288,32,268,41,347,272,169,...,269,310,79,370,108,189,340,24,53.81376,both
7,372,168,198,199,239,369,323,277,174,281,...,75,331,177,163,155,7,188,290,53.81376,both
8,154,310,224,59,87,207,97,78,290,241,...,254,183,145,22,95,247,158,231,53.81376,both
9,351,31,277,304,357,35,111,254,40,69,...,65,163,296,123,363,180,148,297,53.81376,both


In [48]:
# Number of candidate rows that also occur in the index table.
(merged['_merge'] == 'both').sum()

10

In [49]:
# Per-row merge indicator (categorical: 'left_only', 'right_only' or 'both').
merged._merge

0    both
1    both
2    both
3    both
4    both
5    both
6    both
7    both
8    both
9    both
Name: _merge, dtype: category
Categories (3, object): ['left_only', 'right_only', 'both']