In [29]:
import pandas as pd
from pathlib import Path
from itertools import product
from tqdm import tqdm
import random

## Loading the data

In [30]:
# Load the standardized accelerometer windows and cast the identifier columns
# to integers in one vectorized step (instead of a Python-level lambda per row).
# The cast is applied on the freshly read frame, so re-running the cell always
# yields the same result.
standartized_unbalanced_data = pd.read_csv(
    '../data/authentication/unbalanced/RecodGait_v2/standartized_unbalanced.csv'
).astype({'user': 'int64', 'session': 'int64'})
standartized_unbalanced_data

Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,accel-z-57,accel-z-58,accel-z-59,user,level_0,timestamp diff,accel-start-time,index,session,window
0,-0.380865,-0.490814,-0.138980,-0.540292,-1.029622,0.151683,-0.271362,0.093922,0.233053,0.885288,...,-4.849618,-1.033230,-0.527983,1,0.0,0.025,0.025,1.0,1,0.0
1,0.759736,6.453595,3.141979,-1.764701,0.312573,-1.741856,-0.636970,-0.619488,-0.731601,-0.936111,...,0.786765,2.035368,2.023028,1,60.0,0.025,1.525,1.0,1,1.0
2,0.090258,0.748776,0.118458,0.980351,0.895728,1.314015,3.890168,-0.596918,-1.771354,0.153991,...,0.217113,-0.324870,0.715738,1,120.0,0.025,3.025,1.0,1,2.0
3,1.180711,0.123304,-0.242732,-2.090366,-3.026408,-2.704246,-1.275486,-1.903105,-0.559540,1.222120,...,1.849099,1.107216,-2.741191,1,180.0,0.025,4.525,1.0,1,3.0
4,1.504804,-3.261596,-1.013031,-0.832169,6.022226,1.731795,-2.193926,0.393643,2.146832,-0.179159,...,2.350917,1.821752,0.778877,1,240.0,0.025,6.025,1.0,1,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27725,4.407922,2.885707,-1.464764,-3.105162,0.814450,-0.823310,-0.953835,-0.914520,-1.152465,-1.445332,...,-0.089251,-0.728952,-3.189619,144,3351292.0,0.025,141.025,280.0,3,94.0
27726,-2.558267,-1.422138,0.176106,0.849109,-0.541233,-0.348314,-2.053597,-1.504994,2.248131,-0.465023,...,7.003553,2.891149,-2.005648,144,3351352.0,0.025,142.525,280.0,3,95.0
27727,-1.092603,-4.081088,0.190277,-0.199893,-2.203307,-1.567213,-0.137114,0.511935,0.349388,-2.119933,...,-1.870522,0.583068,-2.984589,144,3351412.0,0.025,144.025,280.0,3,96.0
27728,-1.597260,0.267185,0.074118,-0.914575,-0.358683,0.152376,-0.300467,4.109457,3.517375,-4.086155,...,-8.364679,-0.525942,0.958939,144,3351472.0,0.025,145.525,280.0,3,97.0


## Checking the samples per session

In [31]:
# One row per user, one column per session id; each cell holds the number of
# windows recorded for that (user, session) pair, 0 when the session is absent.
data_sessions = standartized_unbalanced_data.groupby(['user', 'session']).size().unstack(fill_value=0)
# Number of sessions in which the user has at least one window. Counting the
# non-zero cells directly avoids hard-coding the total number of session columns.
# The right-hand side is evaluated before 'S-count' is added, so the new column
# never counts itself.
data_sessions['S-count'] = data_sessions.gt(0).sum(axis=1)
# Partition users by how many sessions they recorded:
#   3 or 5 sessions -> train, 4 sessions -> validation, 2 sessions -> test.
# Users with any other session count end up in no partition.
train_data_sessions = data_sessions[data_sessions['S-count'].isin([3, 5])]
val_data_sessions = data_sessions[data_sessions['S-count'] == 4]
test_data_sessions = data_sessions[data_sessions['S-count'] == 2]

In [32]:
# The user ids of each partition are the row labels of its session-count table.
train_user_ids, val_user_ids, test_user_ids = (
    partition_sessions.index
    for partition_sessions in (train_data_sessions, val_data_sessions, test_data_sessions)
)

In [33]:
# Per-session window counts of the users assigned to the train partition
train_data_sessions

session,1,3,5,7,9,S-count
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,99,99,99,0,0,3
5,99,99,99,99,99,5
7,99,99,99,0,0,3
11,99,99,99,99,99,5
14,99,99,99,0,0,3
17,99,99,99,0,0,3
21,99,99,99,0,0,3
25,99,99,99,0,0,3
27,99,99,99,0,0,3
29,99,99,99,0,0,3


In [34]:
# Per-session window counts of the users assigned to the validation partition
val_data_sessions

session,1,3,5,7,9,S-count
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,99,99,99,99,0,4
3,99,99,99,99,0,4
8,99,99,99,99,0,4
9,99,99,99,99,0,4
13,99,99,99,99,0,4
22,99,99,99,99,0,4
28,99,99,99,99,0,4
32,99,99,99,99,0,4
36,100,99,99,99,0,4
50,99,99,99,99,0,4


In [35]:
# Per-session window counts of the users assigned to the test partition
test_data_sessions

session,1,3,5,7,9,S-count
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4,99,99,0,0,0,2
10,99,99,0,0,0,2
12,99,99,0,0,0,2
15,99,99,0,0,0,2
16,99,99,0,0,0,2
...,...,...,...,...,...,...
139,99,99,0,0,0,2
140,99,99,0,0,0,2
141,99,100,0,0,0,2
143,99,99,0,0,0,2


In [36]:
# User ids in the train partition
train_user_ids

Index([ 2,  5,  7, 11, 14, 17, 21, 25, 27, 29, 30, 34, 37, 39, 40, 42, 43, 46,
       52, 53, 54, 58, 77],
      dtype='int64', name='user')

In [37]:
# User ids in the validation partition
val_user_ids

Index([1, 3, 8, 9, 13, 22, 28, 32, 36, 50], dtype='int64', name='user')

In [38]:
# User ids in the test partition
test_user_ids

Index([  4,  10,  12,  15,  16,  18,  19,  20,  23,  24,  26,  31,  33,  35,
        38,  41,  44,  47,  49,  51,  55,  56,  57,  59,  60,  61,  62,  64,
        65,  66,  67,  68,  70,  72,  73,  75,  76,  80,  89,  90,  99, 100,
       101, 103, 104, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130, 131,
       132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 143, 144],
      dtype='int64', name='user')

In [39]:
# Row labels of every (user, session) group, collected in a single pass over the
# frame instead of re-filtering the whole frame for each user and session.
# sort=False keeps the groups in order of first appearance, so each user's
# sessions are listed in the same order `.unique()` would return them.
session_rows_by_user = {}
for (user_id, session), session_rows in standartized_unbalanced_data.groupby(['user', 'session'], sort=False):
    session_rows_by_user.setdefault(int(user_id), {})[int(session)] = [int(val) for val in session_rows.index]

# Nested lookup: partition name -> user id -> session id -> list of row labels
# (index values of `standartized_unbalanced_data`) belonging to that session.
data_dict = {
    partition: {
        user_id: session_rows_by_user.get(user_id, {})
        for user_id in user_ids
    }
    for partition, user_ids in zip(['train', 'validation', 'test'], [train_user_ids, val_user_ids, test_user_ids])
}

In [40]:
# Example: the session ids for the user 2, from the train partition
partition = 'train'
user_id = 2
# Index with the variables defined above so that changing them changes the example.
list(data_dict[partition][user_id].keys())

[1, 3, 5]

In [None]:
def generate_pairs_from_rg_partition(partition: str, data_dict: dict, positive_pairs_per_user: int=1000,
                                     negative_pairs_ratio: int=5, seed: "int | None"=None) -> pd.DataFrame:
    """Randomly sample genuine ('+') and impostor ('-') sample-index pairs for one partition.

    Parameters
    ----------
    partition : str
        Key of ``data_dict`` to draw from (e.g. 'train', 'validation', 'test').
        For 'test', impostor pairs always use session 1 of the user and
        session 3 of the impostor; other partitions pick sessions at random.
    data_dict : dict
        Mapping ``partition -> user id -> session id -> list of sample indices``.
    positive_pairs_per_user : int
        Number of genuine pairs drawn per user. Both samples of a genuine pair
        come from two different sessions of the same user.
    negative_pairs_ratio : int
        Impostor pairs drawn per user, as a multiple of ``positive_pairs_per_user``.
    seed : int or None
        When given, a private ``random.Random(seed)`` is used so the result is
        reproducible. When None, the global ``random`` module state is used.

    Returns
    -------
    pd.DataFrame
        One row per pair with columns ``user1_id``, ``user2_id``,
        ``user1_session_id``, ``user2_session_id``, ``index_sample_1``,
        ``index_sample_2`` and ``type`` ('+' or '-').

    Raises
    ------
    ValueError
        If genuine pairs are requested for a user with fewer than two sessions.
    """
    # `random` (module) and `random.Random` expose the same sample/choice API.
    rng = random if seed is None else random.Random(seed)
    negative_pairs_per_user = positive_pairs_per_user * negative_pairs_ratio
    is_test_partition = partition == 'test'

    partition_data_dict = data_dict[partition]
    users_ids = list(partition_data_dict.keys())
    # Session ids of every user, listed once instead of on every sampled pair.
    session_ids_by_user = {
        uid: list(sessions.keys()) for uid, sessions in partition_data_dict.items()
    }

    pairs = []
    for user_id in users_ids:
        user_sessions = partition_data_dict[user_id]
        unique_session_ids = session_ids_by_user[user_id]

        # Positive pairs: two samples of the same user from two distinct sessions
        if positive_pairs_per_user > 0 and len(unique_session_ids) < 2:
            raise ValueError(
                f'User {user_id} in partition {partition!r} has {len(unique_session_ids)} session(s); '
                'at least 2 are required to build cross-session positive pairs'
            )
        for _ in range(positive_pairs_per_user):
            random_session_id_1, random_session_id_2 = rng.sample(unique_session_ids, k=2)
            user_sample_index_1 = rng.choice(user_sessions[random_session_id_1])
            user_sample_index_2 = rng.choice(user_sessions[random_session_id_2])
            pairs.append({
                'user1_id': user_id,
                'user2_id': user_id,
                'user1_session_id': random_session_id_1,
                'user2_session_id': random_session_id_2,
                'index_sample_1': user_sample_index_1,
                'index_sample_2': user_sample_index_2,
                'type': '+'
            })

        # Negative pairs: one sample of the user against one sample of another user
        other_users_ids = [other_user_id for other_user_id in users_ids if user_id != other_user_id]
        for _ in range(negative_pairs_per_user):
            # Original user sample selection (the test partition is pinned to session 1)
            user_session_id = 1 if is_test_partition else rng.choice(unique_session_ids)
            user_sample_index = rng.choice(user_sessions[user_session_id])
            # Impostor user sample selection (the test partition is pinned to session 3)
            random_impostor_user_id = rng.choice(other_users_ids)
            random_impostor_session_id = (
                3 if is_test_partition else rng.choice(session_ids_by_user[random_impostor_user_id])
            )
            random_impostor_sample_index = rng.choice(
                partition_data_dict[random_impostor_user_id][random_impostor_session_id]
            )
            pairs.append({
                'user1_id': user_id,
                'user2_id': random_impostor_user_id,
                'user1_session_id': user_session_id,
                'user2_session_id': random_impostor_session_id,
                'index_sample_1': user_sample_index,
                'index_sample_2': random_impostor_sample_index,
                'type': '-'
            })
    return pd.DataFrame(pairs)
    

In [66]:
# Seed the global RNG so the sampled pairs (and the CSVs written from them) are
# the same on every Restart & Run All. The value itself is arbitrary.
random.seed(42)
# Use the pair generator defined in this notebook; the previously called
# `process_partition_all_against_all` is not defined anywhere in it.
train_index_df = generate_pairs_from_rg_partition('train', data_dict)
val_index_df = generate_pairs_from_rg_partition('validation', data_dict)
test_index_df = generate_pairs_from_rg_partition('test', data_dict)

In [67]:
# Write each partition's index pairs to its CSV file (row index not stored).
for pairs_df, pairs_csv_path in (
    (train_index_df, '../data/authentication/unbalanced/RecodGait_v2/train_index_pairs.csv'),
    (val_index_df, '../data/authentication/unbalanced/RecodGait_v2/validation_index_pairs.csv'),
    (test_index_df, '../data/authentication/unbalanced/RecodGait_v2/test_index_pairs.csv'),
):
    pairs_df.to_csv(pairs_csv_path, index=False)

In [68]:
# Reload the exported test pairs and dump the number of pairs per
# (second user, pair type) to a scratch CSV for inspection.
test_pairs_csv_path = '../data/authentication/unbalanced/RecodGait_v2/test_index_pairs.csv'
index_data = pd.read_csv(test_pairs_csv_path)
pair_counts = index_data.groupby(['user2_id', 'type']).size()
pair_counts.to_csv('temp.csv')

In [None]:

# Negative pairs in which both samples come from session 1
is_negative_pair = index_data['type'] == '-'
both_from_session_1 = (index_data['user1_session_id'] == 1) & (index_data['user2_session_id'] == 1)
index_data[is_negative_pair & both_from_session_1]

Unnamed: 0,user1_id,user2_id,user1_session_id,user2_session_id,index_sample_1,index_sample_2,type
