In [1]:
from tqdm import tqdm

import pandas as pd
import numpy as np

import os

In [2]:
ROOT_PATH = '/home/hice1/cclark339/scratch/Data/single_nuc_1/'
DATA_PATH = ROOT_PATH + 'all_data_new_features_updated.csv'

TRAIN_VAL_BASE_PATH = ROOT_PATH + 'raw_data/training_data/YOLOV5_Cls_Manual_Videos/'

TRAIN_MID_PATH = TRAIN_VAL_BASE_PATH + 'train/'
VAL_MID_PATH = TRAIN_VAL_BASE_PATH + 'val/'

TRAIN_IMG_DIRS = [TRAIN_MID_PATH + 'Male/', TRAIN_MID_PATH + 'Female/']
VAL_IMG_DIRS = [VAL_MID_PATH + 'Male/', VAL_MID_PATH + 'Female/']

TEST_BASE_PATH = ROOT_PATH + 'raw_data/testing_data/test/'
TEST_IMG_DIRS = [TEST_BASE_PATH + 'male/', TEST_BASE_PATH + 'female/']

In [3]:
male_train_imgs = os.listdir(TRAIN_IMG_DIRS[0])
female_train_imgs = os.listdir(TRAIN_IMG_DIRS[1])

male_val_imgs = os.listdir(VAL_IMG_DIRS[0])
female_val_imgs = os.listdir(VAL_IMG_DIRS[1])

male_test_imgs = os.listdir(TEST_IMG_DIRS[0])
female_test_imgs = os.listdir(TEST_IMG_DIRS[1])

all_train_imgs = male_train_imgs + female_train_imgs
all_val_imgs = male_val_imgs + female_val_imgs
all_test_imgs = male_test_imgs + female_test_imgs

print(f'# of train images: {len(all_train_imgs)}')
print(f'# of val images: {len(all_val_imgs)}')
print(f'# of test images: {len(all_test_imgs)}')

# of train images: 662081
# of val images: 73704
# of test images: 1015185


In [4]:
male_train_ids = ['__'.join(img.split('__')[:-1]) for img in male_train_imgs]
female_train_ids = ['__'.join(img.split('__')[:-1]) for img in female_train_imgs]

all_train_ids = male_train_ids + female_train_ids
print(f'# train track IDs: {len(all_train_ids)}')

male_train_ids_set = set(male_train_ids)
female_train_ids_set = set(female_train_ids)

# train track IDs: 662081


In [5]:
male_val_ids = ['__'.join(img.split('__')[:-1]) for img in male_val_imgs]
female_val_ids = ['__'.join(img.split('__')[:-1]) for img in female_val_imgs]

all_val_ids = male_val_ids + female_val_ids
print(f'# val track IDs: {len(all_val_ids)}')

male_val_ids_set = set(male_val_ids)
female_val_ids_set = set(female_val_ids)

# val track IDs: 73704


In [6]:
male_test_ids = ['_'.join(img.split('_')[:-1]) for img in male_test_imgs]
female_test_ids = ['_'.join(img.split('_')[:-1]) for img in female_test_imgs]

all_test_ids = male_test_ids + female_test_ids
print(f'# test track IDs: {len(all_test_ids)}')

male_test_ids_set = set(male_test_ids)
female_test_ids_set = set(female_test_ids)

# test track IDs: 1015185


In [7]:
F = [male_train_ids_set, female_train_ids_set, male_val_ids_set, female_val_ids_set, male_test_ids_set, female_test_ids_set]

In [8]:
for i in range(0, len(F) - 1, 2):
    for j in range(1, len(F), 2):
        S_i = F[i]
        S_j = F[j]
        
        print(f'S_{i}, S_{j} overlap? {len(S_i & S_j) > 0}')

S_0, S_1 overlap? False
S_0, S_3 overlap? False
S_0, S_5 overlap? False
S_2, S_1 overlap? False
S_2, S_3 overlap? False
S_2, S_5 overlap? False
S_4, S_1 overlap? False
S_4, S_3 overlap? False
S_4, S_5 overlap? False


## YOLO Dataset Construction

In [9]:
total_df = pd.read_csv(DATA_PATH)
total_df

Unnamed: 0,unique_track_id,frame_num,track_type,true_label,yolov5_class_id,xc,yc,u_dot,v_dot,speed,distance_traveled,mean_acceleration,outreach_ratio,square_displacement,mean_turning_angle,jerk,rms_velocity
0,MC_singlenuc23_1_Tk33_021220__0001_vid__1330,382058,train,0,0.0,117.405639,104.372970,6.973857,22.796302,2.669734,15439.795210,0.281946,0.074381,128467.100986,0.118348,-0.000210,3.849834
1,MC_singlenuc23_1_Tk33_021220__0001_vid__1330,382059,train,0,0.0,123.302195,123.913192,6.585702,21.623125,2.669734,15439.795210,0.281946,0.074381,128467.100986,0.118348,-0.000210,3.849834
2,MC_singlenuc23_1_Tk33_021220__0001_vid__1330,382068,train,0,0.0,111.683189,400.633206,-0.382109,28.731078,2.669734,15439.795210,0.281946,0.074381,128467.100986,0.118348,-0.000210,3.849834
3,MC_singlenuc23_1_Tk33_021220__0001_vid__1330,382158,train,0,0.0,821.635502,527.801237,5.468046,-3.778739,2.669734,15439.795210,0.281946,0.074381,128467.100986,0.118348,-0.000210,3.849834
4,MC_singlenuc23_1_Tk33_021220__0001_vid__1330,383058,train,0,0.0,883.833486,309.388663,2.265553,0.529649,2.669734,15439.795210,0.281946,0.074381,128467.100986,0.118348,-0.000210,3.849834
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012917,MC_singlenuc96_b1_Tk41_081120__0001_vid__9287,535063,test,0,0.0,113.216856,23.772530,-0.139779,-0.083265,1.223667,4692.119727,0.150978,0.103657,50341.782496,0.234763,-0.000308,2.542154
2012918,MC_singlenuc96_b1_Tk41_081120__0001_vid__9287,535064,test,0,0.0,114.332797,22.259622,-0.026375,-0.212375,1.223667,4692.119727,0.150978,0.103657,50341.782496,0.234763,-0.000308,2.542154
2012919,MC_singlenuc96_b1_Tk41_081120__0001_vid__9287,535065,test,0,0.0,113.453353,22.342868,-0.103404,-0.185682,1.223667,4692.119727,0.150978,0.103657,50341.782496,0.234763,-0.000308,2.542154
2012920,MC_singlenuc96_b1_Tk41_081120__0001_vid__9287,535066,test,0,0.0,113.447966,23.034077,-0.094555,-0.106513,1.223667,4692.119727,0.150978,0.103657,50341.782496,0.234763,-0.000308,2.542154


In [10]:
total_df.isna().any()

unique_track_id        False
frame_num              False
track_type             False
true_label             False
yolov5_class_id        False
xc                     False
yc                     False
u_dot                  False
v_dot                  False
speed                  False
distance_traveled      False
mean_acceleration      False
outreach_ratio         False
square_displacement    False
mean_turning_angle     False
jerk                   False
rms_velocity           False
dtype: bool

In [11]:
df_all_ids = set(total_df['unique_track_id'])

print(f'# of tracks in dataset: {len(df_all_ids)}')

# of tracks in dataset: 1086


In [12]:
trainval_df = total_df.loc[total_df['track_type'] == 'train']
trainval_df

Unnamed: 0,unique_track_id,frame_num,track_type,true_label,yolov5_class_id,xc,yc,u_dot,v_dot,speed,distance_traveled,mean_acceleration,outreach_ratio,square_displacement,mean_turning_angle,jerk,rms_velocity
0,MC_singlenuc23_1_Tk33_021220__0001_vid__1330,382058,train,0,0.0,117.405639,104.372970,6.973857,22.796302,2.669734,15439.795210,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834
1,MC_singlenuc23_1_Tk33_021220__0001_vid__1330,382059,train,0,0.0,123.302195,123.913192,6.585702,21.623125,2.669734,15439.795210,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834
2,MC_singlenuc23_1_Tk33_021220__0001_vid__1330,382068,train,0,0.0,111.683189,400.633206,-0.382109,28.731078,2.669734,15439.795210,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834
3,MC_singlenuc23_1_Tk33_021220__0001_vid__1330,382158,train,0,0.0,821.635502,527.801237,5.468046,-3.778739,2.669734,15439.795210,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834
4,MC_singlenuc23_1_Tk33_021220__0001_vid__1330,383058,train,0,0.0,883.833486,309.388663,2.265553,0.529649,2.669734,15439.795210,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
950641,MC_singlenuc96_b1_Tk41_081120__0001_vid__9350,537123,train,1,0.0,156.276651,671.360475,4.437002,6.482767,5.231545,1972.812041,0.443116,0.331928,28410.043847,0.060137,-0.00362,5.753525
950642,MC_singlenuc96_b1_Tk41_081120__0001_vid__9350,537124,train,1,0.0,162.206921,672.395285,4.571728,5.991240,5.231545,1972.812041,0.443116,0.331928,28410.043847,0.060137,-0.00362,5.753525
950643,MC_singlenuc96_b1_Tk41_081120__0001_vid__9350,537125,train,1,0.0,170.841343,672.257588,4.938269,5.438280,5.231545,1972.812041,0.443116,0.331928,28410.043847,0.060137,-0.00362,5.753525
950644,MC_singlenuc96_b1_Tk41_081120__0001_vid__9350,537126,train,1,0.0,178.862178,673.323482,5.216379,5.043803,5.231545,1972.812041,0.443116,0.331928,28410.043847,0.060137,-0.00362,5.753525


In [13]:
expanded_train_df = []
for track_id in tqdm(np.unique(all_train_ids)):
    imgs = [img for img in all_train_imgs if track_id + '_' in img]
    
#     print(imgs.shape)
    
    
    row = trainval_df.loc[trainval_df['unique_track_id'] == track_id].iloc[0]
    tmp = [['Male/' + img if row['true_label'] == 1 else 'Female/' + img, 'train', row['true_label'], row['distance_traveled'], row['speed'], row['mean_acceleration'], row['outreach_ratio'], row['square_displacement'], row['mean_turning_angle'], row['jerk'], row['rms_velocity']] for img in imgs]
    
    expanded_train_df += tmp
    

100%|██████████| 628/628 [01:15<00:00,  8.33it/s]


In [14]:
expanded_train_df = pd.DataFrame.from_records(expanded_train_df, columns=['filepath', 'split', 'is_male', 'distance', 'speed', 'acceleration', 'outreach_ratio', 'sqr_displacement', 'mean_turning_angle', 'jerk', 'rms_velocity'])
expanded_train_df

Unnamed: 0,filepath,split,is_male,distance,speed,acceleration,outreach_ratio,sqr_displacement,mean_turning_angle,jerk,rms_velocity
0,Female/MC_singlenuc23_1_Tk33_021220__0001_vid_...,train,0,15439.795210,2.669734,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834
1,Female/MC_singlenuc23_1_Tk33_021220__0001_vid_...,train,0,15439.795210,2.669734,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834
2,Female/MC_singlenuc23_1_Tk33_021220__0001_vid_...,train,0,15439.795210,2.669734,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834
3,Female/MC_singlenuc23_1_Tk33_021220__0001_vid_...,train,0,15439.795210,2.669734,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834
4,Female/MC_singlenuc23_1_Tk33_021220__0001_vid_...,train,0,15439.795210,2.669734,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834
...,...,...,...,...,...,...,...,...,...,...,...
662076,Male/MC_singlenuc96_b1_Tk41_081120__0001_vid__...,train,1,1972.812041,5.231545,0.443116,0.331928,28410.043847,0.060137,-0.00362,5.753525
662077,Male/MC_singlenuc96_b1_Tk41_081120__0001_vid__...,train,1,1972.812041,5.231545,0.443116,0.331928,28410.043847,0.060137,-0.00362,5.753525
662078,Male/MC_singlenuc96_b1_Tk41_081120__0001_vid__...,train,1,1972.812041,5.231545,0.443116,0.331928,28410.043847,0.060137,-0.00362,5.753525
662079,Male/MC_singlenuc96_b1_Tk41_081120__0001_vid__...,train,1,1972.812041,5.231545,0.443116,0.331928,28410.043847,0.060137,-0.00362,5.753525


In [15]:
expanded_train_df.to_csv('/home/hice1/cclark339/scratch/Data/single_nuc_1/expanded_training_set.csv', index=False)

In [16]:
expanded_val_df = []
for track_id in tqdm(np.unique(all_val_ids)):
    imgs = [img for img in all_val_imgs if track_id + '_' in img]

    row = trainval_df.loc[trainval_df['unique_track_id'] == track_id].iloc[0]
    tmp = [['Male/' + img if row['true_label'] == 1 else 'Female/' + img, 'val', row['true_label'], row['distance_traveled'], row['speed'], row['mean_acceleration'], row['outreach_ratio'], row['square_displacement'], row['mean_turning_angle'], row['jerk'], row['rms_velocity']] for img in imgs]
    
    expanded_val_df += tmp

100%|██████████| 618/618 [00:31<00:00, 19.46it/s]


In [17]:
expanded_val_df = pd.DataFrame.from_records(expanded_val_df, columns=['filepath', 'split', 'is_male', 'distance', 'speed', 'acceleration', 'outreach_ratio', 'sqr_displacement', 'mean_turning_angle', 'jerk', 'rms_velocity'])
expanded_val_df

Unnamed: 0,filepath,split,is_male,distance,speed,acceleration,outreach_ratio,sqr_displacement,mean_turning_angle,jerk,rms_velocity
0,Female/MC_singlenuc23_1_Tk33_021220__0001_vid_...,val,0,15439.795210,2.669734,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834
1,Female/MC_singlenuc23_1_Tk33_021220__0001_vid_...,val,0,15439.795210,2.669734,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834
2,Female/MC_singlenuc23_1_Tk33_021220__0001_vid_...,val,0,15439.795210,2.669734,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834
3,Female/MC_singlenuc23_1_Tk33_021220__0001_vid_...,val,0,15439.795210,2.669734,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834
4,Female/MC_singlenuc23_1_Tk33_021220__0001_vid_...,val,0,15439.795210,2.669734,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834
...,...,...,...,...,...,...,...,...,...,...,...
73699,Male/MC_singlenuc96_b1_Tk41_081120__0001_vid__...,val,1,1972.812041,5.231545,0.443116,0.331928,28410.043847,0.060137,-0.00362,5.753525
73700,Male/MC_singlenuc96_b1_Tk41_081120__0001_vid__...,val,1,1972.812041,5.231545,0.443116,0.331928,28410.043847,0.060137,-0.00362,5.753525
73701,Male/MC_singlenuc96_b1_Tk41_081120__0001_vid__...,val,1,1972.812041,5.231545,0.443116,0.331928,28410.043847,0.060137,-0.00362,5.753525
73702,Male/MC_singlenuc96_b1_Tk41_081120__0001_vid__...,val,1,1972.812041,5.231545,0.443116,0.331928,28410.043847,0.060137,-0.00362,5.753525


In [18]:
expanded_val_df.to_csv('/home/hice1/cclark339/scratch/Data/single_nuc_1/expanded_validation_set.csv', index=False)

In [19]:
test_df = total_df.loc[total_df['track_type'] == 'test']
test_df

Unnamed: 0,unique_track_id,frame_num,track_type,true_label,yolov5_class_id,xc,yc,u_dot,v_dot,speed,distance_traveled,mean_acceleration,outreach_ratio,square_displacement,mean_turning_angle,jerk,rms_velocity
950646,MC_singlenuc62_3_Tk65_060220__0001_vid__1602,119735,test,1,0.0,0.487516,0.959091,4.070835,-4.832684,5.275686,18.049692,0.364641,8.830518e-07,0.179105,0.054888,-0.000580,6.427450
950647,MC_singlenuc62_3_Tk65_060220__0001_vid__1602,119736,test,1,0.0,0.488613,0.948572,3.116560,-6.775233,5.275686,18.049692,0.364641,8.830518e-07,0.179105,0.054888,-0.000580,6.427450
950648,MC_singlenuc62_3_Tk65_060220__0001_vid__1602,119737,test,1,0.0,0.491099,0.941041,3.145261,-6.924879,5.275686,18.049692,0.364641,8.830518e-07,0.179105,0.054888,-0.000580,6.427450
950649,MC_singlenuc62_3_Tk65_060220__0001_vid__1602,119738,test,1,0.0,0.494843,0.931517,3.524060,-7.442532,5.275686,18.049692,0.364641,8.830518e-07,0.179105,0.054888,-0.000580,6.427450
950650,MC_singlenuc62_3_Tk65_060220__0001_vid__1602,119739,test,1,0.0,0.499262,0.917409,3.936836,-8.617140,5.275686,18.049692,0.364641,8.830518e-07,0.179105,0.054888,-0.000580,6.427450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012917,MC_singlenuc96_b1_Tk41_081120__0001_vid__9287,535063,test,0,0.0,113.216856,23.772530,-0.139779,-0.083265,1.223667,4692.119727,0.150978,1.036571e-01,50341.782496,0.234763,-0.000308,2.542154
2012918,MC_singlenuc96_b1_Tk41_081120__0001_vid__9287,535064,test,0,0.0,114.332797,22.259622,-0.026375,-0.212375,1.223667,4692.119727,0.150978,1.036571e-01,50341.782496,0.234763,-0.000308,2.542154
2012919,MC_singlenuc96_b1_Tk41_081120__0001_vid__9287,535065,test,0,0.0,113.453353,22.342868,-0.103404,-0.185682,1.223667,4692.119727,0.150978,1.036571e-01,50341.782496,0.234763,-0.000308,2.542154
2012920,MC_singlenuc96_b1_Tk41_081120__0001_vid__9287,535066,test,0,0.0,113.447966,23.034077,-0.094555,-0.106513,1.223667,4692.119727,0.150978,1.036571e-01,50341.782496,0.234763,-0.000308,2.542154


In [20]:
expanded_test_df = []
for track_id in tqdm(np.unique(all_test_ids)):
    imgs = [img for img in all_test_imgs if track_id + '_' in img]    
    
    row = test_df.loc[test_df['unique_track_id'] == track_id].iloc[0]
    tmp = [['male/' + img if row['true_label'] == 1 else 'female/' + img, 'test', row['true_label'], row['distance_traveled'], row['speed'], row['mean_acceleration'], row['outreach_ratio'], row['square_displacement'], row['mean_turning_angle'], row['jerk'], row['rms_velocity']] for img in imgs]
    
    expanded_test_df += tmp

100%|██████████| 457/457 [01:18<00:00,  5.79it/s]


In [21]:
expanded_test_df = pd.DataFrame.from_records(expanded_test_df, columns=['filepath', 'split', 'is_male', 'distance', 'speed', 'acceleration', 'outreach_ratio', 'sqr_displacement', 'mean_turning_angle', 'jerk', 'rms_velocity'])
expanded_test_df

Unnamed: 0,filepath,split,is_male,distance,speed,acceleration,outreach_ratio,sqr_displacement,mean_turning_angle,jerk,rms_velocity
0,male/MC_singlenuc23_1_Tk33_021220__0004_vid__1...,test,1,23589.076095,5.025806,0.449513,0.039682,270184.060313,0.076592,-0.000379,6.049916
1,male/MC_singlenuc23_1_Tk33_021220__0004_vid__1...,test,1,23589.076095,5.025806,0.449513,0.039682,270184.060313,0.076592,-0.000379,6.049916
2,male/MC_singlenuc23_1_Tk33_021220__0004_vid__1...,test,1,23589.076095,5.025806,0.449513,0.039682,270184.060313,0.076592,-0.000379,6.049916
3,male/MC_singlenuc23_1_Tk33_021220__0004_vid__1...,test,1,23589.076095,5.025806,0.449513,0.039682,270184.060313,0.076592,-0.000379,6.049916
4,male/MC_singlenuc23_1_Tk33_021220__0004_vid__1...,test,1,23589.076095,5.025806,0.449513,0.039682,270184.060313,0.076592,-0.000379,6.049916
...,...,...,...,...,...,...,...,...,...,...,...
1015180,female/MC_singlenuc96_b1_Tk41_081120__0001_vid...,test,0,4692.119727,1.223667,0.150978,0.103657,50341.782496,0.234763,-0.000308,2.542154
1015181,female/MC_singlenuc96_b1_Tk41_081120__0001_vid...,test,0,4692.119727,1.223667,0.150978,0.103657,50341.782496,0.234763,-0.000308,2.542154
1015182,female/MC_singlenuc96_b1_Tk41_081120__0001_vid...,test,0,4692.119727,1.223667,0.150978,0.103657,50341.782496,0.234763,-0.000308,2.542154
1015183,female/MC_singlenuc96_b1_Tk41_081120__0001_vid...,test,0,4692.119727,1.223667,0.150978,0.103657,50341.782496,0.234763,-0.000308,2.542154


In [22]:
expanded_test_df.to_csv('/home/hice1/cclark339/scratch/Data/single_nuc_1/expanded_testing_set.csv', index=False)

## Dataset Processing

In [23]:
expanded_trainval_df = pd.concat([expanded_train_df, expanded_val_df], axis=0).reset_index(drop=True)

expanded_trainval_df['filename'] = expanded_trainval_df['filepath'].apply(lambda x: x.split('/')[-1])
expanded_trainval_df['track_id'] = expanded_trainval_df['filename'].apply(lambda x: '__'.join(x.split('__')[:-1]))

expanded_trainval_df

Unnamed: 0,filepath,split,is_male,distance,speed,acceleration,outreach_ratio,sqr_displacement,mean_turning_angle,jerk,rms_velocity,filename,track_id
0,Female/MC_singlenuc23_1_Tk33_021220__0001_vid_...,train,0,15439.795210,2.669734,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834,MC_singlenuc23_1_Tk33_021220__0001_vid__1330__...,MC_singlenuc23_1_Tk33_021220__0001_vid__1330
1,Female/MC_singlenuc23_1_Tk33_021220__0001_vid_...,train,0,15439.795210,2.669734,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834,MC_singlenuc23_1_Tk33_021220__0001_vid__1330__...,MC_singlenuc23_1_Tk33_021220__0001_vid__1330
2,Female/MC_singlenuc23_1_Tk33_021220__0001_vid_...,train,0,15439.795210,2.669734,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834,MC_singlenuc23_1_Tk33_021220__0001_vid__1330__...,MC_singlenuc23_1_Tk33_021220__0001_vid__1330
3,Female/MC_singlenuc23_1_Tk33_021220__0001_vid_...,train,0,15439.795210,2.669734,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834,MC_singlenuc23_1_Tk33_021220__0001_vid__1330__...,MC_singlenuc23_1_Tk33_021220__0001_vid__1330
4,Female/MC_singlenuc23_1_Tk33_021220__0001_vid_...,train,0,15439.795210,2.669734,0.281946,0.074381,128467.100986,0.118348,-0.00021,3.849834,MC_singlenuc23_1_Tk33_021220__0001_vid__1330__...,MC_singlenuc23_1_Tk33_021220__0001_vid__1330
...,...,...,...,...,...,...,...,...,...,...,...,...,...
735780,Male/MC_singlenuc96_b1_Tk41_081120__0001_vid__...,val,1,1972.812041,5.231545,0.443116,0.331928,28410.043847,0.060137,-0.00362,5.753525,MC_singlenuc96_b1_Tk41_081120__0001_vid__9350_...,MC_singlenuc96_b1_Tk41_081120__0001_vid__9350
735781,Male/MC_singlenuc96_b1_Tk41_081120__0001_vid__...,val,1,1972.812041,5.231545,0.443116,0.331928,28410.043847,0.060137,-0.00362,5.753525,MC_singlenuc96_b1_Tk41_081120__0001_vid__9350_...,MC_singlenuc96_b1_Tk41_081120__0001_vid__9350
735782,Male/MC_singlenuc96_b1_Tk41_081120__0001_vid__...,val,1,1972.812041,5.231545,0.443116,0.331928,28410.043847,0.060137,-0.00362,5.753525,MC_singlenuc96_b1_Tk41_081120__0001_vid__9350_...,MC_singlenuc96_b1_Tk41_081120__0001_vid__9350
735783,Male/MC_singlenuc96_b1_Tk41_081120__0001_vid__...,val,1,1972.812041,5.231545,0.443116,0.331928,28410.043847,0.060137,-0.00362,5.753525,MC_singlenuc96_b1_Tk41_081120__0001_vid__9350_...,MC_singlenuc96_b1_Tk41_081120__0001_vid__9350


In [24]:
N_trainval = len(expanded_trainval_df['track_id'].unique())
print(f'# unique IDs in the combined set: {N_trainval}')

# unique IDs in the combined set: 628


In [25]:
n_train = int(0.8 * N_trainval)
print(f'# unique IDs in new train split: {n_train}')

# unique IDs in new train split: 502


In [26]:
new_train_ids = np.random.choice(expanded_trainval_df['track_id'].unique(), n_train, replace=False)
print(f'Number of selected IDs = {n_train}? {len(new_train_ids) == n_train}')

Number of selected IDs = 502? True


In [27]:
new_expanded_train_df = []
for new_train_id in new_train_ids:
    new_expanded_train_df += expanded_trainval_df.loc[expanded_trainval_df['track_id'] == new_train_id].values.tolist()
    
new_expanded_train_df = pd.DataFrame.from_records(new_expanded_train_df, columns=expanded_trainval_df.columns)
new_expanded_train_df

Unnamed: 0,filepath,split,is_male,distance,speed,acceleration,outreach_ratio,sqr_displacement,mean_turning_angle,jerk,rms_velocity,filename,track_id
0,Male/MC_singlenuc62_3_Tk65_060220__0001_vid__5...,train,1,3157.177255,9.477585,0.717047,0.268440,67149.061139,0.051450,-0.010046,10.975412,MC_singlenuc62_3_Tk65_060220__0001_vid__561__1...,MC_singlenuc62_3_Tk65_060220__0001_vid__561
1,Male/MC_singlenuc62_3_Tk65_060220__0001_vid__5...,train,1,3157.177255,9.477585,0.717047,0.268440,67149.061139,0.051450,-0.010046,10.975412,MC_singlenuc62_3_Tk65_060220__0001_vid__561__1...,MC_singlenuc62_3_Tk65_060220__0001_vid__561
2,Male/MC_singlenuc62_3_Tk65_060220__0001_vid__5...,train,1,3157.177255,9.477585,0.717047,0.268440,67149.061139,0.051450,-0.010046,10.975412,MC_singlenuc62_3_Tk65_060220__0001_vid__561__1...,MC_singlenuc62_3_Tk65_060220__0001_vid__561
3,Male/MC_singlenuc62_3_Tk65_060220__0001_vid__5...,train,1,3157.177255,9.477585,0.717047,0.268440,67149.061139,0.051450,-0.010046,10.975412,MC_singlenuc62_3_Tk65_060220__0001_vid__561__1...,MC_singlenuc62_3_Tk65_060220__0001_vid__561
4,Male/MC_singlenuc62_3_Tk65_060220__0001_vid__5...,train,1,3157.177255,9.477585,0.717047,0.268440,67149.061139,0.051450,-0.010046,10.975412,MC_singlenuc62_3_Tk65_060220__0001_vid__561__8...,MC_singlenuc62_3_Tk65_060220__0001_vid__561
...,...,...,...,...,...,...,...,...,...,...,...,...,...
588891,Male/MC_singlenuc62_3_Tk65_060220__0001_vid__3...,val,1,3073.699191,5.889025,0.413899,0.211283,60082.306047,0.054011,-0.000003,7.922357,MC_singlenuc62_3_Tk65_060220__0001_vid__396__2...,MC_singlenuc62_3_Tk65_060220__0001_vid__396
588892,Male/MC_singlenuc62_3_Tk65_060220__0001_vid__3...,val,1,3073.699191,5.889025,0.413899,0.211283,60082.306047,0.054011,-0.000003,7.922357,MC_singlenuc62_3_Tk65_060220__0001_vid__396__1...,MC_singlenuc62_3_Tk65_060220__0001_vid__396
588893,Male/MC_singlenuc62_3_Tk65_060220__0001_vid__3...,val,1,3073.699191,5.889025,0.413899,0.211283,60082.306047,0.054011,-0.000003,7.922357,MC_singlenuc62_3_Tk65_060220__0001_vid__396__3...,MC_singlenuc62_3_Tk65_060220__0001_vid__396
588894,Male/MC_singlenuc62_3_Tk65_060220__0001_vid__3...,val,1,3073.699191,5.889025,0.413899,0.211283,60082.306047,0.054011,-0.000003,7.922357,MC_singlenuc62_3_Tk65_060220__0001_vid__396__3...,MC_singlenuc62_3_Tk65_060220__0001_vid__396


In [28]:
new_val_ids = list(set(expanded_trainval_df['track_id'].unique()) - set(new_train_ids))
n_val = len(new_val_ids)

print(f'# unique IDs in new val split: {n_val}')
print(f'# unique IDs in new val split + # unique IDs in new train split = {N_trainval}? {(n_train + n_val) == N_trainval}')

# unique IDs in new val split: 126
# unique IDs in new val split + # unique IDs in new train split = 628? True


In [29]:
new_expanded_val_df = []
for new_val_id in new_val_ids:
    new_expanded_val_df += expanded_trainval_df.loc[expanded_trainval_df['track_id'] == new_val_id].values.tolist()
    
new_expanded_val_df = pd.DataFrame.from_records(new_expanded_val_df, columns=expanded_trainval_df.columns)
new_expanded_val_df

Unnamed: 0,filepath,split,is_male,distance,speed,acceleration,outreach_ratio,sqr_displacement,mean_turning_angle,jerk,rms_velocity,filename,track_id
0,Female/MC_singlenuc63_1_Tk9_060220__0005_vid__...,train,0,3552.908342,1.690321,0.211041,0.128829,31401.091643,0.183582,-0.000738,3.414717,MC_singlenuc63_1_Tk9_060220__0005_vid__2114__4...,MC_singlenuc63_1_Tk9_060220__0005_vid__2114
1,Female/MC_singlenuc63_1_Tk9_060220__0005_vid__...,train,0,3552.908342,1.690321,0.211041,0.128829,31401.091643,0.183582,-0.000738,3.414717,MC_singlenuc63_1_Tk9_060220__0005_vid__2114__8...,MC_singlenuc63_1_Tk9_060220__0005_vid__2114
2,Female/MC_singlenuc63_1_Tk9_060220__0005_vid__...,train,0,3552.908342,1.690321,0.211041,0.128829,31401.091643,0.183582,-0.000738,3.414717,MC_singlenuc63_1_Tk9_060220__0005_vid__2114__1...,MC_singlenuc63_1_Tk9_060220__0005_vid__2114
3,Female/MC_singlenuc63_1_Tk9_060220__0005_vid__...,train,0,3552.908342,1.690321,0.211041,0.128829,31401.091643,0.183582,-0.000738,3.414717,MC_singlenuc63_1_Tk9_060220__0005_vid__2114__4...,MC_singlenuc63_1_Tk9_060220__0005_vid__2114
4,Female/MC_singlenuc63_1_Tk9_060220__0005_vid__...,train,0,3552.908342,1.690321,0.211041,0.128829,31401.091643,0.183582,-0.000738,3.414717,MC_singlenuc63_1_Tk9_060220__0005_vid__2114__5...,MC_singlenuc63_1_Tk9_060220__0005_vid__2114
...,...,...,...,...,...,...,...,...,...,...,...,...,...
146884,Female/MC_singlenuc28_1_Tk3_022520__0003_vid__...,val,0,5610.526415,2.207359,0.207192,0.108310,44858.719899,0.125008,-0.001778,3.391302,MC_singlenuc28_1_Tk3_022520__0003_vid__27144__...,MC_singlenuc28_1_Tk3_022520__0003_vid__27144
146885,Female/MC_singlenuc28_1_Tk3_022520__0003_vid__...,val,0,5610.526415,2.207359,0.207192,0.108310,44858.719899,0.125008,-0.001778,3.391302,MC_singlenuc28_1_Tk3_022520__0003_vid__27144__...,MC_singlenuc28_1_Tk3_022520__0003_vid__27144
146886,Female/MC_singlenuc28_1_Tk3_022520__0003_vid__...,val,0,5610.526415,2.207359,0.207192,0.108310,44858.719899,0.125008,-0.001778,3.391302,MC_singlenuc28_1_Tk3_022520__0003_vid__27144__...,MC_singlenuc28_1_Tk3_022520__0003_vid__27144
146887,Female/MC_singlenuc28_1_Tk3_022520__0003_vid__...,val,0,5610.526415,2.207359,0.207192,0.108310,44858.719899,0.125008,-0.001778,3.391302,MC_singlenuc28_1_Tk3_022520__0003_vid__27144__...,MC_singlenuc28_1_Tk3_022520__0003_vid__27144


In [30]:
observed_val_ids = set(new_expanded_val_df['track_id'].unique())
observed_train_ids = set(new_expanded_train_df['track_id'].unique())

print(f'# overlapping track IDs: {len(observed_val_ids & observed_train_ids)}')

# overlapping track IDs: 0


In [31]:
new_expanded_train_df.to_csv('/home/hice1/cclark339/scratch/Data/single_nuc_1/expanded_training_by_track_set.csv', index=False)
new_expanded_val_df.to_csv('/home/hice1/cclark339/scratch/Data/single_nuc_1/expanded_validation_by_track_set.csv', index=False)

In [32]:
new_expanded_test_df = expanded_test_df.iloc[::15].reset_index(drop=True)
new_expanded_test_df

Unnamed: 0,filepath,split,is_male,distance,speed,acceleration,outreach_ratio,sqr_displacement,mean_turning_angle,jerk,rms_velocity
0,male/MC_singlenuc23_1_Tk33_021220__0004_vid__1...,test,1,23589.076095,5.025806,0.449513,0.039682,270184.060313,0.076592,-0.000379,6.049916
1,male/MC_singlenuc23_1_Tk33_021220__0004_vid__1...,test,1,23589.076095,5.025806,0.449513,0.039682,270184.060313,0.076592,-0.000379,6.049916
2,male/MC_singlenuc23_1_Tk33_021220__0004_vid__1...,test,1,23589.076095,5.025806,0.449513,0.039682,270184.060313,0.076592,-0.000379,6.049916
3,male/MC_singlenuc23_1_Tk33_021220__0004_vid__1...,test,1,23589.076095,5.025806,0.449513,0.039682,270184.060313,0.076592,-0.000379,6.049916
4,male/MC_singlenuc23_1_Tk33_021220__0004_vid__1...,test,1,23589.076095,5.025806,0.449513,0.039682,270184.060313,0.076592,-0.000379,6.049916
...,...,...,...,...,...,...,...,...,...,...,...
67674,female/MC_singlenuc96_b1_Tk41_081120__0001_vid...,test,0,4692.119727,1.223667,0.150978,0.103657,50341.782496,0.234763,-0.000308,2.542154
67675,female/MC_singlenuc96_b1_Tk41_081120__0001_vid...,test,0,4692.119727,1.223667,0.150978,0.103657,50341.782496,0.234763,-0.000308,2.542154
67676,female/MC_singlenuc96_b1_Tk41_081120__0001_vid...,test,0,4692.119727,1.223667,0.150978,0.103657,50341.782496,0.234763,-0.000308,2.542154
67677,female/MC_singlenuc96_b1_Tk41_081120__0001_vid...,test,0,4692.119727,1.223667,0.150978,0.103657,50341.782496,0.234763,-0.000308,2.542154


In [33]:
new_expanded_test_df.to_csv('/home/hice1/cclark339/scratch/Data/single_nuc_1/smaller_expanded_testing_set.csv', index=False)

In [34]:
new_expanded_train_df.isna().any()

filepath              False
split                 False
is_male               False
distance              False
speed                 False
acceleration          False
outreach_ratio        False
sqr_displacement      False
mean_turning_angle    False
jerk                  False
rms_velocity          False
filename              False
track_id              False
dtype: bool

In [35]:
new_expanded_val_df.isna().any()

filepath              False
split                 False
is_male               False
distance              False
speed                 False
acceleration          False
outreach_ratio        False
sqr_displacement      False
mean_turning_angle    False
jerk                  False
rms_velocity          False
filename              False
track_id              False
dtype: bool

In [37]:
new_expanded_test_df.isna().any()

filepath              False
split                 False
is_male               False
distance              False
speed                 False
acceleration          False
outreach_ratio        False
sqr_displacement      False
mean_turning_angle    False
jerk                  False
rms_velocity          False
dtype: bool