In [1]:
import os
import numpy as np
import random
from sklearn.model_selection import train_test_split
from pathlib import PurePath
from shutil import copy

In [5]:
window = 256
fileloc = '/scratch/sk7898/pedbike/window_256'
data_type = 'downstream'
if data_type == 'downstream':
    filestrs = ['downstream/Human/human_radial_cuts_stft', 'downstream/Bike/bike_radial_cuts_stft']
    classes = ['Human', 'Bike']
    val_split = 0.1
    test_split = 0.1

In [6]:
outdir = fileloc
path_prefix = 'downstream_stft'
filenames = []
data = []
labels = []
seqs = []

[[filenames.append(os.path.join(os.path.join(fileloc, filestr), filename))for filename in os.listdir(os.path.join(fileloc, filestr))] for filestr in filestrs]

[[data.append(np.fromfile(open(os.path.join(os.path.join(fileloc, filestr), filename), "r"), dtype=np.float64).reshape(-1, window*2))
      for filename in os.listdir(os.path.join(fileloc, filestr))] for filestr in filestrs]
data = np.array(data)

[labels.append(int((fname.split('_')[-1]).split('p')[0])) for fname in filenames]
labels = np.array(labels)

[seqs.append(int(fname.split('_')[-3])) for fname in filenames]
seqs = np.array(seqs)

In [7]:
indices = np.arange(len(filenames))
X_train, X_val, y_train, y_val, indices_train, indices_val, seqs_train, seqs_val = train_test_split(data, labels,\
                                                                                                    indices, seqs,\
                                                                                                    test_size=val_split,\
                                                                                                    random_state=42)

X_train, X_test, y_train, y_test, indices_train, indices_test, seqs_train, seqs_test = train_test_split(X_train, y_train,\
                                                                                                        indices_train,\
                                                                                                        seqs_train,\
                                                                                                        test_size=test_split,\
                                                                                                        random_state=42)
os.makedirs(outdir, exist_ok=True)
os.makedirs(os.path.join(outdir, path_prefix), exist_ok=True)

# Save train data
np.save(os.path.join(outdir, path_prefix, "train.npy"), X_train)
np.save(os.path.join(outdir, path_prefix, "train_seqs.npy"), seqs_train)
np.save(os.path.join(outdir, path_prefix, "train_lbls.npy"), y_train)

# Save validation data
np.save(os.path.join(outdir, path_prefix, "val.npy"), X_val)
np.save(os.path.join(outdir, path_prefix, "val_seqs.npy"), seqs_val)
np.save(os.path.join(outdir, path_prefix, "val_lbls.npy"), y_val)

# Save test data
np.save(os.path.join(outdir, path_prefix, "test.npy"), X_test)
np.save(os.path.join(outdir, path_prefix, "test_seqs.npy"), seqs_test)
np.save(os.path.join(outdir, path_prefix, "test_lbls.npy"), y_test)

In [8]:
files_train = [filenames[i] for i in indices_train]
files_val = [filenames[i] for i in indices_val]
files_test = [filenames[i] for i in indices_test]

train_dir = os.path.join(outdir, path_prefix, "stft_train")
val_dir = os.path.join(outdir, path_prefix, "stft_val")
test_dir = os.path.join(outdir, path_prefix, "stft_test")

for cls in classes:
    os.makedirs(os.path.join(train_dir,cls), exist_ok=True)
    os.makedirs(os.path.join(val_dir,cls), exist_ok=True)
    os.makedirs(os.path.join(test_dir,cls), exist_ok=True)

for tr in files_train:
    cur_class = PurePath(tr).parent.name.split('_')[0]
    copy(tr, os.path.join(train_dir,cur_class))

for val in files_val:
    cur_class = PurePath(val).parent.name.split('_')[0]
    copy(val, os.path.join(val_dir, cur_class))
    
for tst in files_test:
    cur_class = PurePath(tst).parent.name.split('_')[0]
    copy(tst, os.path.join(test_dir, cur_class))

In [9]:
indices_human_train = [idx for idx in indices_train if 'Human' in os.path.basename(filenames[idx])]
indices_bike_train = [idx for idx in indices_train if 'Bike' in os.path.basename(filenames[idx])]

print('Total Dataset Size:', len(labels))
print('Train Dataset Size:', len(y_train))
print('Humans in Train:', len(indices_human_train))
print('Bikes in Train:', len(indices_bike_train))
print('Validation Dataset Size:', len(y_val))

Total Dataset Size: 955
Train Dataset Size: 773
Humans in Train: 375
Bikes in Train: 398
Validation Dataset Size: 96


**Train**

*Humans: 375*
*Bikes: 398*
    
**Validation**

*Humans: 45*
*Bikes: 51*
    
**Test**

*Humans: 45*
*Bikes: 41*