# Create CREMA-D split

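This notebook creates a text file listing the audio files that form the test set of the CREMA-D dataset. The split is made per actor: a random subset of actors is chosen, and every recording of those actors goes into the test list.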

### Import packages

In [1]:
import os

import numpy as np

# Move the working directory two levels up before importing from `preprocessing`
# (presumably to the repository root, so the package import and the relative
# save path resolve — TODO confirm).
# NOTE: not idempotent — re-running this cell moves up another two levels.
os.chdir(os.path.join("..", ".."))
from preprocessing.utils import get_cremad_file_meta_data

### Define paths

In [15]:
# Path components are kept as tuples and joined once below.
_DATASET_DIR_PARTS = ("/media", "datastore", "c-matsty-data", "CREMA-D_AudioWAV")
_TEST_LIST_PARTS = ("data", "cremad_testing_list.txt")

# Directory that is scanned for the CREMA-D .wav files
data_path = os.path.join(*_DATASET_DIR_PARTS)
# Output text file that receives the test file names
save_path = os.path.join(*_TEST_LIST_PARTS)

### Parse actor identifiers from dataset files

In [3]:
# List every entry of the dataset directory.
# os.listdir returns entries in arbitrary (filesystem-dependent) order, so sort
# them: everything derived from this list, including the order of the written
# test file list, is then reproducible across machines.
file_names = sorted(os.listdir(data_path))

In [4]:
# Keep only the .wav entries, then collect the actor id of each one
# (one entry per file, so ids repeat; the class label is not needed here).
wav_names = [name for name in file_names if name.endswith('.wav')]
actor_ids = [
    speaker
    for _label, speaker in map(get_cremad_file_meta_data, wav_names)
]

In [5]:
# Reduce the per-file actor ids to the sorted array of unique actors
distinct_actors = np.unique(np.asarray(actor_ids))

### Create random split based on actors

#### Get test actors

In [8]:
# Fraction of the distinct actors that is held out for testing
test_percentage = 0.1
# Number of test actors; int() truncates the product toward zero
n_actors = len(distinct_actors)
test_size = int(n_actors * test_percentage)

In [9]:
# Seed NumPy's global (legacy) random state with a fixed value for reproducibility
np.random.seed(0)

In [11]:
# Draw the test actors without replacement from a dedicated RandomState seeded
# with 0. This yields the same draw as seeding the global generator with 0 right
# before calling np.random.choice, but the result no longer depends on hidden
# global RNG state: re-running this cell on its own always gives the same split.
test_actors = np.random.RandomState(0).choice(
    distinct_actors, size=test_size, replace=False
)

#### Get test file names

In [12]:
# Collect every .wav file whose actor was drawn into the test set
test_file_names = []
for wav_name in filter(lambda name: name.endswith('.wav'), file_names):
    _label, speaker = get_cremad_file_meta_data(wav_name)
    if speaker in test_actors:
        test_file_names.append(wav_name)

### Write to file

In [16]:
# Write the test file names to save_path, one name per line
# (mode "w" overwrites any existing file)
with open(save_path, "w") as out_file:
    for test_name in test_file_names:
        out_file.write("%s\n" % test_name)