In [1]:
import os
import glob
from pathlib import Path
import random

### Make Train Files List

In [2]:
# Root directory that is scanned for per-piece sub-folders of annotation files.
DATA_PATH = Path('../ttmp/Chopin_Mazurkas_Modified/annotations_beat/')

# The training file list is written under cfg_files/; create that directory
# up front (a no-op when it already exists) so later cells can write into it.
train_files_path = Path('cfg_files') / 'train.files.list'
train_files_path.parent.mkdir(parents=True, exist_ok=True)

In [3]:
# Write one `<folder>/<file id>` line for every file found in the two folders
# reserved for training.
# Directory listings are yielded in arbitrary, filesystem-dependent order, so
# both levels are sorted: the seeded random.sample() calls further down are
# only reproducible if this list comes out in the same order on every machine.
with open(train_files_path, 'w') as f:
    for folderpath in sorted(DATA_PATH.glob('*')):
        foldername = folderpath.name
        if foldername in ('Chopin_Op017No4', 'Chopin_Op063No3'):
            for filepath in sorted(folderpath.glob('*')):
                # Drop the last five characters of the file name -- presumably
                # a '.beat' extension; TODO confirm against the data.
                filename = filepath.name[:-5]
                f.write(f'{foldername}/{filename}\n')

In [4]:
def generate_query_list(filelist, outfile):
    """Write every unordered pair of recordings of the same piece to `outfile`.

    Args:
        filelist: Path of a text file with one ``piece/fileid`` entry per
            line. Blank lines are ignored.
        outfile: Path of the query file to create (opened in 'w' mode, so an
            existing file is overwritten). Each output line has the form
            ``piece/fileid1 piece/fileid2``.

    Raises:
        AssertionError: If a non-blank line does not consist of exactly two
            '/'-separated parts.
    """
    # Group file ids by piece. Dicts preserve insertion order (Python 3.7+),
    # so pieces and recordings are emitted in the order they were listed.
    recordings_by_piece = {}
    with open(filelist, 'r') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                # A stray blank line (e.g. at the end of a hand-edited list)
                # carries no entry; skip it instead of tripping the assert.
                continue
            parts = entry.split('/')
            assert len(parts) == 2, f'expected "piece/fileid", got {entry!r}'
            piece, fileid = parts
            recordings_by_piece.setdefault(piece, []).append(fileid)

    # Emit each pair exactly once: every recording is paired with each
    # recording listed after it for the same piece.
    with open(outfile, 'w') as fout:
        for piece, fileids in recordings_by_piece.items():
            for i, fileid1 in enumerate(fileids):
                for fileid2 in fileids[i + 1:]:
                    fout.write(f'{piece}/{fileid1} {piece}/{fileid2}\n')

### Make Train Query Lists

**Train-Full**

In [5]:
# Train-Full: all same-piece pairings of the entries in the training file list.
full_queries_path = Path('cfg_files') / 'queries.train.full'
generate_query_list(filelist=train_files_path, outfile=full_queries_path)

**Train-Small**

In [6]:
# Seed Python's global RNG so that, given the same query-list order, the
# random.sample() calls in the cells below draw the same subsets on every
# fresh top-to-bottom run.
random.seed(42)

In [7]:
# Output location for the Train-Small subset of the training queries.
small_queries_path = Path('cfg_files') / 'queries.train.small'

In [8]:
# Load the full training query list and draw a random subset of 200 queries.
# Only the newline is stripped (rather than blindly dropping the last
# character), so a final line without a terminator keeps all its characters.
with open(full_queries_path, 'r') as f:
    full_queries_list = [line.rstrip('\n') for line in f]
# random.sample raises ValueError if fewer than 200 queries are available.
small_queries_list = random.sample(full_queries_list, k=200)

In [9]:
# Persist the sampled Train-Small subset, one query per line.
with open(small_queries_path, 'w') as f:
    f.writelines(query + '\n' for query in small_queries_list)

**Train-Toy**

In [10]:
# Output location for the Train-Toy subset of the training queries.
toy_queries_path = Path('cfg_files') / 'queries.train.toy'

In [11]:
# Re-read the full training query list and draw a random subset of 5 queries.
# Only the newline is stripped (rather than blindly dropping the last
# character), so a final line without a terminator keeps all its characters.
with open(full_queries_path, 'r') as f:
    full_queries_list = [line.rstrip('\n') for line in f]
# The draw continues from the global RNG state left by the cells above, so the
# result depends on those cells having been run in order.
toy_queries_list = random.sample(full_queries_list, k=5)

In [12]:
# Persist the sampled Train-Toy subset, one query per line.
with open(toy_queries_path, 'w') as f:
    f.writelines(query + '\n' for query in toy_queries_list)

### Make Test Files List

In [13]:
# Same annotation root as the training section, restated so this section's
# inputs are visible in one place.
DATA_PATH = Path('../ttmp/Chopin_Mazurkas_Modified/annotations_beat/')
# Output location for the test file list.
test_files_path = Path('cfg_files') / 'test.files.list'

In [14]:
# Write one `<folder>/<file id>` line for every file outside the two folders
# that the training list uses.
# Directory listings are yielded in arbitrary, filesystem-dependent order, so
# both levels are sorted to make the generated list (and the query list built
# from it) identical on every machine.
with open(test_files_path, 'w') as f:
    for folder in sorted(DATA_PATH.glob('*')):
        foldername = folder.name
        if foldername in ('Chopin_Op017No4', 'Chopin_Op063No3'):
            # These two folders make up the training set; keep them out of test.
            continue
        for file in sorted(folder.glob('*')):
            # Drop the last five characters of the file name -- presumably a
            # '.beat' extension; TODO confirm against the data.
            filename = file.name[:-5]
            # This one recording is deliberately left out; the reason is not
            # visible in this notebook.
            if 'Chopin_Op068No3_Koczalski-1948_pid9140' not in filename:
                f.write(f'{foldername}/{filename}\n')

**Test-Full**

In [15]:
# Test-Full: all same-piece pairings of the entries in the test file list.
# NOTE(review): this rebinds `full_queries_path`, which the Train-Small and
# Train-Toy cells read; re-running those cells after this one would sample
# from the test queries instead of the training queries.
full_queries_path = Path('cfg_files') / 'queries.test.full'
generate_query_list(filelist=test_files_path, outfile=full_queries_path)