In [None]:
from pathlib import Path
from tqdm import tqdm

# Directory that holds the score annotations; only its *.json files are listed.
dataset_path = Path("../data/jazzmus_dataset/")
# glob() returns entries in whatever order the filesystem reports them, so the
# listing is sorted to keep every order-dependent step (e.g. a seeded shuffle)
# identical across machines and runs.
total_scores = sorted(dataset_path.glob("*.json"))
len(total_scores)

In [None]:
# Build the train / validation / test split over the score files.
#
# NOTE: earlier revisions of this cell removed items from `unique_scores` while
# looping over it, which skipped every other candidate. Split membership
# produced by this cell therefore differs from splits generated before the fix;
# regenerate any split files that were derived from the old behaviour.
import math
import random

# Scores that count towards the split sizes: file names containing
# "version_1." or no "version_" marker at all.
filtered_scores = [
    score
    for score in total_scores
    if "version_1." in score.name or "version_" not in score.name
]
# Round up so that a fractional target still yields a whole number of samples
# (a `len(...) < float` comparison would take the ceiling implicitly).
test_n_samples = math.ceil(len(filtered_scores) * 0.2)
val_n_samples = math.ceil(len(filtered_scores) * 0.1)

# Only scores without a "version_" marker are candidates for test/validation.
# Sorting first makes the seeded shuffle independent of filesystem order.
unique_scores = sorted(
    score for score in total_scores if "version_" not in score.name
)

random.seed(42)
unique_scores = random.sample(unique_scores, len(unique_scores))

# Take consecutive, non-overlapping slices of the shuffled candidates. If there
# are fewer candidates than requested, the slices are simply shorter.
test_samples = unique_scores[:test_n_samples]
val_samples = unique_scores[test_n_samples : test_n_samples + val_n_samples]

# Everything that was not held out (including all versioned scores) is train.
held_out = set(test_samples) | set(val_samples)
train_samples = [score for score in total_scores if score not in held_out]

len(test_samples), len(val_samples), len(train_samples)

In [None]:
# Sanity check: the three partitions together must account for every score file.
sum(len(part) for part in (test_samples, val_samples, train_samples)) == len(total_scores)

In [None]:
import json
from PIL import Image

# Output directory that receives the cropped region images and their kern files.
new_dataset_path = Path("../data") / "jazzmus_dataset_regions"
new_dataset_path.mkdir(exist_ok=True)


def extract_region_image(image, bounding_box):
    """Return the part of `image` delimited by `bounding_box`.

    Args:
        image: Object exposing ``crop((left, upper, right, lower))``.
        bounding_box: Mapping with the keys ``"fromX"``, ``"toX"``,
            ``"fromY"`` and ``"toY"``.

    Returns:
        Whatever ``image.crop`` returns for the box
        ``(fromX, fromY, toX, toY)``.
    """
    corner_keys = ("fromX", "toX", "fromY", "toY")
    left, right, upper, lower = (bounding_box[key] for key in corner_keys)
    return image.crop((left, upper, right, lower))


def get_regions(scores, partition):
    """Crop every system of each score into an image/kern pair and index them.

    For each score JSON file, the page image is taken to be the same path with
    a ``.png`` suffix. Every entry of ``score_content["systems"]`` is cropped
    with its ``bounding_box`` and saved under ``new_dataset_path`` as
    ``<score stem>_<id>.png``; its ``**kern`` text is written next to it as
    ``<score stem>_<id>.kern``. Once all scores are processed,
    ``splits/<partition>_0.txt`` is written with one
    ``<kern path> <png path>`` line per region, with ``"../"`` stripped from
    both paths.

    Args:
        scores: Iterable of ``pathlib.Path`` objects pointing at score JSON files.
        partition: Partition name used as the prefix of the split file name.

    Raises:
        KeyError: If a score has no ``"systems"`` entry, or a region lacks
            ``"bounding_box"``, ``"id"`` or ``"**kern"``.
    """
    partition_files = []
    for score in tqdm(scores):
        # Only keep the JSON file open for as long as it takes to parse it.
        with open(score, "r", encoding="utf-8") as score_file:
            score_content = json.load(score_file)

        # Open the page image once per score (not once per region) and make
        # sure its file handle is released when all regions are cropped.
        with Image.open(score.with_suffix(".png")) as page_image:
            for region in score_content["systems"]:
                bounding_box = region["bounding_box"]
                region_id = region["id"]
                kern_content = region["**kern"]

                region_stem = f"{score.stem}_{region_id}"
                kern_path = new_dataset_path / f"{region_stem}.kern"

                crop_region = extract_region_image(page_image, bounding_box)
                crop_region.save(new_dataset_path / f"{region_stem}.png")

                # Save the kern content next to the cropped image.
                with open(kern_path, "w", encoding="utf-8") as kern_file:
                    kern_file.write(kern_content)

                partition_files.append(kern_path)

    new_dataset_split_path = new_dataset_path / "splits"
    new_dataset_split_path.mkdir(parents=True, exist_ok=True)

    split_file_path = new_dataset_split_path / f"{partition}_0.txt"
    with open(split_file_path, "w", encoding="utf-8") as split_file:
        for kern_path in partition_files:
            # Normalize the paths: forward slashes on every platform, and no
            # "../" segments.
            kern_entry = kern_path.as_posix().replace("../", "")
            image_entry = kern_path.with_suffix(".png").as_posix().replace("../", "")
            split_file.write(f"{kern_entry} {image_entry}\n")


# Export the regions of each partition, in the order test -> val -> train.
for partition_scores, partition_name in (
    (test_samples, "test"),
    (val_samples, "val"),
    (train_samples, "train"),
):
    get_regions(partition_scores, partition_name)

In [None]:
from pathlib import Path

from PIL import Image


# Re-encode every region PNG in place: each file is loaded and written back to
# the same path in PNG format.
# NOTE(review): the motivation for this pass is not stated in the notebook —
# presumably it normalises the files written by the export step; confirm.
new_dataset_path = Path("../data/jazzmus_dataset_regions/")
files = list(new_dataset_path.glob("*.png"))

for png_path in files:
    # Read the pixel data fully and release the file handle before the same
    # path is reopened for writing.
    with Image.open(png_path) as image:
        image.load()
    image.save(png_path, "PNG")