In [None]:
import os
import pandas as pd
import networkx as nx
import pickle
import numpy as np
import shutil
import matplotlib.pyplot as plt
from pandas.core.common import flatten
import numpy as np
import glob
from tqdm import tqdm
from pathlib import Path
import json
import random


In [None]:
# Root directory that is scanned for annotation JSON files.
# NOTE(review): the export cell strips 18 characters from each stored image
# path, which equals len(dataset_path) + 1 -- confirm before changing this.
dataset_path = "/datasets/ricordi"

In [None]:
def data_json(dataset_path):
    """Collect annotation JSON paths below ``dataset_path``.

    Only two fixed depths are inspected:

    * ``<dataset_path>/<folder>/<name>.json``
    * ``<dataset_path>/<folder>/<subfolder>/<entry>/<name>.json``

    JSON files at any other depth are not picked up.

    Parameters
    ----------
    dataset_path : str
        Root directory; it is concatenated with glob patterns, so it must be a string.

    Returns
    -------
    list of str
        Flat list of the matching paths, in the order ``glob`` yields them.
    """
    json_paths = []
    for top_level in glob.glob(dataset_path + '/*'):
        for entry in glob.glob(top_level + '/*'):
            if os.path.isdir(entry):
                # descend exactly two more levels and keep the JSON files found there
                for nested in glob.glob(entry + '/*'):
                    json_paths.extend(glob.glob(nested + '/*.json'))
            elif entry.endswith('.json'):
                json_paths.append(entry)
    print("Total json files:", len(json_paths))
    return json_paths

In [None]:
def sort_by_key(data,key):
    """Return the JSON files in ``data`` whose decoded content contains ``key``.

    Despite the name nothing is sorted: the input order is preserved and files
    that lack ``key`` are simply dropped.

    Parameters
    ----------
    data : iterable of path-like
        Paths of JSON files to inspect.
    key : str
        Key that must be present in the decoded JSON document.

    Returns
    -------
    list
        The elements of ``data`` whose JSON content contains ``key``.

    Raises
    ------
    OSError, json.JSONDecodeError
        Propagated unchanged when a file cannot be opened or parsed.
    """
    dataset = []
    for json_path in data:
        # the context manager closes every handle; the previous bare open()
        # leaked one file descriptor per inspected file
        with open(json_path) as f:
            json_data = json.load(f)
        if key in json_data:
            dataset.append(json_path)
    # report the key that was actually requested (identical output for "annotazione1")
    print(f"Json with {key} present:", len(dataset))
    return dataset

In [None]:
# Pairs of class ids: 2 with 15, and 7 with 9.
# NOTE(review): presumably the pairs that are pooled into one class later on -- confirm.
dmerge_classes = [[2, 15], [7, 9]]

# class id -> display name; dict order defines the order of every derived list
_relevant_names_by_id = {
    5: "Pause (full or almost)",
    6: "Single note (with at least the head)",
    7: "Multiple Notes (with at least the head)",
    8: "Single chord (with at least heads)",
    9: "Multiple chords (with at least heads)",
    10: "Accidental(s) (whole or nearly so)",
    11: "Key(s) (whole(s) or nearly)",
    12: "Embellishment(s) (whole(s) or nearly)",
    14: "More categories (with at least one musical sign)",
    16: "Other (with at least one musical sign)",
}
_irrelevant_names_by_id = {
    0: "Page border",
    1: "Erasure",
    2: "Blurr",
    3: "Printed Text",
    4: "Manuscript Text",
    15: "More categories (no musical signs)",
    17: "Other (without musical markings)",
}

# Class ids, relevant ones first
classes_relevant = list(_relevant_names_by_id)
classes_irrelevant = list(_irrelevant_names_by_id)
classes = classes_relevant + classes_irrelevant

# Class names, position-aligned with the id lists above
class_names_relevant = list(_relevant_names_by_id.values())
class_names_irrelevant = list(_irrelevant_names_by_id.values())
class_names = class_names_relevant + class_names_irrelevant

In [None]:
# Gather every JSON file of the dataset, then keep the ones holding "annotazione1".
json_list = data_json(dataset_path=dataset_path)
sorted_list_by_key = sort_by_key(data=json_list, key="annotazione1")

In [None]:
def plot_distribution_by_class(sorted_list_by_key, classes, key):
    """Count the annotated files per class and draw the counts as a bar chart.

    Parameters
    ----------
    sorted_list_by_key : iterable of path-like
        JSON files; each one must contain ``key``.
    classes : iterable of int
        Class ids to count.
    key : str
        JSON field holding the class id of a file.

    Returns
    -------
    list of int
        Counts ordered like ``classes_relevant + classes_irrelevant``, i.e.
        indexed by POSITION in that ordering and not by class id.

    Notes
    -----
    The bar order, colours and tick labels come from the module-level
    ``classes_relevant``, ``classes_irrelevant`` and ``class_names``.
    Files whose label is not an integer found in ``classes`` are skipped and
    reported instead of aborting the whole count with a ``KeyError``.
    """
    count = {i: 0 for i in classes}
    skipped = 0

    for json_path in sorted_list_by_key:
        with open(json_path) as f:
            json_data = json.load(f)
        # cast like the export loop does, so that a label stored as "5" is
        # counted as class 5 instead of raising KeyError
        try:
            class_index = int(json_data[key])
        except (TypeError, ValueError):
            class_index = None
        if class_index not in count:
            skipped += 1
            continue
        count[class_index] += 1

    print(count)
    if skipped:
        print("Skipped files with a label outside `classes`:", skipped)

    # reorder classes, relevant first
    ordered = [count[i] for i in classes_relevant] + [count[i] for i in classes_irrelevant]
    # only the first bar of each group carries a legend entry
    labels = (
        ["Relevant"] + [None] * (len(classes_relevant) - 1)
        + ["Irrelevant"] + [None] * (len(classes_irrelevant) - 1)
    )
    colors = ["tab:red" for _ in classes_relevant] + ["tab:blue" for _ in classes_irrelevant]

    plt.figure(figsize=(10, 6))  # Set the size of the figure

    x = np.arange(len(classes))  # Generate an array of class indices
    # must run before plt.bar creates the axes, otherwise the tick font is unaffected
    plt.rcParams.update({'font.size': 8})
    plt.bar(x, ordered, label=labels, color=colors)
    plt.xticks(x, class_names, rotation=45, ha='right')  # Set custom x-axis tick positions and labels
    plt.xlabel("Classes")
    plt.ylabel("Count")
    plt.title("Class Count")
    plt.legend()
    plt.show()

    return ordered

In [None]:
# Draw the per-class histogram and keep the counts (relevant classes first).
count = plot_distribution_by_class(
    sorted_list_by_key=sorted_list_by_key,
    classes=classes,
    key="annotazione1",
)

In [None]:
# Merge classes whose cardinality is below 0.75 of the median cardinality.
# NOTE(review): this comment used to say 0.5 while the code has always used
# 0.75 -- confirm which threshold is intended.
#
# `count`, `count_` and `class_names` are ordered like `classes` (relevant
# first), so they are indexed by POSITION and not by class id. `merged`
# therefore holds positions as well.
merged = []
m = np.median(count)
print("Median is", m)
print("Mean is", np.mean(count))
print("Min is", np.min(count))

# The class-id pairs in `dmerge_classes` (7 with 9, 2 with 15) are given the
# same cardinality (the sum of the two) so that both members end up in `merged`
# together. Class ids are translated to positions first: indexing `count_` with
# the raw ids pooled unrelated classes (position 9 is class 16, position 7 is
# class 12, position 2 is class 7).
# NOTE(review): any later cell that de-duplicates the pooled pairs must use
# these positions as well, not the raw class ids.
count_ = np.copy(count)
for id_a, id_b in dmerge_classes:
    pos_a, pos_b = classes.index(id_a), classes.index(id_b)
    count_[pos_a] = count_[pos_b] = count_[pos_a] + count_[pos_b]

threshold = 0.75 * m
for i, c in enumerate(count_):
    if c < threshold:
        merged.append(i)

print("Merging classes:", [class_names[i] for i in merged])
# sum the un-pooled counts: a pooled pair is then counted exactly once
print("Total of `Remaining` class: ", sum(count[i] for i in merged))

In [None]:
# Bar chart of the class cardinalities once the small classes listed in
# `merged` are collapsed into a single "Remaining" class.
plt.figure(figsize=(10, 6))  # Set the size of the figure

# positions (into `count_` / `class_names`) of the classes that stay on their own
kept = [i for i in range(len(class_names)) if i not in merged]
count__ = [count_[i] for i in kept]
# "Remaining" is the total of the merged classes. The raw `count` is summed
# instead of `count_`, because `count_` stores the pooled total in both members
# of a forced pair. This avoids hard-coding which positions must be skipped to
# prevent double counting.
count__.append(sum(count[i] for i in merged))
class_names_ = [class_names[i] for i in kept]
class_names_.append("Remaining")
# NOTE(review): a forced pair that is NOT merged still shows up as two bars,
# each carrying the pooled total -- confirm that this is the intended display.

x = np.arange(len(class_names_))  # Generate an array of class indices
plt.rcParams.update({'font.size': 8})
plt.bar(x, count__)
plt.xticks(x, class_names_, rotation=45, ha='right')  # Set custom x-axis tick positions and labels
plt.xlabel("Classes")
plt.ylabel("Count")
plt.title("Class Count")
plt.show()
print("Median now is", np.median(count__))
print("Mean now is", np.mean(count__))
print("Min now is", np.min(count__))
print("Cardinalities:", list(zip(class_names_, count__)))

In [None]:
from pathlib import Path

# Physical copies of the images go here
source_dataset_path = Path('./data/source/')

# Binary dataset: one folder per relevance group
binary_dataset_path = Path('./data/binary_dataset')
relevant_path = binary_dataset_path.joinpath('data', 'relevant')
irrelevant_path = binary_dataset_path.joinpath('data', 'irrelevant')

# Multiclass dataset root; it is not created in this cell
multiclass_dataset_path = Path('./data/multiclass_dataset').joinpath('data')

# Create the folders that must exist up front (no error if already there)
for _folder in (source_dataset_path, relevant_path, irrelevant_path):
    _folder.mkdir(parents=True, exist_ok=True)

In [None]:
from shutil import copyfile


def _symlink_relative(link_path, source_path):
    """Create ``link_path`` as a relative symlink pointing at ``source_path``.

    Nothing is done when the link already exists with the same target, so the
    cell can be re-run. Any other pre-existing entry still raises
    ``FileExistsError``, which keeps file-name collisions visible.
    """
    # note relative_to doesn't work for an issue in python 3.9 https://bugs.python.org/issue40358
    target = os.path.relpath(source_path, start=link_path.parent)
    if link_path.is_symlink() and os.readlink(link_path) == target:
        return
    link_path.symlink_to(target)


# Copy every annotated image into `source_dataset_path`, then symlink the copy
# into the binary dataset and into the multiclass dataset.
for json_file in tqdm(Path(dataset_path).glob("**/*.json")):
    with open(json_file, 'r') as f:
        data = json.load(f)
    if "annotazione1" not in data:
        continue
    label = int(data["annotazione1"])

    # Decide the binary group first. An unknown label is reported and skipped
    # (nothing is copied for it); previously the loop went on with an undefined
    # or stale `binary_path`.
    if label in classes_irrelevant:
        binary_dir = irrelevant_path
    elif label in classes_relevant:
        binary_dir = relevant_path
    else:
        print("Unknown label!", label, "in", json_file, "- skipped")
        continue

    # NOTE(review): drops the first 18 characters of the stored path, which is
    # len(dataset_path) + 1 for '/datasets/ricordi' -- presumably the dataset
    # root prefix; confirm against the JSON files.
    png_path = Path(data["path"][18:])
    # compute the name of the physical copy
    copied_path = source_dataset_path / png_path
    # copy the file
    copied_path.parent.mkdir(exist_ok=True, parents=True)
    copyfile(Path(dataset_path) / png_path, copied_path)

    # symlink into the binary dataset
    binary_path = binary_dir / png_path.name
    _symlink_relative(binary_path, copied_path)

    # symlink to the multiclass dataset
    # N.B. handle merged classes: `merged` holds positions in `classes`
    # (it is built by enumerating the counts), so the class id is translated
    # to its position before the membership test
    if classes.index(label) in merged:
        class_name = "Remaining"
    elif label in [9, 7]:
        class_name = "Multiple notes or chords"
    elif label in [15, 2]:
        class_name = "Blurr or multiple categories (no music signs)"
    else:
        class_name = class_names[classes.index(label)]
    class_path = multiclass_dataset_path / class_name
    class_path.mkdir(exist_ok=True, parents=True)
    multiclass_name = class_path / png_path.name
    _symlink_relative(multiclass_name, copied_path)

In [None]:
# Report how many PNG entries each generated dataset contains.
for _message, _dataset_root in (
    ("Files in binary dataset:", binary_dataset_path),
    ("Files in multiclass dataset:", multiclass_dataset_path),
):
    print(_message, sum(1 for _png in _dataset_root.glob("**/*.png")))