This notebook ultimately outputs a subset of the Rijksmuseum dataset, in the form of a CSV file containing each image file name and its corresponding material.

The Rijksmuseum set is very unbalanced: although there are 400 material classes, 84% of the items fall within the 'papier' (paper) class. The script below outputs a subset that is much more balanced.

Moreover, a small fraction of the collection has multiple materials, and another fraction has none at all. Items in either group are removed from the dataset.

In [None]:
import os
import random
import pandas as pd

In [None]:
# Folder containing all the xml metadata files. This is a hard-coded absolute
# path: change it to the local copy of the metadata before running.
xmlPath = "/home/vincent/Documenten/BachelorsProject/Rijksdata/xml/"

In [None]:
# Takes the path of an xml metadata file as input, and outputs the
# list of materials that file specifies
def extractMaterials(xmlFile):
    """Return every material named in the xml metadata file at `xmlFile`.

    A material is the text that follows "<dc:format>materiaal: " up to the
    next "<". The file is scanned as plain text rather than parsed as xml.

    Args:
        xmlFile: Path of the xml metadata file to read.

    Returns:
        List of material strings in order of appearance; empty when the file
        names no material.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(xmlFile, encoding="utf-8") as f:
        xmlStr = f.read()

    matchStr = "<dc:format>materiaal: "
    materials = []

    begin = xmlStr.find(matchStr)
    while begin != -1:
        start = begin + len(matchStr)
        end = xmlStr.find("<", start)
        if end == -1:
            # No closing tag: keep the remaining text instead of slicing
            # with -1, which would silently drop the last character.
            materials.append(xmlStr[start:])
            break
        materials.append(xmlStr[start:end])
        begin = xmlStr.find(matchStr, end)

    return materials

In [None]:
# Collecting a [file name, [materials]] pair for every regular file in the xml folder.
# NOTE(review): the name stored here is the metadata file's own name as returned
# by os.scandir -- confirm it matches the image file name expected downstream.
pairs_full = [
    [entry.name, extractMaterials(entry.path)]
    for entry in os.scandir(xmlPath)
    if entry.is_file()
]

In [None]:
# Keeping only the files that list exactly one material (dropping those with
# none or several), and storing that material directly instead of a list
pairs = [
    [name, materials[0]]
    for name, materials in pairs_full
    if len(materials) == 1
]

In [None]:
# Counting how many times every class occurs, most frequent class first:
def createHist(pairs):
    # Tally the occurrences of each material (second element of every pair)
    counts = {}
    for entry in pairs:
        material = entry[1]
        counts[material] = counts.get(material, 0) + 1

    # Turn the tally into [material, count] rows, largest count first;
    # classes with equal counts stay in order of first appearance
    rows = [[material, count] for material, count in counts.items()]
    return sorted(rows, key=lambda row: row[1], reverse=True)

# Histogram over all single-material pairs
hist = createHist(pairs)

In [None]:
# Prints one "material, count, percentage" line per histogram row, where the
# percentage is the row's share of the summed counts
def printHist(hist):
    total = sum(row[1] for row in hist)

    for row in hist:
        print(f"{row[0]}, {row[1]}, {(100 * row[1] / total):>0.1f}%")

# Print the class distribution of the full single-material set
printHist(hist)

In [None]:
# Creating a subset where there is a maximum to the number of instances per class.
# If a class has more than this maximum, a random sample is picked.
# Moreover, there is a maximum number of classes: only the first 'num_classes'
# rows of 'hist' (which is sorted by descending count) are considered.
# NOTE(review): no seed is set in the visible code, so the subset differs
# between runs -- seed 'random' first if reproducibility matters.
max_instances = 1000
num_classes = 30

# Splitting the first 'num_classes' into a set of small enough ones and too big ones:
good_sized_classes = [row[0] for row in hist[:num_classes] if row[1] <= max_instances]
too_big_classes    = [row[0] for row in hist[:num_classes] if row[1] >  max_instances]

# Already adding all instances of 'good_sized_classes':
pairs_subset = [pair for pair in pairs if pair[1] in good_sized_classes]

# Adding 'max_instances' random samples of the too big classes.
# The temporary list is deliberately not called 'all': that name would replace
# the builtin all() for every later cell.
for material in too_big_classes:
    candidates = [pair for pair in pairs if pair[1] == material]
    # Every too big class holds more than 'max_instances' pairs, so the
    # sample size never exceeds the population.
    pairs_subset += random.sample(candidates, max_instances)

# Finally, randomly shuffling the subset:
random.shuffle(pairs_subset)

In [None]:
# Histogram of the balanced subset
hist_subset = createHist(pairs_subset)

# Report the number of classes actually present in the subset: this is lower
# than 'num_classes' when the full histogram holds fewer classes than that.
print(f"SUBSET WITH {len(hist_subset)} CLASSES, AND {len(pairs_subset)} ELEMENTS\n")
printHist(hist_subset)

In [None]:
# Writing the subset to 'subset_data.csv', one row per selected file.
# to_csv is called with its defaults, so the DataFrame index is written as
# well, as the unnamed first column.
subsetDf = pd.DataFrame({
    "jpg":      [name for name, _ in pairs_subset],
    "material": [material for _, material in pairs_subset],
})

subsetDf.to_csv("subset_data.csv")

In [None]:
# Writing the subset's histogram to 'subset_hist_data.csv', one row per class
# in the order of 'hist_subset'. to_csv is called with its defaults, so the
# DataFrame index is written as well, as the unnamed first column.
subsetHistDf = pd.DataFrame({
    "material": [material for material, _ in hist_subset],
    "count":    [count for _, count in hist_subset],
})

subsetHistDf.to_csv("subset_hist_data.csv")