This notebook ultimately outputs a subset of the Rijksmuseum dataset, in the form of a CSV file containing each image file name and its corresponding material.

The Rijksmuseum set is very unbalanced: while there are 400 material classes, 84% of the objects fall within the `papier` class. The script below outputs a subset that is much more balanced.

Moreover, a small fraction of the collection has multiple materials, and another fraction has none at all. These are removed from the dataset.

In [3]:
import os
import random
import pandas as pd

In [4]:
# Folder containing all the xml metadata files.
# NOTE: this is a machine-specific absolute path; point it at your own copy
# of the metadata folder before running the cells below.
xmlPath = "/home/vincent/Documenten/BachelorsProject/Rijksdata/xml/"

In [5]:
# Takes the path of an xml metadata file as input, and outputs the
# list of materials it specifies
def extractMaterials(xmlFile):
    """Return every material listed in the xml metadata file at `xmlFile`.

    A material is the text between the marker "<dc:format>materiaal: " and
    the next "<" that follows it.

    Args:
        xmlFile: Path of the xml metadata file to read.

    Returns:
        A list with the material strings in order of appearance. The list is
        empty when the file contains no material marker.
    """
    # Decode explicitly instead of relying on the platform default, because
    # material names may contain non-ASCII characters.
    # NOTE(review): assumes the metadata files are UTF-8 encoded - confirm.
    with open(xmlFile, encoding="utf-8") as f:
        xmlStr = f.read()

    matchStr = "<dc:format>materiaal: "
    materials = []

    begin = xmlStr.find(matchStr)
    while begin != -1:
        start = begin + len(matchStr)
        end = xmlStr.find("<", start)
        if end == -1:
            # Unterminated entry (no "<" after the marker): stop here rather
            # than treating the remainder of the file as a material name.
            break
        materials.append(xmlStr[start:end])
        begin = xmlStr.find(matchStr, end)

    return materials

In [6]:
# Getting all ("image-filename", [materials]) pairs.
# Only "*.xml" files are considered, and only the trailing extension is
# swapped for ".jpg" (a ".xml" elsewhere in the name is left untouched).
# The 'with' block closes the directory iterator once the list is built.
with os.scandir(xmlPath) as entries:
    pairs_full = [
        [entry.name[:-len(".xml")] + ".jpg", extractMaterials(entry.path)]
        for entry in entries
        if entry.is_file() and entry.name.endswith(".xml")
    ]

In [7]:
# Keep only the files that list exactly one material (dropping those with
# none and those with several), and unwrap that material from its list
pairs = [
    [jpg_name, materials[0]]
    for jpg_name, materials in pairs_full
    if len(materials) == 1
]

In [9]:
# Builds a histogram of how often each class (the second element of every
# pair) occurs, as a list of [class, count] rows sorted by descending count
def createHist(pairs):
    counts = {}
    for entry in pairs:
        label = entry[1]
        counts[label] = counts.get(label, 0) + 1

    # The sort is stable, so classes with equal counts stay in the order in
    # which they were first encountered:
    rows = [[label, count] for label, count in counts.items()]
    return sorted(rows, key=lambda row: row[1], reverse=True)

# Histogram over the single-material pairs, most frequent material first
hist = createHist(pairs)

In [10]:
# Prints one "material, count, percentage" line per histogram row, where the
# percentage is the row's share of the summed counts of all rows
def printHist(hist):
    total = sum(row[1] for row in hist)

    for row in hist:
        print(f"{row[0]}, {row[1]}, {(100 * row[1] / total):>0.1f}%")

# Show the class distribution of the single-material pairs
printHist(hist)

papier, 88298, 91.4%
porselein, 1626, 1.7%
zilver, 1232, 1.3%
faience, 1009, 1.0%
hout, 507, 0.5%
brons, 372, 0.4%
glas (materiaal), 349, 0.4%
perkament, 294, 0.3%
geprepareerd papier, 255, 0.3%
fotopapier, 239, 0.2%
ijzer, 203, 0.2%
Japans papier, 201, 0.2%
ivoor, 180, 0.2%
Oosters papier, 150, 0.2%
eikenhout, 133, 0.1%
terracotta, 107, 0.1%
aardewerk, 88, 0.1%
zijde, 80, 0.1%
koper, 70, 0.1%
messing, 64, 0.1%
goud, 63, 0.1%
klei, 60, 0.1%
tin, 59, 0.1%
karton, 57, 0.1%
steengoed, 55, 0.1%
satijn, 47, 0.0%
kardoespapier, 40, 0.0%
palmhout, 39, 0.0%
paneel, 39, 0.0%
lood (materiaal), 37, 0.0%
wit marmer, 36, 0.0%
linnen, 35, 0.0%
katoen, 33, 0.0%
zandsteen, 32, 0.0%
chine collé, 32, 0.0%
marmer, 29, 0.0%
wol, 23, 0.0%
kraakporselein, 23, 0.0%
notenhout, 19, 0.0%
andesiet, 16, 0.0%
leer, 16, 0.0%
lindehout, 13, 0.0%
stucwerk, 13, 0.0%
blik, 13, 0.0%
albast, 13, 0.0%
Chinees papier, 13, 0.0%
olieverf, 12, 0.0%
schildpad, 12, 0.0%
kalksteen, 12, 0.0%
pijpaarde, 12, 0.0%
geprepareerd linne

In [30]:
# Creating a subset where there is a maximum to the number of instances per class.
# If a class has more than this maximum, a random sample is picked.
# Moreover, there is a maximum number of classes.
# NOTE: no random seed is set in this cell, so unless one is set elsewhere
# every run yields a different sample and a different row order.
max_instances = 1000
num_classes = 30

# Splitting the first 'num_classes' classes (the most frequent ones, since
# 'hist' is sorted by descending count) into small enough ones and too big ones:
top_classes = hist[:num_classes]
good_sized_classes = [row[0] for row in top_classes if row[1] <= max_instances]
too_big_classes = [row[0] for row in top_classes if row[1] > max_instances]

# Already adding all instances of 'good_sized_classes'
# (a set makes the per-pair membership test constant-time):
good_sized_set = set(good_sized_classes)
pairs_subset = [pair for pair in pairs if pair[1] in good_sized_set]

# Adding 'max_instances' random samples of each of the too big classes.
# The temporary list is deliberately not called 'all', which would shadow
# the builtin of the same name for every later cell:
for material in too_big_classes:
    class_pairs = [pair for pair in pairs if pair[1] == material]
    random.shuffle(class_pairs)
    pairs_subset += class_pairs[:max_instances]

# Finally, randomly shuffling the subset so the classes are interleaved:
random.shuffle(pairs_subset)

In [31]:
# Histogram of the balanced subset, most frequent material first
hist_subset = createHist(pairs_subset)

# Report the number of classes actually present in the subset: this equals
# 'num_classes' only when the full histogram has at least that many classes.
print(f"SUBSET WITH {len(hist_subset)} CLASSES, AND {len(pairs_subset)} ELEMENTS\n")
printHist(hist_subset)

SUBSET WITH 30 CLASSES, AND 7788 ELEMENTS

porselein, 1000, 12.8%
papier, 1000, 12.8%
zilver, 1000, 12.8%
faience, 1000, 12.8%
hout, 507, 6.5%
brons, 372, 4.8%
glas (materiaal), 349, 4.5%
perkament, 294, 3.8%
geprepareerd papier, 255, 3.3%
fotopapier, 239, 3.1%
ijzer, 203, 2.6%
Japans papier, 201, 2.6%
ivoor, 180, 2.3%
Oosters papier, 150, 1.9%
eikenhout, 133, 1.7%
terracotta, 107, 1.4%
aardewerk, 88, 1.1%
zijde, 80, 1.0%
koper, 70, 0.9%
messing, 64, 0.8%
goud, 63, 0.8%
klei, 60, 0.8%
tin, 59, 0.8%
karton, 57, 0.7%
steengoed, 55, 0.7%
satijn, 47, 0.6%
kardoespapier, 40, 0.5%
palmhout, 39, 0.5%
paneel, 39, 0.5%
lood (materiaal), 37, 0.5%


In [32]:
# Writing the subset to "subset_data.csv": one row per image, with the image
# file name in column "jpg" and its material in column "material"
subsetDf = pd.DataFrame(pairs_subset, columns=["jpg", "material"])

subsetDf.to_csv("subset_data.csv", index=False)

In [33]:
# Writing the subset's histogram to "subset_hist_data.csv": one row per
# material, with its name in column "material" and its frequency in "count"
subsetHistDf = pd.DataFrame(hist_subset, columns=["material", "count"])

subsetHistDf.to_csv("subset_hist_data.csv", index=False)