Packages and folders

In [6]:
ROOT = "C:\\OneDrive - Netherlands eScience Center\\Project_Art\\ART-projects\\"
PATH_images = ROOT + ""
PATH_metadata = ROOT + "Data\\rijksxml\\"


In [11]:
import numpy
import os
import csv 
import xml.etree.ElementTree as ET 
import fnmatch

In [12]:
dirs = os.listdir(PATH_metadata)
xml_files = fnmatch.filter(dirs, "*.xml")

In [14]:
xml_files[:10]

['0000001_SK-A-4878.xml',
 '0000002_SK-A-4877.xml',
 '0000003_SK-A-4881.xml',
 '0000004_RP-P-1992-35.xml',
 '0000005_RP-P-1992-36.xml',
 '0000006_RP-P-1992-68.xml',
 '0000007_RP-P-1992-80.xml',
 '0000008_RP-P-1992-70.xml',
 '0000009_BK-NM-4821.xml',
 '0000010_BK-NM-10025.xml']

## Import metadata
I had issues using standard XML importers (mostly due to special charcters).  
So here a simple importer function to extract the metadata.

In [203]:
def weird_xml_reader(filename):
    """
    Clumsy, handwritten importer function...
    """
    material = []
    art_type = []
    technique = []
    date = []
    with open(filename, "r", errors='replace') as xmlfile:
        for i, line in enumerate(xmlfile):
            if '<dc:format>materiaal' in line: 
                material.append(line.rstrip().split('<dc:format>')[1].split('</dc:format>')[0].split(' ')[1])
            if '<dc:format>techniek' in line: 
                technique.append(line.rstrip().split('<dc:format>')[1].split('</dc:format>')[0].split(' ')[1])
            if '<dc:date>' in line: 
                date.append(line.rstrip().split('<dc:date>')[1].split('</dc:date>')[0])
            if '<dc:type>' in line: 
                art_type.append(line.rstrip().split('<dc:type>')[1].split('</dc:type>')[0])
                
    return material, art_type, technique, date             

In [190]:
weird_xml_reader(PATH_metadata + xml_files[16])

(['aardewerk'], ['drinkgerei', 'bokaal'], [], ['-2000 -  -1000'])

### Collect metadata from all files

In [204]:
list_metadata = []

for i, file in enumerate(xml_files):
    list_metadata.append(weird_xml_reader(PATH_metadata + file))

In [205]:
len(list_metadata)

112039

In [206]:
list_material, list_art_type, list_technique, list_date = zip(*list_metadata)

### Materials

In [209]:
list_material[:10]

(['olieverf', 'doek'],
 ['papier', 'olieverf', 'hout'],
 ['olieverf', 'doek'],
 ['papier', 'dekverf'],
 ['papier', 'dekverf'],
 ['papier'],
 ['papier'],
 ['papier'],
 ['steengoed', 'glazuur'],
 ['steengoed', 'glazuur'])

In [223]:
unique_materials = []
#flatten the list
for x in list_material:
    for y in x:
        unique_materials.append(y)

unique_materials = list(set(unique_materials))

In [224]:
print("Number of found categories: ",len(unique_materials))

Number of found categories:  405


In [227]:
print(unique_materials[:100])

['', 'drukinkt', 'koord', 'bombazijn', 'dekverf', 'zilverdraad', 'mineraal', 'celluloid', 'schelp', 'velijn', 'vuursteen', 'bruinharthout', 'chalcedoon', 'Delfts', 'wilgenhout', 'eigeel', 'soft', 'gouache', 'inkt', 'citrien', 'sentoku', 'email', 'fotopapier', 'bladzilver', 'lijm', 'gaas', 'knobbelhoorn', 'solnhofersteen', 'verf', 'roggenvel', 'rijstpapier', 'kastoor', 'aluminium', 'amboinahout', 'hardboard', 'ei', 'haaienvel', 'huid', 'crÃªpepapier', 'palissanderhout', 'pen', 'zeildoek', 'notenhout', 'peper', 'hertshoorn', 'goudpoeder', 'luster', 'watten', 'molton', 'smaragd', 'rijstpasta', 'lapis', 'naaldhout', 'ijzerdraad', 'slak', 'cipressenhout', 'spaanplaat', 'katoen', 'campÃªchehout', 'kurk', 'barnsteen', 'cederhout', 'toetssteen', 'chenille', 'metaal', 'vloszijde', 'meerschuim', 'tule', 'palmhout', 'kalksteen', 'tand', 'koper', 'lindehout', 'strohalm', 'waterverf', 'kobalt', 'fluweel', 'garen', 'ijzer', 'samiet', 'palmblad', 'zandsteen', 'maansteen', 'stopverf', 'spiegelglas', '

### Art type

In [228]:
list_art_type[:10]

(['schilderij'],
 ['schilderij'],
 ['schilderij'],
 ['prent', 'historieprent'],
 ['prent', 'historieprent'],
 ['prent'],
 ['prent'],
 ['prent'],
 ['stortebeker'],
 ['kruik'])

In [229]:
unique_art_types = []
#flatten the list
for x in list_art_type:
    for y in x:
        unique_art_types.append(y)

unique_art_types = list(set(unique_art_types))

In [230]:
print("Number of found categories: ",len(unique_art_types))

Number of found categories:  1822


In [231]:
print(unique_art_types[:100])

['koord', 'scheepskompas', 'deurgreep', 'knipsel', 'vaatwerk', 'wandarm', 'gedachtenislepel', 'kogelgiettang', 'molenbeker', 'vochtweger', 'contrazegelstempel', 'bonnet-coiffure', 'soepterrine', 'struisvogelei', 'tafelhorloge', 'kruitmaat', 'seingeweer', 'torchÃ¨re', 'slaapmuts', 'hyacintenvaas', 'kabeltouw', 'stola (liturgisch)', 'priem', 'trektafel', 'ijsbreker', 'tabakspot', 'kruiwagen', 'piano', 'reishorloge', 'pen', 'monstrans', 'snijbord', 'tabernakeldeur', 'schilderskist', 'waldhoorn', 'plengbeker', 'collier', 'kazuifel', 'egoyomi (kalenderblad)', 'raam', 'wijnvat', 'jas', 'wijzerplaat', 'geldkist', 'wijnglas', 'kussenblad', 'wapenrek', 'altaarhanger', 'tiara', 'kerkmeubel', 'olie- en azijnstel', 'mergboor', 'tweehander', 'naaigerei', 'portefeuille', 'schilderijenstandaard', 'misthoorn', 'suikertang', 'calÃ¨che', 'radslot', 'ceintuur', 'blad', 'zoutlepel', 'pistool', 'miniatuur', 'po', 'kuttrolf', 'zakpistool', 'snaphaangeweer', 'peper- en zoutstel', 'garenwinder', 'kinderledika