Packages and folders

In [1]:
ROOT = "C:\\OneDrive - Netherlands eScience Center\\Project_Art\\ART-projects\\"
PATH_images = ROOT + ""
PATH_metadata = ROOT + "Data\\rijksxml\\"


In [2]:
import numpy
import os
import csv 
#import xml.etree.ElementTree as ET 
import fnmatch

In [3]:
dirs = os.listdir(PATH_metadata)
xml_files = fnmatch.filter(dirs, "*.xml")

In [4]:
xml_files[:10]

['0000001_SK-A-4878.xml',
 '0000002_SK-A-4877.xml',
 '0000003_SK-A-4881.xml',
 '0000004_RP-P-1992-35.xml',
 '0000005_RP-P-1992-36.xml',
 '0000006_RP-P-1992-68.xml',
 '0000007_RP-P-1992-80.xml',
 '0000008_RP-P-1992-70.xml',
 '0000009_BK-NM-4821.xml',
 '0000010_BK-NM-10025.xml']

## Import metadata
I had issues using standard XML importers (mostly due to special charcters).  
So here a simple importer function to extract the metadata.

In [47]:
def weird_xml_reader(filename):
    """
    Clumsy, handwritten importer function...
    """
    material = []
    art_type = []
    technique = []
    date = []
    creator = []
    with open(filename, "r", errors='replace') as xmlfile:
        for i, line in enumerate(xmlfile):
            if '<dc:format>materiaal' in line: 
                material.append(line.split('<dc:format>')[1].split('</dc:format>')[0].split(' ')[1])
            if '<dc:format>techniek' in line: 
                technique.append(line.split('<dc:format>')[1].split('</dc:format>')[0].split(' ')[1])
            if '<dc:date>' in line: 
                date.append(line.split('<dc:date>')[1].split('</dc:date>')[0])
            if '<dc:type>' in line: 
                art_type.append(line.split('<dc:type>')[1].split('</dc:type>')[0])
            if '<dc:creator>' in line and 'uitgever' not in line: 
                #line.split('<dc:creator>')[1].split('</dc:creator>')[0]
                creator.append(line.split('<dc:creator>')[1].split('</dc:creator>')[0].split(': ')[1])
                
    return material, art_type, technique, date, creator   

In [48]:
weird_xml_reader(PATH_metadata + xml_files[4])

(['papier', 'dekverf'],
 ['prent', 'historieprent'],
 ['graveren', 'etsen', 'witte', 'penseel'],
 ['1555 -  1556'],
 ['Coornhert, Dirck Volckertsz', 'Bosch, Cornelis', 'Heemskerck, Maarten van'])

### Collect metadata from all files

In [49]:
list_metadata = []

for i, file in enumerate(xml_files):
    list_metadata.append(weird_xml_reader(PATH_metadata + file))

In [8]:
len(list_metadata)

112039

In [50]:
list_material, list_art_type, list_technique, list_date, list_creator = zip(*list_metadata)

### Artists / Creators

In [51]:
list_creator[:100]

(['Everdingen, Caesar BoÃ«tius van'],
 ['Maris, Matthijs'],
 ['Maes, Nicolaes'],
 ['Coornhert, Dirck Volckertsz', 'Heemskerck, Maarten van'],
 ['Coornhert, Dirck Volckertsz', 'Bosch, Cornelis', 'Heemskerck, Maarten van'],
 ['Bos, Balthazar van den', 'Lombard, Lambert'],
 ['Call, Jan van (I)', 'onbekend'],
 ['Hondius, Hendrick (I)', 'Republiek der Zeven Verenigde Nederlanden'],
 ['anoniem'],
 ['anoniem'],
 ['anoniem'],
 ['anoniem'],
 ['anoniem'],
 [''],
 [''],
 [''],
 ['anoniem'],
 ['Boissieu, Jean Jacques de'],
 ['Boissieu, Jean Jacques de'],
 ['Blondel, Jacques FranÃ§ois'],
 ['Blondel, Jacques FranÃ§ois'],
 ['Blarenberghe, Henri Joseph van'],
 ['Bouchardon, EdmÃ©'],
 ['Aubert, Louis'],
 ['Boitard, FranÃ§ois'],
 ['Boitard, FranÃ§ois'],
 ['Boitard, FranÃ§ois'],
 ['Boucher, FranÃ§ois'],
 ['Boucher, FranÃ§ois'],
 ['Boucher, FranÃ§ois'],
 ['Boucher, FranÃ§ois'],
 ['Boucher, FranÃ§ois'],
 ['Boucher, FranÃ§ois'],
 ['Boucher, FranÃ§ois'],
 ['Boucher, FranÃ§ois'],
 ['Boucher, FranÃ§ois'],
 ['B

In [52]:
import pandas as pd

creators_unique = []
creators_all = []

#flatten the list
for x in list_creator:
    for y in x:
        creators_all.append(y)

creators_unique = list(set(creators_all))
creators_occurences = pd.DataFrame(columns=['creator', 'occurences'])

for i, creator in enumerate(creators_unique):
    counts = creators_all.count(creator)
    creators_occurences.loc[i] = [creator, counts]

In [53]:
creators_occurences.sort_values(by=['occurences'], ascending=False)

Unnamed: 0,creator,occurences
2004,anoniem,23410
2249,Rembrandt Harmensz. van Rijn,3142
5468,"Luyken, Jan",2778
3934,"Picart, Bernard",2631
1146,onbekend,2181
1018,"Hooghe, Romeyn de",2016
10156,"Goltzius, Hendrick",1971
2305,"Fokke, Simon",1847
8184,"Callot, Jacques",1704
9410,"Gheyn, Jacob de (II)",1659


### Materials

In [11]:
list_material[150:170]

(['papier'],
 ['steengoed', 'glazuur'],
 ['papier'],
 ['papier'],
 ['papier'],
 ['papier'],
 ['papier'],
 ['papier'],
 ['steengoed', 'glazuur'],
 ['steengoed', 'glazuur'],
 ['hout', 'verfstof'],
 ['steen'],
 ['zandsteen'],
 ['zandsteen'],
 ['leisteen', 'verfstof', 'verguldsel'],
 ['leisteen'],
 ['brons'],
 ['marmer'],
 ['brons'],
 ['zandsteen'])

In [12]:
import pandas as pd

materials_unique = []
materials_all = []

#flatten the list
for x in list_material:
    for y in x:
        materials_all.append(y)

materials_unique = list(set(materials_all))
materials_occurences = pd.DataFrame(columns=['material', 'occurences'])

for i, material in enumerate(materials_unique):
    counts = materials_all.count(material)
    materials_occurences.loc[i] = [material, counts]

In [15]:
materials_occurences.sort_values(by=['occurences'], ascending=False)

Unnamed: 0,material,occurences
393,papier,92785
384,olieverf,3667
366,porselein,2811
2,hout,2166
209,zilver,1791
213,doek,1753
111,paneel,1588
343,glazuur,1423
276,faience,1307
151,inkt,1038


### Art type

In [18]:
list_art_type[:10]

(['schilderij'],
 ['schilderij'],
 ['schilderij'],
 ['prent', 'historieprent'],
 ['prent', 'historieprent'],
 ['prent'],
 ['prent'],
 ['prent'],
 ['stortebeker'],
 ['kruik'])

In [21]:
#import pandas as pd

art_types_unique = []
art_types_all = []

#flatten the list
for x in list_art_type:
    for y in x:
        art_types_all.append(y)

art_types_unique = list(set(art_types_all))
art_types_occurences = pd.DataFrame(columns=['art_type', 'occurences'])

for i, art_type in enumerate(art_types_unique):
    counts = art_types_all.count(art_type)
    art_types_occurences.loc[i] = [art_type, counts]

In [22]:
art_types_occurences.sort_values(by=['occurences'], ascending=False)

Unnamed: 0,art_type,occurences
1706,prent,77052
1480,tekening,14223
764,boekillustratie,7007
1586,ornamentprent,5221
1093,schilderij,3593
1245,historieprent,3503
1359,nieuwsprent,2598
640,foto,2319
178,ontwerp,1966
147,kaart,1923
