### Transfer the XML to CSV file

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

# Load the XML data
tree = ET.parse('metadata_record.xml')
root = tree.getroot()

# Define a function to extract field names and values
def extract_fields(record):
    fields = {}
    for child in record:
        fields[child.tag] = child.text.strip() if child.text else None
    return fields

# Extract data from each record and store in a list
data = []
for record in root.findall('record'):
    data.append(extract_fields(record))

# Convert the list of dictionaries to a DataFrame
df1 = pd.DataFrame(data)

# Display the DataFrame
df1.head()

Unnamed: 0,Institutional_Owner,Shelfmark,Title,Publisher,Creator,Type_of_Content,Date_of_Map,Technique,Subject,Annotation,...,PPN,Region_Keyword,Additional_Signature,Item_URL,OCLC_number,Date_created,Date_modified,CONTENTdm_number,CONTENTdm_file_name,CONTENTdm_file_path
0,,uklu 01-29-18,"Kaart van de provincie ""Stad en Lande"" in de l...",[Groningen : Hoitsema] ; Groningen : steendr. ...,"Koster, J.P.",,1874,,kb-1874-grp-senl-152 krt-1872-grp-senl,,...,169176452,4.210.230 Groningen (provincie) |,BACKER PORT 07 NO15 | BACKER LIJST r59 | ; krt...,http://cdm21053.contentdm.oclc.org/cdm/ref/col...,,2012-10-24,2012-10-24,801,6402.cpd,/Kaarten/image/6402.cpd
1,,uklu 01-04-50,"[Kaart van Friesland, Groningen, Drenthe en Ov...",[S.l. : s.n.],"Daventriensis, Jacobus",,18XX,,krt-1550-nne-facs,Volgens aantekening onder de kaart is de auteu...,...,154989061,4.210.200 Noord-Nederland | 4.210.270 Overijss...,BACKER PORT 11 NO05 | BB portef. XI nr. 5 | ; ...,http://cdm21053.contentdm.oclc.org/cdm/ref/col...,,2012-10-24,2012-10-24,803,6404.cpd,/Kaarten/image/6404.cpd
2,,uklu 01-13-06,"Groninga, opulenta populosa, et valide contra ...",[S.l. : s.n.],,,1575,,krt-1575-grs Kleurendia,,...,151800677,4.210.230 Groningen (stad) |,BACKER PORT 09 NO11 | BB portef. IX nr. 11 | ;...,http://cdm21053.contentdm.oclc.org/cdm/ref/col...,,2012-10-24,2021-07-14,805,6406.cpd,/Kaarten/image/6406.cpd
3,,uklu 01-01-35,"Icon ciuitatis Campensis : cuius situs, Isulam...",,"Braun, Georg",,1575,,krt-1575-ove krt-1575-ove-v krt-1575-ove-a ...,Genummerd op verso: 30 | Beschreven in: Koeman...,...,160344905,4.210.270 Kampen (Overijssel) |,krt-1575-ove,http://cdm21053.contentdm.oclc.org/cdm/ref/col...,,2012-10-24,2012-10-24,810,6411.cpd,/Kaarten/image/6411.cpd
4,,uklu 01-13-10,Goricum : nobile Hollandiae oppidum ; Broversa...,S.l. : s.n.,"Braun, Georg",,1576,,krt-1576-grs krt-1576-grs-v,Genummerd op verso: 21 | Beschreven in: Koeman...,...,154312789,4.210.230 Groningen (stad) | 4.210.410 Brouwer...,BACKER PORT 10 NO07 | BB portef. 10 nr. 7 | ; ...,http://cdm21053.contentdm.oclc.org/cdm/ref/col...,,2012-10-24,2012-10-24,813,6414.cpd,/Kaarten/image/6414.cpd


In [11]:
df1.to_csv('DCHM.csv', index=False)  # save the dataframe to CSV file

### Completeness Analysis

In [11]:
# Calculate the count of missing values in each column
missing_values = df1.isnull().sum()

# Sort the columns by the number of missing values in ascending order
sorted_columns = missing_values.sort_values()

# Print the columns in order of completeness
print("Columns sorted by completeness:")
sorted_columns

Columns sorted by completeness:


CONTENTdm_file_path           0
CONTENTdm_file_name           0
Date_created                  0
Date_modified                 0
CONTENTdm_number              0
Item_URL                      0
Title                         0
Date_of_Map                   2
Map_Format                    7
Shelfmark                    15
Pagination                   16
PPN                          21
Subject                      24
Source_Materials             37
Geographic_Location          67
Publisher                    75
Other_Annotation            126
Region_Keyword              137
Scale_of_Map                245
Creator                     323
Additional_Signature        351
Annotation                  377
Annotation_Library_Index    740
Annotation_Edition          779
OCLC_number                 963
Institutional_Owner         967
Distribution_Format         967
Date_of_Publication         967
Language                    967
Rights                      967
Type_of_Content             967
Techniqu

In [10]:
df1.columns

Index(['Institutional_Owner', 'Shelfmark', 'Title', 'Publisher', 'Creator',
       'Type_of_Content', 'Date_of_Map', 'Technique', 'Subject', 'Annotation',
       'Rights', 'Language', 'Date_of_Publication', 'Distribution_Format',
       'Geographic_Location', 'Scale_of_Map', 'Reference_System',
       'Mapping_Methods', 'Map_Format', 'Source_Materials', 'Pagination',
       'Annotation_Edition', 'Annotation_Library_Index', 'Other_Annotation',
       'PPN', 'Region_Keyword', 'Additional_Signature', 'Item_URL',
       'OCLC_number', 'Date_created', 'Date_modified', 'CONTENTdm_number',
       'CONTENTdm_file_name', 'CONTENTdm_file_path'],
      dtype='object')

In [15]:
# Calculate the count of missing values in each row
missing_values_rows = df1.isnull().sum(axis=1)

# Sort the DataFrame by the number of missing values in rows
sorted_df = df1.loc[missing_values_rows.sort_values().index]

# Print the top 10 most complete rows
print("Top 10 most complete rows:")
sorted_df.head(10)

Top 10 most complete rows:


Unnamed: 0,Institutional_Owner,Shelfmark,Title,Publisher,Creator,Type_of_Content,Date_of_Map,Technique,Subject,Annotation,...,PPN,Region_Keyword,Additional_Signature,Item_URL,OCLC_number,Date_created,Date_modified,CONTENTdm_number,CONTENTdm_file_name,CONTENTdm_file_path
967,UB Groningen,SYLLABUS 1619 177 (uklu RD--- 1),Europae Tabula IIII,[Amsterdam] : J. Hondius en C. Claesz],,map ; kaart,1605,,Frisia ; Romeinse oudheid ; West-Europa,Also in collection Syllabus: https://facsimile...,...,,Frisia,kb 1605-ned-et4-e-v,http://cdm21053.contentdm.oclc.org/cdm/ref/col...,809314499.0,2023-09-14,2023-09-14,2790,8391.cpd,/Kaarten/supp/2790/index.pdf
968,UB Groningen,uklu RG 1,Wereldkaart 1509,Parrhisijs ... : Henricum Stephanum,[door Gregor Reisch],map,1503,houtsnede,Aarde (planeet) ; Wereldkaarten,Zonder titel met vier windhoofden. Kaart naar ...,...,,,,http://cdm21053.contentdm.oclc.org/cdm/ref/col...,817536588.0,2024-02-21,2024-02-21,2791,8392.jp2,/Kaarten/image/8392.jp2
168,,uklu 01-28-18,Afbeeldinge der stadt Groningen met de omligge...,[S.l. : s.n.],"Haubois, E.",,1694,,krt-1694-grs-grp,Met oorspronkelijk impressum: [Groningen] : Sa...,...,151440972.0,4.210.230 Groningen (stad) | 4.210.250 Coevord...,BACKER PORT 09 NO29 | BB portef. IX nr. 29 | ;...,http://cdm21053.contentdm.oclc.org/cdm/ref/col...,,2012-10-24,2012-10-24,1224,6825.cpd,/Kaarten/image/6825.cpd
788,,Rol middel 003,Italia,Gotha : Haack,"Kampen, Albert",,1984,,Stokkaart,restricted file,...,326892508.0,Italie,4/600#i600,http://cdm21053.contentdm.oclc.org/cdm/ref/col...,,2017-01-23,2018-06-01,2605,8362.jp2,/Kaarten/image/8362.jp2
100,,uklu 02-11-30,Erythraei sive Rvbri Maris periplvs,Amstelodami : sumptibus Ioannis Ian?onii,"Ortelius, Abraham",,1658,,krt-1658-azi-esrm krt-1658-azi-esrm-v,"Beschreven in: Koeman, Atlantes Neerlandici, d...",...,157524140.0,5.200 Zuidwest-Azie | 3.200 Indische Oceaan |,krt-1658-azi-esrm,http://cdm21053.contentdm.oclc.org/cdm/ref/col...,,2012-10-24,2012-10-24,1077,6678.cpd,/Kaarten/image/6678.cpd
84,,uklu 01-28-17,Afbeeldinge der stadt Groningen met de omligge...,[Groningen] : Sam. Pieman excudit,"Haubois, E.",,1652,,krt-1652-grs-grp,"Beschreven in: Vredenberg-Alink, De kaarten va...",...,151432899.0,4.210.230 Groningen (stad) | 4.210.250 Coevord...,BACKER PORT 09 NO28 | BB portef. IX nr. 28 | ;...,http://cdm21053.contentdm.oclc.org/cdm/ref/col...,,2012-10-24,2012-10-24,1028,6629.cpd,/Kaarten/image/6629.cpd
912,,uklu rol 02,Kaart of landtafereel der provincie van Gronin...,[s.l. : s.n.],"Beckeringh, Theodorus",,1781,,krt 1781-grp-1-a | krt 1781-grp-2-a | krt 1781...,handgekleurd ex,...,79320783.0,Groningen (provincie),Uq---- 1 | BACKER kaartenkist | PROEXC P---- 2,http://cdm21053.contentdm.oclc.org/cdm/ref/col...,,2017-01-23,2017-01-23,2729,8184.jp2,/Kaarten/image/8184.jp2
378,,uklu 01-09-01,Kaart van de provincie Groningen : met een ged...,Te Groningen : bij J. Oomkens,"Jappe, J.H.",,1835,,krt-1835-grp-1 krt-1835-grp-2 krt-1835-grp-3...,"Beschreven in: J.J. Vredenburg-Alink, De kaart...",...,111205247.0,4.210.230 Groningen (provincie) | 4.210.210 Fr...,BB kaartenkist 5 | Uq---- 2 | ; krt-1835-grp-1,http://cdm21053.contentdm.oclc.org/cdm/ref/col...,,2012-10-24,2012-10-24,1728,7329.cpd,/Kaarten/image/7329.cpd
330,,uklu 01-09-04,Kaart of landtafereel der provincie van Gronin...,[s.l. : s.n.],"Beckeringh, Theodorus",,1781,,krt-1781-grp-1-a krt-1781-grp-2-a krt-1781-g...,Jaar van uitgave ontleend aan J.J. Vredenburg ...,...,79320783.0,4.210.230 Groningen (provincie) |,Uq---- 1 | BACKER kaartenkist | PROEXC P---- 2...,http://cdm21053.contentdm.oclc.org/cdm/ref/col...,,2012-10-24,2012-10-24,1611,7212.cpd,/Kaarten/image/7212.cpd
436,,uklu 01-10-03,Schoolkaart van de provincie Groningen,Groningen : bij J. Oomkens J.zoon,"Zuidema, E.R.",,1872,,krt-1872-grp-1 krt-1872-grp-2 krt-1872-grp-3...,Beschreven in: Catalogue of maps charts and pl...,...,123583802.0,4.210.230 Groningen (provincie) |,Uq---- 5 | ; krt-1872-grp-1,http://cdm21053.contentdm.oclc.org/cdm/ref/col...,,2012-10-24,2012-10-24,1875,7476.cpd,/Kaarten/image/7476.cpd


### Consistency Analysis

In [21]:
import pandas as pd
import numpy as np

# Count the correct values in the metadata file "Subject/Keywords."

df1['Value_Type'] = df1['Subject'].apply(lambda x: 'File Name' if isinstance(x, str) and '-' in x 
                                       else ('Keywords' if isinstance(x, str) else np.nan))

x = df1[['Subject','Value_Type']]
x

Unnamed: 0,Subject,Value_Type
0,kb-1874-grp-senl-152 krt-1872-grp-senl,File Name
1,krt-1550-nne-facs,File Name
2,krt-1575-grs Kleurendia,File Name
3,krt-1575-ove krt-1575-ove-v krt-1575-ove-a krt-1575-ove-v-a,File Name
4,krt-1576-grs krt-1576-grs-v,File Name
...,...,...
964,"Aurelius, Cornelius, c. 1460-1531 Geography Maps Leiden Divisiekroniek",File Name
965,Zuiderzee,Keywords
966,"Hogenberg, Frans, 1535-1590 Wedderborg Wedde Tachtigjarige oorlog",File Name
967,Frisia ; Romeinse oudheid ; West-Europa,File Name


In [22]:
value_type_counts = df1['Value_Type'].value_counts()
print(value_type_counts)

Value_Type
File Name    768
Keywords     177
Name: count, dtype: int64


### Count map Types

In [4]:
# Count occurrences of each unique value
value_counts = df1['Pagination'].value_counts()

# Print the counts
print(value_counts)

Pagination
1 krt                                                      475
1 wandkaart                                                109
1 plgr                                                      67
1 manuscriptkrt                                             48
1 kaart                                                     24
18 bl. pl                                                   14
1 wandplaat                                                 12
1 plattegrond                                               12
wandplaat                                                   10
64 wandpl. (6 afd.)                                          8
1 zijaanzicht                                                8
1 manuscriptkaart                                            7
1 vogelvluchtplgr                                            7
1 wandkaart in 4 bladen                                      6
6 wandpl                                                     5
Eerste serie (10 wandplaten)                

In [7]:
# Count occurrences of rows containing "krt" or "kaart"
count = df1['Pagination'].str.contains('krt|kaart', case=False, na=False).sum()

print(f"Total count of rows containing 'krt' or 'kaart': {count}")


Total count of rows containing 'krt' or 'kaart': 747


### Count maps in books

In [39]:
# Filter rows containing 'kb' or 'Kaart in boek' in any specified columns
filtered_df = df1[df1.apply(lambda row: row.astype(str).str.contains('Kaart in boek', regex=True).any(), axis=1)]

# Display only the specified columns
filtered_df = filtered_df[['Annotation_Edition', 'Annotation_Library_Index', 'Other_Annotation', 'Annotation', 'Subject']]
filtered_df.shape

(215, 5)

In [28]:
count = df.apply(lambda row: row.astype(str).str.contains('kb|Kaart in boek').any(), axis=1).sum()


In [29]:
count

339

In [31]:
filtered_df = df1[df1.apply(lambda row: row.astype(str).str.contains('kb|Kaart in boek').any(), axis=1)]
filtered_df.head()

Unnamed: 0,Institutional_Owner,Shelfmark,Title,Publisher,Creator,Type_of_Content,Date_of_Map,Technique,Subject,Annotation,...,PPN,Region_Keyword,Additional_Signature,Item_URL,OCLC_number,Date_created,Date_modified,CONTENTdm_number,CONTENTdm_file_name,CONTENTdm_file_path
0,,uklu 01-29-18,"Kaart van de provincie ""Stad en Lande"" in de laatste helft der 17e eeuw",[Groningen : Hoitsema] ; Groningen : steendr. v.d. Weijer,"Koster, J.P.",,1874,,kb-1874-grp-senl-152 krt-1872-grp-senl,,...,169176452,4.210.230 Groningen (provincie) |,BACKER PORT 07 NO15 | BACKER LIJST r59 | ; krt-1872-grp-senl,http://cdm21053.contentdm.oclc.org/cdm/ref/collection/Kaarten/id/801,,2012-10-24,2012-10-24,801,6402.cpd,/Kaarten/image/6402.cpd
27,,uklu 01-23-21,Franicher,[Amsterdam : Willem Jansz],,,1612,,krt-1612-frw-fra kb-1612-frw-fran-33,"Genummerd r.o.: 33 | Lit. : P.A.M. Boele van Hensbroek, Ludovico Guicciardini, Descrittione di tutti i Paesi Bassi. ... Bibliografische studie. In: Bijdragen en mededeelingen van het historische genootschap. 1 (1877) |",...,120875977,4.210.210 Franeker |,krt-1612-frw-fra,http://cdm21053.contentdm.oclc.org/cdm/ref/collection/Kaarten/id/878,,2012-10-24,2012-10-24,878,6479.cpd,/Kaarten/image/6479.cpd
28,,uklu 01-04-28,"Groninga opulenta populosa, et valide contra hostiles insultus munita Phrisie vrbs Ptolemaeo Phileum, constracta et denominata a Grunno Anthenoris Regis Francorum fratre, anno ante incarnationem Christi, CCCLXXVII. vt Humbaldes schribit",[Amsterdam : Willem Jansz],"Guicciardijn, Lowijs",,1612,,krt-1612-grs kb krt-1612-grs-gope-34,Met wapen van Groningen linksboven; genummerd rechtsonder: 34,...,151886180,4.210.230 Groningen (stad) |,BACKER PORT 09 NO03 | ; krt-1612-grs,http://cdm21053.contentdm.oclc.org/cdm/ref/collection/Kaarten/id/880,,2012-10-24,2021-07-14,880,6481.cpd,/Kaarten/image/6481.cpd
29,,uklu 01-04-54,Frisiae occidentalis typus,[S.l. : s.n.],,,1612,,krt-1612-nne kb-1612-nne-fot-31,Genummerd r.o. : 31,...,108746402,4.210.210 Friesland | 4.210.230 Groningen (provincie) |,BACKER PORT 11 NO10 | ; krt-1612-nne,http://cdm21053.contentdm.oclc.org/cdm/ref/collection/Kaarten/id/882,,2012-10-24,2021-07-14,882,6483.cpd,/Kaarten/image/6483.cpd
36,,uklu 01-06-05,Groninga Dominium,[Amstelodami] : P. Kaerius,"Kaerius, P.",,1617,,krt-1617-grp krt-1617-grp-v kb-1617-grp-gron,,...,123487463,4.210.230 Groningen (provincie) |,6241/1972 | 18939/1990 | ; krt-1617-grp,http://cdm21053.contentdm.oclc.org/cdm/ref/collection/Kaarten/id/900,,2012-10-24,2012-10-24,900,6501.cpd,/Kaarten/image/6501.cpd
