Extracts map metadata from the MARC XML held in a CSV export and converts it to CSV

In [2]:
import csv 
import pandas as pd
import re
import io
import xml.etree.ElementTree as ET

In [None]:
# cleans up extraneous quotation marks in xml

def cleancsvxml(xmlstr):
    """Return *xmlstr* with its CSV quoting and escaped line breaks undone.

    Doubled quotation marks are collapsed to single ones, the first and last
    characters of the result are dropped, and the two-character sequences
    backslash-r / backslash-n are turned into real carriage-return / newline
    characters.
    """
    # Collapse doubled quotes first, then drop the outermost character on
    # each side in a single slice.
    unwrapped = xmlstr.replace('""', '"')[1:-1]
    for escaped, literal in (("\\r", "\r"), ("\\n", "\n")):
        unwrapped = unwrapped.replace(escaped, literal)
    return unwrapped

In [None]:
# takes a chunk of xml from the gdas export and returns a dataframe with the metadata

# Output columns, in the order they appear in the returned dataframe.
_METADATA_COLUMNS = [
    'Author', 'Cartographic_genre', 'Publisher', 'Publishing_Date', 'Title',
    'Bounding_Box_W', 'Bounding_Box_E', 'Bounding_Box_N', 'Bounding_Box_S',
    'Bounding_Box_255', 'Cartographic_genre_2', 'epsgCode', 'Extent',
    'File_Format', 'ISBN', 'Place_of_publication', 'Representation',
    'Representation_style', 'Scale', 'File name',
]

# Datafield tags whose first subfield is copied straight into one column.
# 245 and 246 share the 'Title' column, so whichever comes later in the
# record wins.  NOTE(review): confirm that a 246 overriding a 245 is intended.
_FIRST_SUBFIELD_COLUMNS = {
    '110': 'Author',
    '245': 'Title',
    '246': 'Title',
    '300': 'Extent',
}

# Datafield tags whose subfields are routed to columns by subfield code.
# Subfield codes that are not listed here are ignored.
_SUBFIELD_COLUMNS = {
    '034': {
        'd': 'Bounding_Box_W',
        'e': 'Bounding_Box_E',
        'f': 'Bounding_Box_N',
        'g': 'Bounding_Box_S',
    },
    '255': {
        'a': 'Scale',
        'c': 'Bounding_Box_255',
    },
    '264': {
        'a': 'Place_of_publication',
        'b': 'Publisher',
        'c': 'Publishing_Date',
    },
    '352': {
        'a': 'Representation',
        'b': 'Representation_style',
        'c': 'File_Format',
    },
}

_MARC_DATAFIELD = '{http://www.loc.gov/MARC21/slim}datafield'


def getdataframe(marcx, dataset_name=None):
    """Build a one-row dataframe of map metadata from a MARC XML string.

    Args:
        marcx: The raw ``marcXml`` cell value; it is passed through
            ``cleancsvxml`` before being parsed.
        dataset_name: Value for the 'File name' column.  When omitted, the
            module-level ``dname`` variable is used, which is how existing
            callers supply it.

    Returns:
        A ``pandas.DataFrame`` with exactly one row and the columns listed in
        ``_METADATA_COLUMNS``.  Fields that are absent from the record are
        left as empty strings; 'epsgCode' and 'ISBN' are always a single
        space.

    Raises:
        xml.etree.ElementTree.ParseError: If the cleaned string is not
            well-formed XML.
        NameError: If ``dataset_name`` is omitted and no module-level
            ``dname`` exists.
    """
    if dataset_name is None:
        # Backward compatibility: callers that pass only `marcx` set the
        # module-level `dname` before calling.
        dataset_name = dname

    root = ET.fromstring(cleancsvxml(marcx))

    # Start every column at its default so a record that lacks a field still
    # produces a complete row instead of failing on an unassigned variable.
    record = dict.fromkeys(_METADATA_COLUMNS, "")
    record['epsgCode'] = " "
    record['ISBN'] = " "
    record['File name'] = dataset_name

    for datafield in root.iter(_MARC_DATAFIELD):
        tag = datafield.attrib.get("tag")
        if tag in _FIRST_SUBFIELD_COLUMNS:
            # Skip datafields with no subfields rather than indexing into
            # an empty element.
            if len(datafield):
                record[_FIRST_SUBFIELD_COLUMNS[tag]] = datafield[0].text
        elif tag in _SUBFIELD_COLUMNS:
            targets = _SUBFIELD_COLUMNS[tag]
            for subfield in datafield:
                column = targets.get(subfield.attrib.get("code"))
                if column is not None:
                    record[column] = subfield.text
        elif tag == '655':
            if not len(datafield):
                continue
            genre = datafield[0].text
            # The first 655 fills the primary genre column; every later one
            # overwrites the secondary genre column.
            if record['Cartographic_genre'] == "":
                record['Cartographic_genre'] = genre
            else:
                record['Cartographic_genre_2'] = genre

    return pd.DataFrame([record], columns=_METADATA_COLUMNS)

In [None]:
# opens the spreadsheet, pulls out the data and writes it to a csv

# Column layout of the output; used for the header-only frame when the
# spreadsheet has no data rows.
metadata_columns = ['Author','Cartographic_genre','Publisher','Publishing_Date','Title','Bounding_Box_W','Bounding_Box_E','Bounding_Box_N','Bounding_Box_S','Bounding_Box_255','Cartographic_genre_2','epsgCode','Extent','File_Format','ISBN','Place_of_publication','Representation','Representation_style','Scale','File name']

# Collect the per-row frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies everything accumulated so far on each
# iteration.
frames = []
with open('metadata_all.csv', newline='',mode='r',encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # getdataframe reads the module-level `dname` for the 'File name'
        # column, so it must be set before the call.
        dname = row['datasetName']
        marcx = row['marcXml']
        df2 = getdataframe(marcx)
        frames.append(df2)

if frames:
    metadf = pd.concat(frames, ignore_index=True)
else:
    # pd.concat rejects an empty list; still emit a header-only csv.
    metadf = pd.DataFrame(columns=metadata_columns)
metadf.to_csv("csvxmltodata.csv")

In [None]:
# prints every datafield/subfield held in the xml for one file (Eire Thuaidh) to check for additional data held

with open('metadata_all.csv', mode='r', encoding='utf-8', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Only the named snapshot is of interest; skip every other row.
        if row["snapshotDisplayName"] != "Eire Thuaidh - Irish Translated Map":
            continue
        eire = row
        mrx = row["marcXml"]
        marx = cleancsvxml(mrx)
        root = ET.fromstring(marx)
        for dataf in root.iter('{http://www.loc.gov/MARC21/slim}datafield'):
            for subf in dataf:
                # One line per subfield: datafield attributes, subfield
                # attributes, subfield text.
                print(dataf.attrib, subf.attrib, subf.text)

In [None]:
# prints the subfields of every 034 datafield in every record to check for additional data held
with open('metadata_all.csv', mode='r', encoding='utf-8', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        mrx = row["marcXml"]
        marx = cleancsvxml(mrx)
        root = ET.fromstring(marx)
        for sub in root.iter('{http://www.loc.gov/MARC21/slim}datafield'):
            # Only 034 datafields are inspected here.
            if sub.attrib["tag"] != '034':
                continue
            for subf in sub:
                print(subf.tag, subf.attrib, subf.text)