### Extraction of data from MARCXML

This dataset contains the descriptive metadata from the [Moving Image Archive catalogue](https://data.nls.uk/data/metadata-collections/moving-image-archive/). The Moving Image Archive is Scotland’s national collection of moving images.

In [1]:
import pymarc, re, csv
import pandas as pd
from pymarc import parse_xml_to_array

## Extraction of the data to a CSV

In [23]:
# Convert the Moving Image Archive MARCXML export into a flat CSV file with
# one row per MARC record.
#
# Fields and subfields are always read through get_fields()/get_subfields(),
# which return an empty list when nothing matches. That keeps the cell working
# whether record['tag'] / field['code'] return None or raise KeyError for a
# missing entry, and it guarantees that every CSV cell is a plain string.

# Separator used whenever several values are packed into a single CSV cell.
_SEPARATOR = ' -- '


def _first_field(record, tag):
    """Return the first field with *tag* in *record*, or None if absent."""
    fields = record.get_fields(tag)
    return fields[0] if fields else None


def _first_subfield(field, code, default=''):
    """Return the first value of subfield *code* in *field*, or *default*.

    *field* may be None (the tag is absent from the record); *default* is
    returned in that case as well.
    """
    if field is None:
        return default
    values = field.get_subfields(code)
    if not values or values[0] is None:
        return default
    return values[0]


def _term_with_source(field):
    """Return '<$a> -- <$2>' for a 336/337/338 field.

    A missing subfield is skipped instead of raising IndexError.
    """
    parts = (_first_subfield(field, 'a'), _first_subfield(field, '2'))
    return _SEPARATOR.join(part for part in parts if part)


# Parse the input inside a context manager so the file handle is closed.
# Binary mode lets the XML parser honour the encoding declared in the file.
with open('../data/movingImageArchive/Moving-Image-Archive-dataset-MARC.xml', 'rb') as xml_file:
    records = parse_xml_to_array(xml_file)

# newline='' is what the csv module expects (otherwise blank lines can appear
# on Windows); an explicit encoding avoids platform-dependent failures on
# non-ASCII characters in the metadata.
with open('../data/output/movingImageArchive.csv', 'w', newline='', encoding='utf-8') as csv_fichero:
    csv_output = csv.writer(csv_fichero, delimiter = ',', quotechar = '"', quoting = csv.QUOTE_MINIMAL)
    csv_output.writerow(['title', 'author', 'authorOrganisation', 'place_publication', 'date', 'extent', 'credits', 'subjects',
                         'summary', 'details', 'link', 'geographicNames',
                         'contentType', 'mediaType', 'carrierType', 'generalNote', 'thumbnail'])

    for record in records:

        # Every column defaults to an empty string so a missing field yields
        # an empty CSV cell.
        date = extent = details = contentType = mediaType = carrierType = generalNote = ''

        # title: 245 $a, followed by $b when present
        title_field = _first_field(record, '245')
        title = _first_subfield(title_field, 'a')
        title_remainder = _first_subfield(title_field, 'b')
        if title_remainder:
            title = title + " " + title_remainder
        title = title.strip()

        # place of publication: $a of the first 264 field
        place_publication = _first_subfield(_first_field(record, '264'), 'a')

        # date: $c of the last 264 field that has one, without a trailing dot
        for f in record.get_fields('264'):
            dates = f.get_subfields('c')
            if dates:
                date = dates[0]
                if date.endswith('.'):
                    date = date[:-1]

        # Physical Description: extent (300 $a) and other details (300 $b).
        # A field lacking the subfield leaves the previous value untouched
        # instead of replacing it with an empty list.
        for f in record.get_fields('300'):
            extents = f.get_subfields('a')
            if extents:
                extent = extents[0]
                # TODO cleaning
            other_details = f.get_subfields('b')
            if other_details:
                details = other_details[0]

        # Content Type (336), Media Type (337), Carrier Type (338):
        # the last occurrence of each field wins.
        for f in record.get_fields('336'):
            contentType = _term_with_source(f)
        for f in record.get_fields('337'):
            mediaType = _term_with_source(f)
        for f in record.get_fields('338'):
            carrierType = _term_with_source(f)

        # General Note: $a values of the 500 field, joined into one string so
        # the CSV cell holds text rather than the repr of a Python list.
        # NOTE(review): only the last 500 field is kept, as before; confirm
        # whether all general notes of a record should be exported.
        for f in record.get_fields('500'):
            generalNote = _SEPARATOR.join(f.get_subfields('a'))

        # credits: first $a of every 508 field, joined, on a single line
        credit_parts = []
        for f in record.get_fields('508'):
            credit = _first_subfield(f, 'a', None)
            if credit is not None:
                credit_parts.append(credit.strip())
        credits = _SEPARATOR.join(credit_parts).replace("\n", " ")

        # summary: $a of the first 520 field
        summary = _first_subfield(_first_field(record, '520'), 'a').strip()

        # subjects: 653 fields are split on the second indicator,
        # '0' goes to subjects and '5' to geographic names
        subject_terms = []
        geographic_terms = []
        for f in record.get_fields('653'):
            term = _first_subfield(f, 'a', None)
            if term is None:
                continue
            if f.indicator2 == '0':
                subject_terms.append(term)
            elif f.indicator2 == '5':
                geographic_terms.append(term)
        subjects = _SEPARATOR.join(subject_terms)
        geographicNames = _SEPARATOR.join(geographic_terms)

        # author
        # Added Entry-Personal Name (700) and Corporate Name (710)
        author = _first_subfield(_first_field(record, '700'), 'a').strip()
        authorOrganisation = _first_subfield(_first_field(record, '710'), 'a').strip()

        # link: $u of the first 856 field
        link = _first_subfield(_first_field(record, '856'), 'u')

        # thumbnail: $u of the first 859 field
        thumbnail = _first_subfield(_first_field(record, '859'), 'u')

        csv_output.writerow([title, author, authorOrganisation, place_publication, date, extent, credits,
                             subjects, summary, details, link, geographicNames,
                             contentType, mediaType, carrierType, generalNote, thumbnail])

## References

- https://pymarc.readthedocs.io/en/latest/#api-docs
- https://www.loc.gov/marc/bibliographic/