Fixes Cheat Sheet

Oliver Schihin edited this page May 4, 2018 · 53 revisions

This cheat sheet summarizes the fix language. For more on the marc_* methods, see MARC mapping rules.

# Fixes clean your data. As input you get a Perl HASH. Each fix function is a command
# to transform the Perl HASH. Some fixes such as marc_map contain logic to transform
# complex data structures such as MARC.
set_field("my.name","patrick")             # { my => { name => 'patrick'} }
add_field("my.name2","nicolas")
move_field("my.name","your.name")
copy_field("your.name","your.name2")
remove_field("your.name")
# Replace all dots with underscores in all the field names in 'foo'
rename(foo,"\.","_")

set_array("foo")                           # Create an empty array foo => []
set_array("foo","a","b","c")               # Create an array with three values foo => ['a','b','c']
set_hash("foo")                            # Create an empty hash foo => {}
set_hash("foo",a: b,c: d)                  # Create a hash with two values foo => { a => 'b' , c => 'd' }

array("foo")                               # Create an array from a hash :
                                           # foo => {"name":"value"} => [ "name" , "value" ]
hash("foo")                                # Create a hash from an array
                                           # foo => [ "name" , "value" ] => {"name":"value"}

assoc(fields, pairs.*.key, pairs.*.val)    # Associate two values as a hash key and value
                                           # {pairs => [{key => 'year', val => 2009}, {key => 'subject', val => 'Perl'}]}
                                           # {fields => {subject => 'Perl', year => 2009}, pairs => [...]}

upcase("title")                            # marc -> MARC
downcase("title")                          # MARC -> marc
capitalize("my.deeply.nested.field.0")     # marc -> Marc
trim("field_with_spaces")                  # "  marc  " -> marc
substring("title",0,1)                     # marc -> m
prepend("title","die ")                    # marc -> die marc
append("title"," must die")                # marc -> marc must die

# {author => "tom jones"}  -> {author => "senoj mot"}
reverse(author)
 
# {numbers => [1,14,2]} -> {numbers => [2,14,1]}
reverse(numbers)

# replace the value with a formatted (sprintf-like) version
# e.g. numbers: 
#         - 41
#         - 15
format(numbers,"%-10.10d %-5.5d") # numbers => "0000000041 00015"
# e.g. hash:
#        name: Albert
format(hash,"%-10s: %s") # hash: "name      : Albert"

# date: "2015-03-07"
parse_text(date, '(\d\d\d\d)-(\d\d)-(\d\d)')
# date: 
#    - 2015
#    - 03
#    - 07

#  parses a text into an array or hash of values
# date: "2015-03-07"
parse_text(date, '(\d\d\d\d)-(\d\d)-(\d\d)')
# date: 
#    - 2015
#    - 03
#    - 07 
 
# If your data record is:
#   a: eeny
#   b: meeny
#   c: miny
#   d: moe
paste(my.string,a,b,c,d)                 # my.string: eeny meeny miny moe
 
# Use a join character
paste(my.string,a,b,c,d,join_char:", ")  # my.string: eeny, meeny, miny, moe
 
# Paste literal strings with a tilde sign
paste(my.string,~Hi,a,~how are you?)     # my.string: Hi eeny how are you?

# date: "2015-03-07"
parse_text(date, '(?<year>\d\d\d\d)-(?<month>\d\d)-(?<day>\d\d)')
# date:
#   year: "2015"
#   month: "03" 
#   day: "07"
 
# date: "abcd"
parse_text(date, '(\d\d\d\d)-(\d\d)-(\d\d)')
# date: "abcd"

lookup("title","dict.csv", sep_char:'|')  # lookup 'marc' in dict.csv and replace the value
lookup("title","dict.csv", default:test)  # lookup 'marc' in dict.csv and replace the value or set it to 'test'
lookup("title","dict.csv", delete:1)    # lookup 'marc' in dict.csv and replace the value or delete the field if nothing is found

lookup_in_store('title', 'MongoDB', database_name:lookups)  # lookup the (id)-value of title in 'lookups' and
                                           # replace it with the data found
lookup_in_store('title', 'MongoDB', default:'default value' , delete:1) 

# Query a Solr index with the query stored in the 'query' field and overwrite it with all the results
search_in_store('query','Solr',url:"http://localhost:8983/solr",limit:10)

# Replace the data in foo.bar with an external file or url
import(foo.bar, JSON, file: "http://foo.com/bar.json", data_path: data.*)

add_to_store('authors.*', 'MongoDB', bag:authors, database_name:catalog)  # add matching values to a store as a side effect

add_to_exporter(data,CSV,header:1,file:/tmp/data.csv) # send the 'data' path to an alternative exporter
add_to_exporter(.,CSV,header:1,file:/tmp/data.csv)    # send the complete record to an alternative exporter

count("myarray")                           # count number of elements in an array or hash
sum("numbers")                             # replace an array with the sum of its values
sort_field("tags")                         # sort the values of an array
sort_field("tags", uniq:1)                 # sort the values plus keep unique values
sort_field("tags", reverse:1)              # reverse sort
sort_field("tags", numeric:1)              # sort numerical values
uniq(tags)                                 # strip duplicate values from an array
filter("tags","[Cc]at")                    # filter array values tags = ["Cats","Dogs"] => ["Cats"]
flatten(deep)                              # {deep => [1, [2, 3], 4, [5, [6, 7]]]} => {deep => [1, 2, 3, 4, 5, 6, 7]}

cmd("java MyClass")                        # Use an external program that can read JSON 
                                           # from stdin and write JSON to stdout
perlcode("myscript.pl")                    # Execute Perl code as fix function
sleep(1,SECOND)                            # Do nothing for one second

split_field("foo",":")                     # marc:must:die -> ['marc','must','die']
join_field("foo",":")                      # ['marc','must','die'] -> marc:must:die
retain("id","id2","id3")                   # delete any field except 'id', 'id2', 'id3'
replace_all("title","a","x")               # marc -> mxrc

# Most functions can also work on arrays. E.g.
replace_all("author.*","a","x")            # [ 'marc','jan'] => ['mxrc','jxn']
# Use:
#   authors.$end (last entry)
#   authors.$start (first entry)
#   authors.$append (last + 1)
#   authors.$prepend (first - 1)
#   authors.* (all authors)
#   authors.2 (3rd author)

collapse()                                 # collapse deep nested hash to a flat hash
expand()                                   # expand flat hash to deep nested hash
clone()                                    # clone the perl hash and work on the clone
reject()                                   # Reject (skip) a record
reject [condition]                         # Reject a record on some condition:
                                           #   reject all_match(...)
                                           #   reject any_match(...)
                                           #   reject exists(...)
select()                                   # Select a record
select [condition]                         # Select only those records that match a condition (see reject)

to_json('my.field')                        # convert a value of a field to json
from_json('my.field')                      # replace the json field with the parsed value

export_to_string('my.field',CSV,sep_char:";")   # convert the value of a field into CSV
import_from_string('my.field',CSV,sep_char:";") # replace a CSV field with the parsed value

error("eek!")                              # abort the processing and say "eek!"
nothing()                                  # do nothing (used in benchmarking)

# Include fixes from another file
include('/path/to/myfixes.txt')

# Send debug messages to a logger
log('test123')
log('hello world' , level: 'DEBUG')

# Boolean AND and OR, need a Condition + 'and'/'or' + a Fix 
exists(foo) and log('foo exists' , level: INFO)
exists(foo) or log('foo doesnt exist' , level: INFO)
valid('', JSONSchema, schema: "my/schema.json") or log('this record is wrong', level: ERROR)

# 'caf%C3%A9' => 'café'
uri_decode(place)
# 'café' => 'caf%C3%A9'
uri_encode(place)

# Add a new field 'foo' with a random value between 0 and 9
random(foo, 10)

# Delete all the empty fields
vacuum()

# Copy all 245 subfields into the my.title hash
marc_map('245','my.title') 
# Copy the 245-$a$b$c subfields into the my.title hash in the order of the record
marc_map('245abc','my.title') 
# Copy the 245-$c$b$a subfields into the my.title hash in the order of the mapping
marc_map('245cba','my.title' , pluck:1) 
# Copy the 100 subfields into the my.authors array
marc_map('100','my.authors.$append') 
# Add the 710 subfields into the my.authors array
marc_map('710','my.authors.$append')
# Copy the 600-$x subfields into the my.subjects array while packing each into a genre.text hash
marc_map('600x','my.subjects.$append.genre.text')
# Copy the 008 characters 35-35 into the my.language hash
marc_map('008_/35-35','my.language')
# Copy all the 600 fields into a my.stringy hash joining them by '; '
marc_map('600','my.stringy', join:'; ')
# When 024 field exists create the my.has024 hash with value 'found'
marc_map('024','my.has024', value:found)
# Do the same examples now with the marc fields in 'record2'
marc_map('245','my.title', record:record2)
# Remove the 900 fields
marc_remove('900')
# Add a marc field (in Catmandu::MARC 0.110)
marc_add('999', ind1, ' ' , ind2, '1' , a, 'test123')
# Add a marc field populated with data from your record
marc_add('245', a , $.my.title.field, c , $.my.author.field)
# Set a marc value of one (sub)field to a new value
marc_set('LDR/6','p')
marc_set('650p','test')
marc_set('100[3]a','Farquhar family.')

# Map all 650 subjects into an array 
marc_map('650','subject', join:'###') 
split_field('subject','###')

# uppercase the value of field 'foo' if all members of 'oogly' have the value 'doogly'
if all_match('oogly.*', 'doogly')
  upcase('foo') # foo => 'BAR'
else
  downcase('foo') # foo => 'bar'
end

# inverted
unless all_match('oogly.*', 'doogly')
  upcase('foo') # foo => 'BAR'
end;

# uppercase the value of field 'foo' if field 'oogly' has the value 'doogly'
if any_match('oogly', 'doogly')
  upcase('foo') # foo => 'BAR'
end

# inverted
unless any_match('oogly', 'doogly')
  upcase('foo') # foo => 'BAR'
end

# uppercase the value of field 'foo' if the field 'oogly' exists
if exists('oogly')
  upcase('foo') # foo => 'BAR'
end

# inverted
unless exists('oogly')
  upcase('foo') # foo => 'BAR'
end

# add a new field when the 'year' field is equal to 2018
if all_equal('year','2018')
 add_field('my.funny.title','true')
end

# add a new field when at least one of the 'year'-s is equal to 2018
if any_equal('years.*','2018')
 add_field('my.funny.title','true')
end

# compare things (needs Catmandu 0.92 or better)
if greater_than('year',2000)
  add_field('recent','yes')
end

if less_than('year',1970)
  add_field('ancient','yes')
end

# execute fixes if one path is contained in another
# foo => 1 , bar => [3,2,1]  => in(foo,bar) -> true
if in(foo,bar)
   add_field(test,ok)
end

# only execute fixes if all path values are the boolean true, 1 or "true"
if is_true(data.*.has_error)
  add_field(error,yes)
end

# only execute fixes if all path values are the boolean false, 0 or "false"
if is_false(data.*.has_error)
  add_field(error,no)
end

# only execute the fixes if the path contains an array
if is_array(data)
  upcase(data.0)
end

# only execute the fixes if the path contains an object (a hash, nested field)
if is_object(data)
  add_field(data.ok,yes)
end

# only execute the fixes if the path contains a number
if is_number(data)
  append(data," : is a number")
end

# only execute the fixes if the path contains a string
if is_string(data)
  append(data," : is a string")
end

# only execute the fixes if the path contains 'null' values
if is_null(data)
  set_field(data,"I'm empty!")
end

# Evaluates to true when all of the marc (sub)fields match a regular expression
if marc_all_match('245','My funny title')
  add_field('funny.title','yes')
end
if marc_all_match('LDR/6','c')
  marc_set('LDR/6','p')
end

# Evaluates to true when at least one of the marc (sub)fields match a regular expression
if marc_any_match('650','catmandu')
  add_field('important.books','yes')
end


# Evaluates to true when the JSON fragment is valid against a JSON Schema
if valid(data,JSONSchema,schema:myschema.json)
   ...
end

## Binds (needs Catmandu 0.92 or better)

# The identity binder doesn't embody any computational strategy. It simply 
# applies the bound fix functions sequentially to its input without any 
# modification.
do identity()
  add_field(foo,bar)
  add_field(foo2,bar2)
end

# Maybe, computes all the fix functions and ignores fixes once they throw errors
# or return undef.
do maybe()
  foo()
  return_undef() # rest will be ignored
  bar()
end

# List over all items in demo and add a foo => bar field
# { demo => [{},{},{}] } => { demo => [{foo=>bar},{foo=>bar},{foo=>bar}]}
do list(path: demo)
  add_field(foo,bar)
end

# Print statistical information on the processing speed of fixes to the standard error.
do benchmark(output:/dev/stderr)
  foo()
end

# Find all ISBN in a stream
do hashmap(exporter: JSON, join:',')
  # Need an identity binder to group all operations that calculate key_value pairs
  do identity()
   copy_field(isbn,key)
   copy_field(_id,value)
  end
end

# Count the number of ISBN occurrences in a stream
do hashmap(count: 1)
  copy_field(isbn,key)
end

# Filter out an array (needs Catmandu 0.9302 or better)
#    data:
#       - name: patrick
#       - name: nicolas
# to:
#    data:
#       - name: patrick
do with(path:data)
  reject all_match(name,nicolas)
  # Or:
  # if all_match(name,nicolas)
  #  reject()
  # end
end

#  run fixes that should run within a time limit
do timeout(time => 5, units => seconds)
  ...
end

# a binder that computes Fix-es for every element in record
do visitor()
   # upcase all the 'name' fields in the record
   if all_match(key,name)
     upcase(scalar)
   end
end

# a binder that runs fixes on records from an importer
do importer(OAI,url: "http://lib.ugent.be/oai") 
  retain(_id)
  add_to_exporter(.,YAML)
end
You can’t perform that action at this time.
You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session.
Press h to open a hovercard with more details.