Fixes Cheat Sheet

Oliver Schihin edited this page May 4, 2018 · 53 revisions

This cheat sheet summarizes the fix language. For more on the marc_* methods, see MARC mapping rules.

# Fixes clean your data. As input you get a Perl HASH. Each fix function is a command
# to transform the Perl HASH. Some fixes such as marc_map contain logic to transform
# complex data structures such as MARC.
set_field("my.name","patrick")             # { my => { name => 'Patrick'} }
add_field("my.name2","nicolas")
move_field("my.name","your.name")
copy_field("your.name","your.name2")
remove_field("your.name")
# Replace in all the field names in 'foo' all dots into underscores
rename(foo,"\.","_")

set_array("foo")                           # Create an empty array foo => []
set_array("foo","a","b","c")               # Create an array with three values foo => ['a','b','c']
set_hash("foo")                            # Create an empty hash foo => {}
set_hash("foo",a: b,c: d)                  # Create an hash with two values foo => { a => 'b' , c => 'd' }

array("foo")                               # Create an array from a hash :
                                           # foo => {"name":"value"} => [ "name" , "value" ]
hash("foo")                                # Create a hash from an array
                                           # foo => [ "name" , "value" ] => {"name":"value"}

assoc(fields, pairs.*.key, pairs.*.val)    # Associate two values as a hash key and value
                                           # {pairs => [{key => 'year', val => 2009}, {key => 'subject', val => 'Perl'}]}
                                           # {fields => {subject => 'Perl', year => 2009}, pairs => [...]}

upcase("title")                            # marc -> MARC
downcase("title")                          # MARC -> marc
capitalize("my.deeply.nested.field.0")     # marc -> Marc
trim("field_with_spaces")                  # "  marc  " -> marc
substring("title",0,1)                     # marc -> m
prepend("title","die ")                    # marc -> die marc
append("title"," must die")                # marc -> marc must die

# {author => "tom jones"}  -> {author => "senoj mot"}
reverse(author)
 
# {numbers => [1,14,2]} -> {numbers => [2,14,1]}
reverse(numbers)

# replace the value with a formatted (sprintf-like) version
# e.g. numbers: 
#         - 41
#         - 15
format(number,"%-10.10d %-5.5d") # numbers => "0000000041 00015"
# e.g. hash:
#        name: Albert
format(name,"%-10s: %s") # hash: "name      : Albert"

# date: "2015-03-07"
parse_text(date, '(\d\d\d\d)-(\d\d)-(\d\d)')
# date: 
#    - 2015
#    - 03
#    - 07

#  parses a text into an array or hash of values
# date: "2015-03-07"
parse_text(date, '(\d\d\d\d)-(\d\d)-(\d\d)')
# date: 
#    - 2015
#    - 03
#    - 07 
 
# If you data record is:
#   a: eeny
#   b: meeny
#   c: miny
#   d: moe
paste(my.string,a,b,c,d)                 # my.string: eeny meeny miny moe
 
# Use a join character
paste(my.string,a,b,c,d,join_char:", ")  # my.string: eeny, meeny, miny, moe
 
# Paste literal strings with a tilde sign
paste(my.string,~Hi,a,~how are you?)     # my.string: Hi eeny how are you?

# date: "2015-03-07"
parse_text(date, '(?<year>\d\d\d\d)-(?<month>\d\d)-(?<day>\d\d)')
# date:
#   year: "2015"
#   month: "03" 
#   day: "07"
 
# date: "abcd"
parse_text(date, '(\d\d\d\d)-(\d\d)-(\d\d)')
# date: "abcd"

lookup("title","dict.csv", sep_char:'|')  # lookup 'marc' in dict.csv and replace the value
lookup("title","dict.csv", default:test)  # lookup 'marc' in dict.csv and replace the value or set it to 'test'
lookup("title","dict.csv", delete:1)    # lookup 'marc' in dict.csv and replace the value or delete nothing found

lookup_in_store('title', 'MongoDB', database_name:lookups)  # lookup the (id)-value of title in 'lookups' and
                                           # replace it with the data found
lookup_in_store('title', 'MongoDB', default:'default value' , delete:1) 

# Query a Solr index with the query stored in the 'query' field and overwrite it with all the results
search_in_store('query','Solr',url:"http://localhost:8983/solr",limit:10)

# Replace the data in foo.bar with an external file or url
import(foo.bar, JSON, file: "http://foo.com/bar.json", data_path: data.*)

add_to_store('authors.*', 'MongoDB', bag:authors, database_name:catalog)  # add matching values to a store as a side effect

add_to_exporter(data,CSV,header:1,file:/tmp/data.csv) # send the 'data' path to an alternative exporter
add_to_exporter(.,CSV,header:1,file:/tmp/data.csv)    # send the complete record to an alternative exporter

count("myarray")                           # count number of elements in an array or hash
sum("numbers")                             # replace an array element with the sum of its values
sort_field("tags")                         # sort the values of an array
sort_field("tags", uniq:1)                 # sort the values plus keep unique values
sort_field("tags", reverse:1)              # revese sort
sort_field("tags", numeric:1)              # sort numerical values
uniq(tags)                                 # strip duplicate values from an array
filter("tags","[Cc]at")                    # filter array values tags = ["Cats","Dogs"] => ["Cats"]
flatten(deep)                              # {deep => [1, [2, 3], 4, [5, [6, 7]]]} => {deep => [1, 2, 3, 4, 5, 6, 7]}

cmd("java MyClass")                        # Use an external program that can read JSON 
                                           # from stdin and write JSON to stdout
perlcode("myscript.pl")                    # Execute Perl code as fix function
sleep(1,SECOND)                            # Do nothing for one second

split_field("foo",":")                     # marc:must:die -> ['marc','must','die']
join_field("foo",":")                      # ['marc','must','die'] -> marc:must:die
retain("id","id2","id3")                   # delete any field except 'id', 'id2', 'id3'
replace_all("title","a","x")               # marc -> mxrc

# Most functions can work also work on arrays. E.g.
replace_all("author.*","a","x")            # [ 'marc','jan'] => ['mxrc','jxn']
# Use:
#   authors.$end (last entry)
#   authors.$start (first entry)
#   authors.$append (last + 1)
#   authors.$prepend (first - 1)
#   authors.* (all authors)
#   authors.2 (3rd author)

collapse()                                 # collapse deep nested hash to a flat hash
expand()                                   # expand flat hash to deep nested hash
clone()                                    # clone the perl hash and work on the clone
reject()                                   # Reject (skip) a record
reject [condition]                         # Reject a record on some condition:
                                           #   reject all_match(...)
                                           #   reject any_match(...)
                                           #   reject exists(...)
select()                                   # Select a record
select [condition]                         # Select only those records that match a condition (see reject)

to_json('my.field')                        # convert a value of a field to json
from_json('my.field')                      # replace the json field with the parsed value

export_to_string('my.field',CSV,sep_char:";")   # convert the value of a field into CSV
import_from_string('my.field',CSV,sep_char:";") # replace a CSV field with the parsed value

error("eek!")                              # abort the processing and say "eek!"
nothing()                                  # do nothing (used in benchmarking)

# Include fixes from another file
include('/path/to/myfixes.txt')

# Send debug messages to a logger
log('test123')
log('hello world' , level: 'DEBUG')

# Boolean AND and OR, need a Condition + 'and'/'or' + a Fix 
exists(foo) and log('foo exists' , level: INFO)
exists(foo) or log('foo doesnt exist' , level: INFO)
valid('', JSONSchema, schema: "my/schema.json") or log('this record is wrong', level: ERROR)

# '3%A9' => 'café'
uri_decode(place)
# 'café' => '3%A9'
uri_encode(place)

# Add a new field 'foo' with a random value between 0 and 9
random(foo, 10)

# Delete all the empty fields
vacuum()

# Copy all 245 subfields into the my.title hash
marc_map('245','my.title') 
# Copy the 245-$a$b$c subfields into the my.title hash in the order of the record
marc_map('245abc','my.title') 
# Copy the 245-$c$b$a subfields into the my.title hash in the order of the mapping
marc_map('245cba','my.title' , pluck:1) 
# Copy the 100 subfields into the my.authors array
marc_map('100','my.authors.$append') 
# Add the 710 subfields into the my.authors array
marc_map('710','my.authors.$append')
# Copy the 600-$x subfields into the my.subjects array while packing each into a genre.text hash
marc_map('600x','my.subjects.$append.genre.text')
# Copy the 008 characters 35-35 into the my.language hash
marc_map('008_/35-35','my.language')
# Copy all the 600 fields into a my.stringy hash joining them by '; '
marc_map('600','my.stringy', join:'; ')
# When 024 field exists create the my.has024 hash with value 'found'
marc_map('024','my.has024', value:found)
# Do the same examples now with the marc fields in 'record2'
marc_map('245','my.title', record:record2)
# Remove the 900 fields
marc_remove('900')
# Add a marc field (in Catmandu::MARC 0.110)
marc_add('999', ind1, ' ' , ind2, '1' , a, 'test123')
# Add a marc field populated with data from your record
marc_add('245', a , $.my.title.field, c , $.my.author.field)
# Set a marc value of one (sub)field to a new value
marc_set('LDR/6','p')
marc_set('650p','test')
marc_set('100[3]a','Farquhar family.')

# Map all 650 subjects into an array 
marc_map('650','subject', join:'###') 
split_field('subject','###')

# uppercase the value of field 'foo' if all members of 'oogly' have the value 'doogly'
if all_match('oogly.*', 'doogly')
  upcase('foo') # foo => 'BAR'
else
  downcase('foo') # foo => 'bar'
end

# inverted
unless all_match('oogly.*', 'doogly')
  upcase('foo') # foo => 'BAR'
end;

# uppercase the value of field 'foo' if field 'oogly' has the value 'doogly'
if any_match('oogly', 'doogly')
  upcase('foo') # foo => 'BAR'
end

# inverted
unless any_match('oogly', 'doogly')
  upcase('foo') # foo => 'BAR'
end

# uppercase the value of field 'foo' if the field 'oogly' exists
if exists('oogly')
  upcase('foo') # foo => 'BAR'
end

# inverted
unless exists('oogly')
  upcase('foo') # foo => 'bar'
end

# add a new field when the 'year' field is equal to 2018
if all_equal('year','2018')
 add_field('my.funny.title','true')
end

# add a new field when at least one of the 'year'-s is equal to 2018
if any_equal('years.*','2018')
 add_field('my.funny.title','true')
end

# compare things (needs Catmandu 0.92 or better)
if greater_than('year',2000)
  add_field('recent','yes')
end

if less_than('year',1970)
  add_field('ancient','yes')
end

# execute fixes if one path is contained in another
# foo => 1 , bar => [3,2,1]  => in(foo,bar) -> true
if in(foo,bar)
   add_field(test,ok)
end

# only execute fixes if all path values are the boolean true, 1 or "true"
if is_true(data.*.has_error)
  add_field(error,yes)
end

# only execute fixes if all path values are the boolean true, 0 or "false"
if is_false(data.*.has_error)
  add_field(error,no)
end

# only execute the fixes if the path contains an array
if is_array(data)
  upcase(data.0)
end

# only execute the fixes if the path contains an object (an hash, nested field)
if is_object(data)
  add_field(data.ok,yes)
end

# only execute the fixes if the path contains a number
if is_number(data)
  append(data," : is a number")
end

# only execute the fixes if the path contains a string
if is_string(data)
  append(data," : is a string")
end

# only execute the fixes if the path contains 'null' values
if is_null(data)
  set_field(data,"I'm empty!")
end

# Evaluates true when one or all marc (sub)fields match a regular expression
if marc_all_match('245','My funny title')
  add_field('funny.title','yes')
end
if marc_all_match('LDR/6','c')
  marc_set('LDR/6','p')
end

# Evaluates to true when at least one of the marc (sub)fields match a regular expression
if marc_any_match('650','catmandu')
  add_field('important.books','yes')
end


# Evaluates true when the JSON fragment is valid against a JSON Schema
if valid(data,JSONSchema,schema:myschema.json)
   ...
end

## Binds (needs Catmandu 0.92 or better)

# The identity binder doesn't embody any computational strategy. It simply 
# applies the bound fix functions sequentially to its input without any 
# modification.
do identity()
  add_field(foo,bar)
  add_field(foo2,bar2)
end

# Maybe, computes all the fix functions and ignores fixes once they throw errors
# or return undef.
do maybe()
  foo()
  return_undef() # rest will be ignored
  bar()
end

# List over all items in demo and add a foo => bar field
# { demo => [{},{},{}] } => { demo => [{foo=>bar},{foo=>bar},{foo=>bar}]}
do list(path: demo)
  add_field(foo,bar)
end

# Print statistical information on the processing speed of fixes to the standaard error.
do benchmark(output:/dev/stderr)
  foo()
end

# Find all ISBN in a stream
do hashmap(exporter: JSON, join:',')
  # Need an identity binder to group all operations that calculate key_value pairs
  do identity()
   copy_field(isbn,key)
   copy_field(_id,value)
  end
end

# Count the number of ISBN occurrences in a stream
do hashmap(count: 1)
  copy_field(isbn,key)
end

# Filter out an array (needs Catmandu 0.9302 or better)
#    data:
#       - name: patrick
#       - name: nicolas
# to:
#    data:
#       - name: patrick
do with(path:data)
  reject all_match(name,nicolas)
  # Or:
  # if all_match(name,nicolas)
  #  reject()
  # end
end

#  run fixes that should run within a time limit
do timeout(time => 5, units => seconds)
  ...
end

# a binder that computes Fix-es for every element in record
do visitor()
   # upcase all the 'name' fields in the record
   if all_match(key,name)
     upcase(scalar)
   end
end

# a binder runs fixes on records from an importer
do importer(OAI,url: "http://lib.ugent.be/oai") 
  retain(_id)
  add_to_exporter(.,YAML)
end