From 81dcc4e7eaf5d144623ae639eba2c30baef0aa29 Mon Sep 17 00:00:00 2001
From: drewthorp
Date: Thu, 8 Feb 2024 13:33:08 +0000
Subject: [PATCH] Cleaned the cleaning code

---
 lib/ndr_support/string/clean_methodable.rb | 144 +++++++++++++++++++++
 lib/ndr_support/string/cleaning.rb         | 120 ++----------------
 2 files changed, 151 insertions(+), 113 deletions(-)
 create mode 100644 lib/ndr_support/string/clean_methodable.rb

diff --git a/lib/ndr_support/string/clean_methodable.rb b/lib/ndr_support/string/clean_methodable.rb
new file mode 100644
index 0000000..aff66ef
--- /dev/null
+++ b/lib/ndr_support/string/clean_methodable.rb
@@ -0,0 +1,144 @@
+# Adds the 'clean' method to String, which can be used to clean strings in various ways
+# depending on the contents
+module CleanMethodable
+  extend ActiveSupport::Concern
+
+  CLEAN_METHODS = {
+    nhsnumber: :clean_nhsnumber,
+    postcode: :clean_postcode, get_postcode: :clean_postcode,
+    lpi: :clean_lpi,
+    gender: :clean_gender, sex: :clean_sex, sex_c: :clean_sex_c,
+    name: :clean_name,
+    ethniccategory: :clean_ethniccategory,
+    code: :clean_code, code_icd: :clean_code_icd, icd: :clean_icd,
+    code_opcs: :clean_code_opcs,
+    hospitalnumber: :clean_hospitalnumber,
+    xmlsafe: :clean_xmlsafe, make_xml_safe: :clean_xmlsafe,
+    roman5: :clean_roman5,
+    tnmcategory: :clean_tnmcategory,
+    strip: :strip, upcase: :upcase, itself: :itself
+  }.freeze
+
+  # Cleans the string using the method that CLEAN_METHODS maps +what+ to.
+  # Any +what+ that is not a key of CLEAN_METHODS falls through to the gsub below.
+  def clean(what)
+    cleaning_method = CLEAN_METHODS[what]
+    return send(cleaning_method) if cleaning_method
+
+    gsub(' ?', ' ')
+  end
+
+  private
+
+  def clean_nhsnumber
+    delete('^0-9')[0..9]
+  end
+
+  def clean_postcode
+    postcodeize(:db)
+  end
+
+  def clean_lpi
+    upcase.delete('^0-9A-Z')
+  end
+
+  def clean_gender
+    return '1' if self =~ /\AM(ale)?/i
+    return '2' if self =~ /\AF(emale)?/i
+
+    self
+  end
+
+  def clean_sex
+    # SECURE: BNS 2012-10-09: But may behave oddly for multi-line input
+    return '1' if self =~ /^M|1/i
+    return '2' if self =~ /^F|2/i
+
+    '0'
+  end
+
+  def clean_sex_c
+    return 'M' if self =~ /^M|1/i
+    return 'F' if self =~ /^F|2/i
+
+    ''
+  end
+
+  def clean_name
+    substitutions = {
+      '.' => '',
+      /,|;/ => ' ',
+      /\s{2,}/ => ' ',
+      '`' => '\''
+    }
+    substitutions.inject(upcase) { |a, e| a.gsub(*e) }.strip
+  end
+
+  def clean_ethniccategory
+    replace_ethniccategory = {
+      '0' => '0', '1' => 'M', '2' => 'N',
+      '3' => 'H', '4' => 'J', '5' => 'K',
+      '6' => 'R', '7' => '8', '&' => 'X',
+      ' ' => 'X', '99' => 'X'
+    }
+    replace_ethniccategory[self] || upcase
+  end
+
+  def clean_code
+    split_on_separators.map do |code|
+      code.blank? ? next : code.delete('.')
+    end.compact.join(' ')
+  end
+
+  def clean_code_icd
+    warn '[DEPRECATION] clean(:code_icd) is deprecated - consider using clean(:icd) instead.'
+    # regexp = /[A-Z][0-9]{2}(\.(X|[0-9]{1,2})|[0-9]?)( *(D|A)( |,|;|$))/
+    codes = upcase.split_on_separators.delete_if { |x| x.squash.blank? }
+    cleaned_codes = []
+    codes.each do |code|
+      if code == 'D' || code == 'A'
+        cleaned_codes[-1] += code
+      else
+        cleaned_codes << code
+      end
+    end
+    cleaned_codes.join(' ')
+  end
+
+  def clean_icd
+    codes = upcase.squish.split_on_separators.reject(&:blank?)
+    codes.map { |code| code.gsub(/(?<=\d)(\.?X?)/, '') }.join(' ')
+  end
+
+  def clean_hospitalnumber
+    self[-1..-1] =~ /\d/ ? self : self[0..-2]
+  end
+
+  def clean_xmlsafe
+    strip_xml_unsafe_characters
+  end
+
+  def clean_roman5
+    # This deromanises roman numerals between 1 and 5
+    # The mapping is defined on String, so it must be qualified here: constant
+    # lookup inside this module does not search the class that includes it.
+    gsub(/[IV]+/i) { |match| String::ROMAN_ONE_TO_FIVE_MAPPING[match.upcase] }
+  end
+
+  def clean_tnmcategory
+    sub!(/\A[tnm]/i, '')
+    if self =~ /\Ax\z/i
+      upcase
+    else
+      downcase
+    end
+  end
+
+  def clean_code_opcs
+    split_on_separators.map do |code|
+      db_code = code.squash
+      next unless 4 == db_code.length || db_code =~ /CZ00[12]/
+      db_code
+    end.compact.join(' ')
+  end
+end
diff --git a/lib/ndr_support/string/cleaning.rb b/lib/ndr_support/string/cleaning.rb
index a3773fc..7681c92 100644
--- a/lib/ndr_support/string/cleaning.rb
+++ b/lib/ndr_support/string/cleaning.rb
@@ -1,6 +1,11 @@
 require 'active_support/core_ext/string/filters'
+require 'ndr_support/string/clean_methodable'
 
+# Extends String clean with various methods of cleaning strings
+# and polishing them
 class String
+  include CleanMethodable
+
   INVALID_CONTROL_CHARS = /[\x00-\x08\x0b-\x0c\x0e-\x1f]/
   ROMAN_ONE_TO_FIVE_MAPPING = { 'I' => '1', 'II' => '2', 'III' => '3', 'IIII' => '4', 'IV' => '4', 'V' => '5' }
 
@@ -26,9 +31,8 @@ def squash
   # Parameter "option" can be :user, :compact, :db
   def postcodeize(option = :user)
     nspce = gsub(/[[:space:]]/, '').upcase
-    unless nspce.blank? || POSTCODE_REGEXP =~ nspce
-      return self # Don't change old-style or malformed postcodes
-    end
+    return self unless nspce.blank? || POSTCODE_REGEXP =~ nspce # Don't change old-style or malformed postcodes
+
     case option
     when :compact
       nspce
@@ -43,106 +47,6 @@ def postcodeize(option = :user)
     end
   end
 
-  def clean(what)
-    case what
-    when :nhsnumber
-      delete('^0-9')[0..9]
-    when :postcode, :get_postcode
-      postcodeize(:db)
-    when :lpi
-      upcase.delete('^0-9A-Z')
-    when :gender
-      if self =~ /\AM(ale)?/i
-        '1'
-      elsif self =~ /\AF(emale)?/i
-        '2'
-      else
-        self
-      end
-    when :sex
-      # SECURE: BNS 2012-10-09: But may behave oddly for multi-line input
-      if self =~ /^M|1/i
-        '1'
-      elsif self =~ /^F|2/i
-        '2'
-      else
-        '0'
-      end
-    when :sex_c
-      if self =~ /^M|1/i
-        'M'
-      elsif self =~ /^F|2/i
-        'F'
-      else
-        ''
-      end
-    when :name
-      substitutions = {
-        '.' => '',
-        /,|;/ => ' ',
-        /\s{2,}/ => ' ',
-        '`' => '\''
-      }
-      substitutions.inject(upcase) { |a, e| a.gsub(*e) }.strip
-    when :ethniccategory
-      replace_ethniccategory = {
-        '0' => '0',
-        '1' => 'M',
-        '2' => 'N',
-        '3' => 'H',
-        '4' => 'J',
-        '5' => 'K',
-        '6' => 'R',
-        '7' => '8',
-        '&' => 'X',
-        ' ' => 'X',
-        '99' => 'X'
-      }
-      replace_ethniccategory[self] || upcase
-    when :code
-      split_on_separators.map do |code|
-        code.blank? ? next : code.delete('.')
-      end.compact.join(' ')
-    when :code_icd
-      warn '[DEPRECATION] clean(:code_icd) is deprecated - consider using clean(:icd) instead.'
-      # regexp = /[A-Z][0-9]{2}(\.(X|[0-9]{1,2})|[0-9]?)( *(D|A)( |,|;|$))/
-      codes = upcase.split_on_separators.delete_if { |x| x.squash.blank? }
-      cleaned_codes = []
-      codes.each do |code|
-        if code == 'D' || code == 'A'
-          cleaned_codes[-1] += code
-        else
-          cleaned_codes << code
-        end
-      end
-      cleaned_codes.join(' ')
-    when :icd
-      codes = upcase.squish.split_on_separators.reject(&:blank?)
-      codes.map { |code| code.gsub(/(?<=\d)(\.?X?)/, '') }.join(' ')
-    when :code_opcs
-      clean_code_opcs
-    when :hospitalnumber
-      self[-1..-1] =~ /\d/ ? self : self[0..-2]
-    when :xmlsafe, :make_xml_safe
-      strip_xml_unsafe_characters
-    when :roman5
-      # This deromanises roman numerals between 1 and 5
-      gsub(/[IV]+/i) { |match| ROMAN_ONE_TO_FIVE_MAPPING[match.upcase] }
-    when :tnmcategory
-      sub!(/\A[tnm]/i, '')
-      if self =~ /\Ax\z/i
-        upcase
-      else
-        downcase
-      end
-    when :strip, :upcase, :itself
-      # SECURE: 14-06-2017 TPG Fixed list of executable methods (whats)
-      send(what)
-    else
-      gsub(' ?', ' ')
-    end
-  end
-
   def strip_xml_unsafe_characters
     gsub(String::INVALID_CONTROL_CHARS, '')
   end
@@ -156,14 +60,4 @@ def xml_unsafe?
   def split_on_separators(regexp = / |,|;/)
     split(regexp)
   end
-
-  private
-
-  def clean_code_opcs
-    split_on_separators.map do |code|
-      db_code = code.squash
-      next unless 4 == db_code.length || db_code =~ /CZ00[12]/
-      db_code
-    end.compact.join(' ')
-  end
 end