Skip to content

Commit

Permalink
Cleaned the cleaning code
Browse files Browse the repository at this point in the history
  • Loading branch information
drewthorp committed Feb 8, 2024
1 parent a58c105 commit 81dcc4e
Show file tree
Hide file tree
Showing 2 changed files with 147 additions and 113 deletions.
140 changes: 140 additions & 0 deletions lib/ndr_support/string/clean_methodable.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# Adds the 'clean' method to String, which can be used to clean strings in various ways
# depending on the contents
module CleanMethodable
extend ActiveSupport::Concern

CLEAN_METHODS = {
nhsnumber: :clean_nhsnumber,
postcode: :clean_postcode, get_postcode: :clean_postcode,
lpi: :clean_lpi,
gender: :clean_gender, sex: :clean_sex, sex_c: :clean_sex_c,
name: :clean_name,
ethniccategory: :clean_ethniccategory,
code: :clean_code, code_icd: :clean_code_icd, icd: :clean_icd,
code_opcs: :clean_code_opcs,
hospitalnumber: :clean_hospitalnumber,
xmlsafe: :clean_xmlsafe, make_xml_safe: :clean_xmlsafe,
roman5: :clean_roman5,
tnmcategory: :clean_tnmcategory,
strip: :strip, upcase: :upcase, itself: :itself
}.freeze

def clean(what)
cleaning_method = CLEAN_METHODS[what]
return send(cleaning_method) if cleaning_method

gsub(' ?', ' ')
end

private

def clean_nhsnumber
delete('^0-9')[0..9]
end

def clean_postcode
postcodeize(:db)
end

def clean_lpi
upcase.delete('^0-9A-Z')
end

def clean_gender
return '1' if self =~ /\AM(ale)?/i
return '2' if self =~ /\AF(emale)?/i

self
end

def clean_sex
# SECURE: BNS 2012-10-09: But may behave oddly for multi-line input
return '1' if self =~ /^M|1/i
return '2' if self =~ /^F|2/i

'0'
end

def clean_sex_c
return 'M' if self =~ /^M|1/i
return 'F' if self =~ /^F|2/i

''
end

def clean_name
substitutions = {
'.' => '',
/,|;/ => ' ',
/\s{2,}/ => ' ',
'`' => '\''
}
substitutions.inject(upcase) { |a, e| a.gsub(*e) }.strip
end

def clean_ethniccategory
replace_ethniccategory = {
'0' => '0', '1' => 'M', '2' => 'N',
'3' => 'H', '4' => 'J', '5' => 'K',
'6' => 'R', '7' => '8', '&' => 'X',
' ' => 'X', '99' => 'X'
}
replace_ethniccategory[self] || upcase
end

def clean_code
split_on_separators.map do |code|
code.blank? ? next : code.delete('.')
end.compact.join(' ')
end

def clean_code_icd
warn '[DEPRECATION] clean(:code_icd) is deprecated - consider using clean(:icd) instead.'
# regexp = /[A-Z][0-9]{2}(\.(X|[0-9]{1,2})|[0-9]?)( *(D|A)( |,|;|$))/
codes = upcase.split_on_separators.delete_if { |x| x.squash.blank? }
cleaned_codes = []
codes.each do |code|
if code == 'D' || code == 'A'
cleaned_codes[-1] += code
else
cleaned_codes << code
end
end
cleaned_codes.join(' ')
end

def clean_icd
codes = upcase.squish.split_on_separators.reject(&:blank?)
codes.map { |code| code.gsub(/(?<=\d)(\.?X?)/, '') }.join(' ')
end

def clean_hospitalnumber
self[-1..-1] =~ /\d/ ? self : self[0..-2]
end

def clean_xmlsafe
strip_xml_unsafe_characters
end

def clean_roman5
# This deromanises roman numerals between 1 and 5
gsub(/[IV]+/i) { |match| ROMAN_ONE_TO_FIVE_MAPPING[match.upcase] }
end

def clean_tnmcategory
sub!(/\A[tnm]/i, '')
if self =~ /\Ax\z/i
upcase
else
downcase
end
end

def clean_code_opcs
split_on_separators.map do |code|
db_code = code.squash
next unless 4 == db_code.length || db_code =~ /CZ00[12]/
db_code
end.compact.join(' ')
end
end
120 changes: 7 additions & 113 deletions lib/ndr_support/string/cleaning.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
require 'active_support/core_ext/string/filters'
require 'ndr_support/string/clean_methodable'

# Extends String clean with various methods of cleaning strings
# zand polishing them
class String
include CleanMethodable

INVALID_CONTROL_CHARS = /[\x00-\x08\x0b-\x0c\x0e-\x1f]/
ROMAN_ONE_TO_FIVE_MAPPING = { 'I' => '1', 'II' => '2', 'III' => '3', 'IIII' => '4', 'IV' => '4', 'V' => '5' }

Expand All @@ -26,9 +31,8 @@ def squash
# Parameter "option" can be :user, :compact, :db
def postcodeize(option = :user)
nspce = gsub(/[[:space:]]/, '').upcase
unless nspce.blank? || POSTCODE_REGEXP =~ nspce
return self # Don't change old-style or malformed postcodes
end
return self unless nspce.blank? || POSTCODE_REGEXP =~ nspce # Don't change old-style or malformed postcodes

case option
when :compact
nspce
Expand All @@ -43,106 +47,6 @@ def postcodeize(option = :user)
end
end

def clean(what)
case what
when :nhsnumber
delete('^0-9')[0..9]
when :postcode, :get_postcode
postcodeize(:db)
when :lpi
upcase.delete('^0-9A-Z')
when :gender
if self =~ /\AM(ale)?/i
'1'
elsif self =~ /\AF(emale)?/i
'2'
else
self
end
when :sex
# SECURE: BNS 2012-10-09: But may behave oddly for multi-line input
if self =~ /^M|1/i
'1'
elsif self =~ /^F|2/i
'2'
else
'0'
end
when :sex_c
if self =~ /^M|1/i
'M'
elsif self =~ /^F|2/i
'F'
else
''
end
when :name
substitutions = {
'.' => '',
/,|;/ => ' ',
/\s{2,}/ => ' ',
'`' => '\''
}
substitutions.inject(upcase) { |a, e| a.gsub(*e) }.strip
when :ethniccategory
replace_ethniccategory = {
'0' => '0',
'1' => 'M',
'2' => 'N',
'3' => 'H',
'4' => 'J',
'5' => 'K',
'6' => 'R',
'7' => '8',
'&' => 'X',
' ' => 'X',
'99' => 'X'
}
replace_ethniccategory[self] || upcase
when :code
split_on_separators.map do |code|
code.blank? ? next : code.delete('.')
end.compact.join(' ')
when :code_icd
warn '[DEPRECATION] clean(:code_icd) is deprecated - consider using clean(:icd) instead.'
# regexp = /[A-Z][0-9]{2}(\.(X|[0-9]{1,2})|[0-9]?)( *(D|A)( |,|;|$))/
codes = upcase.split_on_separators.delete_if { |x| x.squash.blank? }
cleaned_codes = []
codes.each do |code|
if code == 'D' || code == 'A'
cleaned_codes[-1] += code
else
cleaned_codes << code
end
end
cleaned_codes.join(' ')
when :icd
codes = upcase.squish.split_on_separators.reject(&:blank?)
codes.map { |code| code.gsub(/(?<=\d)(\.?X?)/, '') }.join(' ')
when :code_opcs
clean_code_opcs
when :hospitalnumber
self[-1..-1] =~ /\d/ ? self : self[0..-2]
when :xmlsafe, :make_xml_safe
strip_xml_unsafe_characters
when :roman5
# This deromanises roman numerals between 1 and 5
gsub(/[IV]+/i) { |match| ROMAN_ONE_TO_FIVE_MAPPING[match.upcase] }
when :tnmcategory
sub!(/\A[tnm]/i, '')
if self =~ /\Ax\z/i
upcase
else
downcase
end
when :strip, :upcase, :itself
# SECURE: 14-06-2017 TPG Fixed list of executable methods (whats)
send(what)
else
gsub(' ?', ' ')
end
end

def strip_xml_unsafe_characters
gsub(String::INVALID_CONTROL_CHARS, '')
end
Expand All @@ -156,14 +60,4 @@ def xml_unsafe?
def split_on_separators(regexp = / |,|;/)
split(regexp)
end

private

def clean_code_opcs
split_on_separators.map do |code|
db_code = code.squash
next unless 4 == db_code.length || db_code =~ /CZ00[12]/
db_code
end.compact.join(' ')
end
end

0 comments on commit 81dcc4e

Please sign in to comment.