forked from rails/rails
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add methods for string verification and encoding cleanup code.
Signed-off-by: Michael Koziarski <michael@koziarski.com>
- Loading branch information
Showing
4 changed files
with
165 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
module ActiveSupport #:nodoc:
  module Multibyte #:nodoc:
    # Returns a regular expression that matches valid characters in the current encoding.
    #
    # The expression is looked up in VALID_CHARACTER based on $KCODE: 'UTF8' selects the
    # 'UTF-8' entry and 'SJIS' selects the 'Shift_JIS' entry. Returns nil for any other
    # $KCODE value.
    def self.valid_character
      case $KCODE
      when 'UTF8'
        VALID_CHARACTER['UTF-8']
      when 'SJIS'
        VALID_CHARACTER['Shift_JIS']
      end
    end

    # Verifies the encoding of a string.
    #
    # Returns true when every piece produced by <tt>string.split(//)</tt> matches the
    # expression from valid_character, and false as soon as one piece does not match.
    # Returns true without inspecting the string when valid_character returns nil.
    def self.verify(string)
      # Look the expression up once instead of once per character.
      expression = valid_character
      return true unless expression

      # NOTE(review): split(//) presumably splits on character boundaries for the
      # active $KCODE -- confirm against the targeted Ruby version.
      string.split(//).all? { |c| expression =~ c }
    end

    # Verifies the encoding of the string and raises an exception when it's not valid.
    #
    # Raises ActiveSupport::Multibyte::Handlers::EncodingError when verify returns false;
    # returns nil otherwise.
    def self.verify!(string)
      raise ActiveSupport::Multibyte::Handlers::EncodingError, "Found characters with invalid encoding" unless verify(string)
    end

    # Removes all invalid characters from the string.
    #
    # Returns a new string built from the pieces of <tt>string.split(//)</tt> that match
    # the expression from valid_character. When valid_character returns nil the original
    # string object is returned untouched.
    def self.clean(string)
      # Look the expression up once instead of once per character.
      expression = valid_character
      return string unless expression

      string.split(//).select { |c| expression =~ c }.join
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
require 'abstract_unit' | ||
|
||
# Tests for ActiveSupport::Multibyte.valid_character, verify, verify! and clean under
# different $KCODE settings.
class MultibyteUtilsTest < Test::Unit::TestCase

  def test_valid_character_returns_an_expression_for_the_current_encoding
    with_kcode('None') do
      assert_nil ActiveSupport::Multibyte.valid_character
    end
    with_kcode('UTF8') do
      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'], ActiveSupport::Multibyte.valid_character
    end
    with_kcode('SJIS') do
      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['Shift_JIS'], ActiveSupport::Multibyte.valid_character
    end
  end

  # With $KCODE 'None' every example, whatever its bytes, is expected to verify.
  def test_verify_verifies_ASCII_strings_are_properly_encoded
    with_kcode('None') do
      examples.each do |example|
        assert ActiveSupport::Multibyte.verify(example)
      end
    end
  end

  def test_verify_verifies_UTF_8_strings_are_properly_encoded
    with_kcode('UTF8') do
      assert ActiveSupport::Multibyte.verify(example('valid UTF-8'))
      assert !ActiveSupport::Multibyte.verify(example('invalid UTF-8'))
    end
  end

  def test_verify_verifies_Shift_JIS_strings_are_properly_encoded
    with_kcode('SJIS') do
      assert ActiveSupport::Multibyte.verify(example('valid Shift-JIS'))
      assert !ActiveSupport::Multibyte.verify(example('invalid Shift-JIS'))
    end
  end

  def test_verify_bang_raises_an_exception_when_it_finds_an_invalid_character
    with_kcode('UTF8') do
      assert_raises(ActiveSupport::Multibyte::Handlers::EncodingError) do
        ActiveSupport::Multibyte.verify!(example('invalid UTF-8'))
      end
    end
  end

  def test_verify_bang_doesnt_raise_an_exception_when_the_encoding_is_valid
    with_kcode('UTF8') do
      assert_nothing_raised do
        ActiveSupport::Multibyte.verify!(example('valid UTF-8'))
      end
    end
  end

  def test_clean_leaves_ASCII_strings_intact
    with_kcode('None') do
      [
        # The second string holds the same bytes as the 'invalid UTF-8' example.
        'word', "\270\236\010\210\245"
      ].each do |string|
        assert_equal string, ActiveSupport::Multibyte.clean(string)
      end
    end
  end

  def test_clean_cleans_invalid_characters_from_UTF_8_encoded_strings
    with_kcode('UTF8') do
      cleaned_utf8 = [8].pack('C*')
      assert_equal example('valid UTF-8'), ActiveSupport::Multibyte.clean(example('valid UTF-8'))
      assert_equal cleaned_utf8, ActiveSupport::Multibyte.clean(example('invalid UTF-8'))
    end
  end

  def test_clean_cleans_invalid_characters_from_Shift_JIS_encoded_strings
    with_kcode('SJIS') do
      cleaned_sjis = [184, 0, 136, 165].pack('C*')
      assert_equal example('valid Shift-JIS'), ActiveSupport::Multibyte.clean(example('valid Shift-JIS'))
      assert_equal cleaned_sjis, ActiveSupport::Multibyte.clean(example('invalid Shift-JIS'))
    end
  end

  private

  # Example byte strings keyed by a description of their encoding validity.
  STRINGS = {
    'valid ASCII'       => [65, 83, 67, 73, 73].pack('C*'),
    'invalid ASCII'     => [128].pack('C*'),
    'valid UTF-8'       => [227, 129, 147, 227, 129, 171, 227, 129, 161, 227, 130, 143].pack('C*'),
    'invalid UTF-8'     => [184, 158, 8, 136, 165].pack('C*'),
    'valid Shift-JIS'   => [131, 122, 129, 91, 131, 128].pack('C*'),
    'invalid Shift-JIS' => [184, 158, 8, 0, 255, 136, 165].pack('C*')
  }.freeze

  # Returns the example string stored under +key+ in STRINGS.
  def example(key)
    STRINGS[key]
  end

  # Returns every example string in STRINGS.
  def examples
    STRINGS.values
  end

  # Runs the given block with $KCODE set to +code+ and restores the previous value
  # afterwards.
  def with_kcode(code)
    before = $KCODE
    $KCODE = code
    begin
      yield
    ensure
      # Restore even when the block raises (e.g. a failed assertion), so a failing
      # test cannot leak its $KCODE into the tests that run after it.
      $KCODE = before
    end
  end
end