Skip to content

Commit

Permalink
Active Normalizer
Browse files Browse the repository at this point in the history
  • Loading branch information
Juanito Fatas committed Jun 15, 2018
0 parents commit 445321f
Show file tree
Hide file tree
Showing 34 changed files with 1,279 additions and 0 deletions.
11 changes: 11 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/.bundle/
/.yardoc
/_yardoc/
/coverage/
/doc/
/pkg/
/spec/reports/
/tmp/

# rspec failure tracking
.rspec_status
3 changes: 3 additions & 0 deletions .rspec
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
--format documentation
--color
--require spec_helper
4 changes: 4 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
sudo: false
language: ruby
rvm:
- 2.5.1
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Changelog
22 changes: 22 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
source "https://rubygems.org"

# Lets `github: "owner/repo"` shorthand resolve to an HTTPS clone URL.
git_source(:github) do |repo_name|
  "https://github.com/#{repo_name}"
end

# The gem's own dependencies are declared in active_normalizer.gemspec.
gemspec

# Tooling used while working on the gem.
group :development do
  gem "bundler", "~> 1.0"
  gem "rake", "~> 10.0"
  gem "rspec", "~> 3.7"
  gem "pry"
  gem "benchmark-ips"
end

# Gems backing the optional normalizers; they are not dependencies of the
# gem itself.
group :test do
  gem "unf"
  gem "unicode"
  gem "unicode_utils"
  gem "activesupport"
end
64 changes: 64 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
PATH
remote: .
specs:
active_normalizer (0.0.0)

GEM
remote: https://rubygems.org/
specs:
activesupport (5.2.0)
concurrent-ruby (~> 1.0, >= 1.0.2)
i18n (>= 0.7, < 2)
minitest (~> 5.1)
tzinfo (~> 1.1)
benchmark-ips (2.7.2)
coderay (1.1.2)
concurrent-ruby (1.0.5)
diff-lcs (1.3)
i18n (1.0.1)
concurrent-ruby (~> 1.0)
method_source (0.9.0)
minitest (5.11.3)
pry (0.11.3)
coderay (~> 1.1.0)
method_source (~> 0.9.0)
rake (10.5.0)
rspec (3.7.0)
rspec-core (~> 3.7.0)
rspec-expectations (~> 3.7.0)
rspec-mocks (~> 3.7.0)
rspec-core (3.7.1)
rspec-support (~> 3.7.0)
rspec-expectations (3.7.0)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.7.0)
rspec-mocks (3.7.0)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.7.0)
rspec-support (3.7.1)
thread_safe (0.3.6)
tzinfo (1.2.5)
thread_safe (~> 0.1)
unf (0.1.4)
unf_ext
unf_ext (0.0.7.5)
unicode (0.4.4.4)
unicode_utils (1.4.0)

PLATFORMS
ruby

DEPENDENCIES
active_normalizer!
activesupport
benchmark-ips
bundler (~> 1.0)
pry
rake (~> 10.0)
rspec (~> 3.7)
unf
unicode
unicode_utils

BUNDLED WITH
1.16.2
21 changes: 21 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2018 Juanito Fatas

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
184 changes: 184 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
# Active Normalizer

Normalize weird Japanese characters; see [tests](/spec) for examples.

Normalizes fullwidth and halfwidth hiragana, katakana, and symbols.

## Usage

Each normalizer class accepts one of the normalization form options `:nfc`, `:nfd`, `:nfkd`, or `:nfkc` (see [Normalization Forms][unicode-nf] for more information).
Each normalizer instance responds to `run`.

```ruby
require "active_normalizer/normalizers/ruby"
nfkc_normalizer = ActiveNormalizer.new(
ActiveNormalizer::Normalizers::Ruby,
options: :nfkc
)
nfkc_normalizer.run(input)
```

## Benchmark

```
Benchmarking simple string: 800ー12345
Warming up --------------------------------------
UNF 92.981k i/100ms
Unicode 36.002k i/100ms
Ruby 17.044k i/100ms
UnicodeUtils 12.681k i/100ms
ActiveSupport 7.482k i/100ms
Calculating -------------------------------------
UNF 1.173M (±17.6%) i/s - 5.672M in 5.041037s
Unicode 404.502k (± 6.8%) i/s - 2.016M in 5.008748s
Ruby 191.562k (±30.3%) i/s - 835.156k in 5.106057s
UnicodeUtils 132.477k (± 5.3%) i/s - 672.093k in 5.088759s
ActiveSupport 75.011k (±34.9%) i/s - 329.208k in 5.058559s
Comparison:
UNF: 1172663.8 i/s
Unicode: 404502.1 i/s - 2.90x slower
Ruby: 191562.4 i/s - 6.12x slower
UnicodeUtils: 132477.3 i/s - 8.85x slower
ActiveSupport: 75010.6 i/s - 15.63x slower
Warming up --------------------------------------
UNF 67.181k i/100ms
Unicode 31.572k i/100ms
Ruby 14.947k i/100ms
UnicodeUtils 12.443k i/100ms
ActiveSupport 5.561k i/100ms
Calculating -------------------------------------
UNF 997.098k (±25.2%) i/s - 27.477M in 30.052018s
Unicode 328.071k (±19.5%) i/s - 9.503M in 30.090451s
Ruby 177.045k (±32.8%) i/s - 4.529M in 30.071040s
UnicodeUtils 134.513k (± 6.7%) i/s - 4.019M in 30.059621s
ActiveSupport 68.063k (±44.7%) i/s - 1.668M in 30.131968s
Comparison:
UNF: 997097.6 i/s
Unicode: 328070.8 i/s - 3.04x slower
Ruby: 177044.6 i/s - 5.63x slower
UnicodeUtils: 134512.7 i/s - 7.41x slower
ActiveSupport: 68063.1 i/s - 14.65x slower
Benchmarking longer string: ㍻㍼㍽㍾㌀㌁㌂㌃㌄㌅㌆㌇㌈㌉㌊㌋㌌㌍㌎㌏㌐㌑㌒㌓㌔㌕㌖㌗㌘㌙㌚㌛㌜㌝㌞㌟㌠㌡㌢㌣㌤㌥㌦㌧㌨㌩㌪㌫㌬㌭㌮㌯㌰㌱㌲㌳㌴㌵㌶㌷㌸㌹㌺㌻㌼㌽㌾㌿㍀㍁㍂㍃㍄㍅㍆㍇㍈㍉㍊㍋㍌㍍㍎㍏㍐㍑㍒㍓㍔㍕㍖㍗
Warming up --------------------------------------
UNF 6.023k i/100ms
Unicode 1.238k i/100ms
Ruby 1.068k i/100ms
UnicodeUtils 319.000 i/100ms
ActiveSupport 258.000 i/100ms
Calculating -------------------------------------
UNF 59.891k (± 6.8%) i/s - 301.150k in 5.055411s
Unicode 11.740k (± 9.0%) i/s - 59.424k in 5.103353s
Ruby 10.655k (±10.9%) i/s - 53.400k in 5.091860s
UnicodeUtils 3.087k (± 8.9%) i/s - 15.312k in 5.004688s
ActiveSupport 2.533k (±11.1%) i/s - 12.642k in 5.064477s
Comparison:
UNF: 59890.8 i/s
Unicode: 11740.2 i/s - 5.10x slower
Ruby: 10655.0 i/s - 5.62x slower
UnicodeUtils: 3087.4 i/s - 19.40x slower
ActiveSupport: 2532.6 i/s - 23.65x slower
Warming up --------------------------------------
UNF 5.739k i/100ms
Unicode 1.122k i/100ms
Ruby 1.113k i/100ms
UnicodeUtils 312.000 i/100ms
ActiveSupport 254.000 i/100ms
Calculating -------------------------------------
UNF 59.371k (± 4.4%) i/s - 1.779M in 30.026571s
Unicode 10.780k (±17.3%) i/s - 310.794k in 30.106556s
Ruby 11.144k (± 6.7%) i/s - 332.787k in 30.034689s
UnicodeUtils 3.164k (± 4.9%) i/s - 94.848k in 30.056928s
ActiveSupport 2.635k (± 8.8%) i/s - 78.486k in 30.075836s
Comparison:
UNF: 59371.2 i/s
Ruby: 11143.9 i/s - 5.33x slower
Unicode: 10779.6 i/s - 5.51x slower
UnicodeUtils: 3163.5 i/s - 18.77x slower
ActiveSupport: 2635.3 i/s - 22.53x slower
```

Benchmark code can be found at [bin/benchmark](bin/benchmark).

## Installation

Add this line to your application's Gemfile:

```ruby
gem "active_normalizer"
```

And then execute:

$ bundle

Or install it yourself as:

$ gem install active_normalizer

## Dependencies

Active Normalizer provides a handful of normalizers. Except for the one that relies only on Ruby's standard library, their dependencies are not bundled with this gem: you must add the gem required by the normalizer you use to your own Gemfile.

#### ActiveNormalizer::Normalizers::Ruby

```ruby
# no dependency required, standard library

require "active_normalizer/normalizers/ruby"
```

#### ActiveNormalizer::Normalizers::UNF - unf

```ruby
gem "unf"

require "active_normalizer/normalizers/unf"
```

#### ActiveNormalizer::Normalizers::Unicode - unicode

```ruby
gem "unicode"

require "active_normalizer/normalizers/unicode"
```

#### ActiveNormalizer::Normalizers::UnicodeUtils - unicode_utils

```ruby
gem "unicode_utils"

require "active_normalizer/normalizers/unicode_utils"
```

#### ActiveNormalizer::Normalizers::ActiveSupport - activesupport

```ruby
gem "activesupport"

require "active_normalizer/normalizers/active_support"
```

## Development

After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/hack` for an interactive prompt that will allow you to experiment.

To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).

## Contributing

Bug reports and pull requests are welcome on GitHub at https://github.com/JuanitoFatas/active_normalizer.

## License

The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).

[unicode-nf]: http://unicode.org/reports/tr15/#Norm_Forms
6 changes: 6 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
require "bundler/gem_tasks"
require "rspec/core/rake_task"

# Define the `spec` Rake task, which runs the RSpec suite.
RSpec::Core::RakeTask.new(:spec)

# Make `spec` the default task, so a bare `rake` runs the specs.
task default: :spec
61 changes: 61 additions & 0 deletions bin/benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/env ruby

require "bundler/setup"
require "active_normalizer"

require "active_normalizer/normalizers/unf"
require "active_normalizer/normalizers/unicode"
require "active_normalizer/normalizers/ruby"
require "active_normalizer/normalizers/unicode_utils"
require "active_normalizer/normalizers/active_support"

require "benchmark/ips"

# Suite object handed to Benchmark.ips through `config(suite: ...)`.
#
# The `warming` and `running` hooks each force a full garbage collection and
# then leave the GC disabled; `warmup_stats` and `add_report` are no-ops.
# All hooks ignore whatever arguments they receive.
class GCSuite
  # Collects garbage, then disables the GC. Returns the result of GC.disable.
  def warming(*)
    run_gc
  end

  # Collects garbage, then disables the GC. Returns the result of GC.disable.
  def running(*)
    run_gc
  end

  # Intentionally empty; returns nil.
  def warmup_stats(*)
  end

  # Intentionally empty; returns nil.
  def add_report(*)
  end

  private

  # Re-enables the GC so that a collection can run, triggers one, and then
  # switches the GC off again.
  def run_gc
    GC.enable
    GC.start
    GC.disable
  end
end

# Benchmarks the NFKC normalization of +input+ with every normalizer, twice:
# first a short pass (time: 5, warmup: 2), then a longer one
# (time: 30, warmup: 10). Each pass prints a comparison via `compare!`.
#
# input - String handed to each normalizer's `run`.
# suite - object passed to Benchmark.ips as its `suite` option.
#
# Returns whatever the final Benchmark.ips call returns.
def benchmark(input, suite)
  # Report label => normalizer class, in the order they are reported.
  normalizers = {
    "UNF" => ActiveNormalizer::Normalizers::UNF,
    "Unicode" => ActiveNormalizer::Normalizers::Unicode,
    "Ruby" => ActiveNormalizer::Normalizers::Ruby,
    "UnicodeUtils" => ActiveNormalizer::Normalizers::UnicodeUtils,
    "ActiveSupport" => ActiveNormalizer::Normalizers::ActiveSupport
  }
  timings = [{ time: 5, warmup: 2 }, { time: 30, warmup: 10 }]

  reports = timings.map do |timing|
    Benchmark.ips do |job|
      job.config(timing)

      job.config(suite: suite)
      normalizers.each do |label, normalizer|
        # A fresh normalizer is built on every iteration, so construction
        # cost is part of what is measured.
        job.report(label) { normalizer.new(:nfkc).run(input) }
      end

      job.compare!
    end
  end

  reports.last
end

# One GC-controlling suite is shared by both benchmark runs.
gc_suite = GCSuite.new

# Short input.
simple_input = "800ー12345"
puts "Benchmarking simple string: #{simple_input}"
benchmark(simple_input, gc_suite)

# Longer input.
longer_input = "㍻㍼㍽㍾㌀㌁㌂㌃㌄㌅㌆㌇㌈㌉㌊㌋㌌㌍㌎㌏㌐㌑㌒㌓㌔㌕㌖㌗㌘㌙㌚㌛㌜㌝㌞㌟㌠㌡㌢㌣㌤㌥㌦㌧㌨㌩㌪㌫㌬㌭㌮㌯㌰㌱㌲㌳㌴㌵㌶㌷㌸㌹㌺㌻㌼㌽㌾㌿㍀㍁㍂㍃㍄㍅㍆㍇㍈㍉㍊㍋㍌㍍㍎㍏㍐㍑㍒㍓㍔㍕㍖㍗"
puts "\nBenchmarking longer string: #{longer_input}"
benchmark(longer_input, gc_suite)
10 changes: 10 additions & 0 deletions bin/hack
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env ruby

require "bundler/setup"
require "active_normalizer"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# Start an interactive Pry session, passing this script's path as the target.
require "pry"
Pry.start(__FILE__)
Loading

0 comments on commit 445321f

Please sign in to comment.