-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Juanito Fatas
committed
Jun 15, 2018
0 parents
commit 445321f
Showing
34 changed files
with
1,279 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
/.bundle/ | ||
/.yardoc | ||
/_yardoc/ | ||
/coverage/ | ||
/doc/ | ||
/pkg/ | ||
/spec/reports/ | ||
/tmp/ | ||
|
||
# rspec failure tracking | ||
.rspec_status |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
--format documentation | ||
--color | ||
--require spec_helper |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
sudo: false | ||
language: ruby | ||
rvm: | ||
- 2.5.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Changelog |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
source "https://rubygems.org" | ||
|
||
git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } | ||
|
||
# Specify your gem's dependencies in active_normalizer.gemspec | ||
gemspec | ||
|
||
|
||
group :development do | ||
gem "bundler", "~> 1.0" | ||
gem "rake", "~> 10.0" | ||
gem "rspec", "~> 3.7" | ||
gem "pry" | ||
gem "benchmark-ips" | ||
end | ||
|
||
group :test do | ||
gem "unf" | ||
gem "unicode" | ||
gem "unicode_utils" | ||
gem "activesupport" | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
PATH | ||
remote: . | ||
specs: | ||
active_normalizer (0.0.0) | ||
|
||
GEM | ||
remote: https://rubygems.org/ | ||
specs: | ||
activesupport (5.2.0) | ||
concurrent-ruby (~> 1.0, >= 1.0.2) | ||
i18n (>= 0.7, < 2) | ||
minitest (~> 5.1) | ||
tzinfo (~> 1.1) | ||
benchmark-ips (2.7.2) | ||
coderay (1.1.2) | ||
concurrent-ruby (1.0.5) | ||
diff-lcs (1.3) | ||
i18n (1.0.1) | ||
concurrent-ruby (~> 1.0) | ||
method_source (0.9.0) | ||
minitest (5.11.3) | ||
pry (0.11.3) | ||
coderay (~> 1.1.0) | ||
method_source (~> 0.9.0) | ||
rake (10.5.0) | ||
rspec (3.7.0) | ||
rspec-core (~> 3.7.0) | ||
rspec-expectations (~> 3.7.0) | ||
rspec-mocks (~> 3.7.0) | ||
rspec-core (3.7.1) | ||
rspec-support (~> 3.7.0) | ||
rspec-expectations (3.7.0) | ||
diff-lcs (>= 1.2.0, < 2.0) | ||
rspec-support (~> 3.7.0) | ||
rspec-mocks (3.7.0) | ||
diff-lcs (>= 1.2.0, < 2.0) | ||
rspec-support (~> 3.7.0) | ||
rspec-support (3.7.1) | ||
thread_safe (0.3.6) | ||
tzinfo (1.2.5) | ||
thread_safe (~> 0.1) | ||
unf (0.1.4) | ||
unf_ext | ||
unf_ext (0.0.7.5) | ||
unicode (0.4.4.4) | ||
unicode_utils (1.4.0) | ||
|
||
PLATFORMS | ||
ruby | ||
|
||
DEPENDENCIES | ||
active_normalizer! | ||
activesupport | ||
benchmark-ips | ||
bundler (~> 1.0) | ||
pry | ||
rake (~> 10.0) | ||
rspec (~> 3.7) | ||
unf | ||
unicode | ||
unicode_utils | ||
|
||
BUNDLED WITH | ||
1.16.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
The MIT License (MIT) | ||
|
||
Copyright (c) 2018 Juanito Fatas | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
# Active Normalizer | ||
|
||
Normalize weird Japanese characters; see the [tests](/spec) for examples. | ||
|
||
Normalizes fullwidth and halfwidth hiragana, katakana, and symbols. | ||
|
||
## Usage | ||
|
||
Each normalizer class accepts one of the normalization forms `:nfc`, `:nfd`, `:nfkd`, or `:nfkc` (see [Normalization Forms][unicode-nf] for more information). | ||
Each normalizer instance responds to `run`. | ||
|
||
```ruby | ||
require "active_normalizer/normalizers/ruby" | ||
nfkc_normalizer = ActiveNormalizer.new( | ||
ActiveNormalizer::Normalizers::Ruby, | ||
options: :nfkc | ||
) | ||
nfkc_normalizer.run(input) | ||
``` | ||
|
||
## Benchmark | ||
|
||
``` | ||
Benchmarking simple string: 800ー12345 | ||
Warming up -------------------------------------- | ||
UNF 92.981k i/100ms | ||
Unicode 36.002k i/100ms | ||
Ruby 17.044k i/100ms | ||
UnicodeUtils 12.681k i/100ms | ||
ActiveSupport 7.482k i/100ms | ||
Calculating ------------------------------------- | ||
UNF 1.173M (±17.6%) i/s - 5.672M in 5.041037s | ||
Unicode 404.502k (± 6.8%) i/s - 2.016M in 5.008748s | ||
Ruby 191.562k (±30.3%) i/s - 835.156k in 5.106057s | ||
UnicodeUtils 132.477k (± 5.3%) i/s - 672.093k in 5.088759s | ||
ActiveSupport 75.011k (±34.9%) i/s - 329.208k in 5.058559s | ||
Comparison: | ||
UNF: 1172663.8 i/s | ||
Unicode: 404502.1 i/s - 2.90x slower | ||
Ruby: 191562.4 i/s - 6.12x slower | ||
UnicodeUtils: 132477.3 i/s - 8.85x slower | ||
ActiveSupport: 75010.6 i/s - 15.63x slower | ||
Warming up -------------------------------------- | ||
UNF 67.181k i/100ms | ||
Unicode 31.572k i/100ms | ||
Ruby 14.947k i/100ms | ||
UnicodeUtils 12.443k i/100ms | ||
ActiveSupport 5.561k i/100ms | ||
Calculating ------------------------------------- | ||
UNF 997.098k (±25.2%) i/s - 27.477M in 30.052018s | ||
Unicode 328.071k (±19.5%) i/s - 9.503M in 30.090451s | ||
Ruby 177.045k (±32.8%) i/s - 4.529M in 30.071040s | ||
UnicodeUtils 134.513k (± 6.7%) i/s - 4.019M in 30.059621s | ||
ActiveSupport 68.063k (±44.7%) i/s - 1.668M in 30.131968s | ||
Comparison: | ||
UNF: 997097.6 i/s | ||
Unicode: 328070.8 i/s - 3.04x slower | ||
Ruby: 177044.6 i/s - 5.63x slower | ||
UnicodeUtils: 134512.7 i/s - 7.41x slower | ||
ActiveSupport: 68063.1 i/s - 14.65x slower | ||
Benchmarking longer string: ㍻㍼㍽㍾㌀㌁㌂㌃㌄㌅㌆㌇㌈㌉㌊㌋㌌㌍㌎㌏㌐㌑㌒㌓㌔㌕㌖㌗㌘㌙㌚㌛㌜㌝㌞㌟㌠㌡㌢㌣㌤㌥㌦㌧㌨㌩㌪㌫㌬㌭㌮㌯㌰㌱㌲㌳㌴㌵㌶㌷㌸㌹㌺㌻㌼㌽㌾㌿㍀㍁㍂㍃㍄㍅㍆㍇㍈㍉㍊㍋㍌㍍㍎㍏㍐㍑㍒㍓㍔㍕㍖㍗ | ||
Warming up -------------------------------------- | ||
UNF 6.023k i/100ms | ||
Unicode 1.238k i/100ms | ||
Ruby 1.068k i/100ms | ||
UnicodeUtils 319.000 i/100ms | ||
ActiveSupport 258.000 i/100ms | ||
Calculating ------------------------------------- | ||
UNF 59.891k (± 6.8%) i/s - 301.150k in 5.055411s | ||
Unicode 11.740k (± 9.0%) i/s - 59.424k in 5.103353s | ||
Ruby 10.655k (±10.9%) i/s - 53.400k in 5.091860s | ||
UnicodeUtils 3.087k (± 8.9%) i/s - 15.312k in 5.004688s | ||
ActiveSupport 2.533k (±11.1%) i/s - 12.642k in 5.064477s | ||
Comparison: | ||
UNF: 59890.8 i/s | ||
Unicode: 11740.2 i/s - 5.10x slower | ||
Ruby: 10655.0 i/s - 5.62x slower | ||
UnicodeUtils: 3087.4 i/s - 19.40x slower | ||
ActiveSupport: 2532.6 i/s - 23.65x slower | ||
Warming up -------------------------------------- | ||
UNF 5.739k i/100ms | ||
Unicode 1.122k i/100ms | ||
Ruby 1.113k i/100ms | ||
UnicodeUtils 312.000 i/100ms | ||
ActiveSupport 254.000 i/100ms | ||
Calculating ------------------------------------- | ||
UNF 59.371k (± 4.4%) i/s - 1.779M in 30.026571s | ||
Unicode 10.780k (±17.3%) i/s - 310.794k in 30.106556s | ||
Ruby 11.144k (± 6.7%) i/s - 332.787k in 30.034689s | ||
UnicodeUtils 3.164k (± 4.9%) i/s - 94.848k in 30.056928s | ||
ActiveSupport 2.635k (± 8.8%) i/s - 78.486k in 30.075836s | ||
Comparison: | ||
UNF: 59371.2 i/s | ||
Ruby: 11143.9 i/s - 5.33x slower | ||
Unicode: 10779.6 i/s - 5.51x slower | ||
UnicodeUtils: 3163.5 i/s - 18.77x slower | ||
ActiveSupport: 2635.3 i/s - 22.53x slower | ||
``` | ||
|
||
Benchmark code can be found at [bin/benchmark](bin/benchmark). | ||
|
||
## Installation | ||
|
||
Add this line to your application's Gemfile: | ||
|
||
```ruby | ||
gem "active_normalizer" | ||
``` | ||
|
||
And then execute: | ||
|
||
$ bundle | ||
|
||
Or install it yourself as: | ||
|
||
$ gem install active_normalizer | ||
|
||
## Dependencies | ||
|
||
Active Normalizer provides a handful of normalizers. Only the Ruby normalizer works out of the box, because it relies solely on the standard library; for every other normalizer, you must add that normalizer's gem dependency to your own Gemfile. | ||
|
||
#### ActiveNormalizer::Normalizers::Ruby | ||
|
||
```ruby | ||
# no dependency required, standard library | ||
|
||
require "active_normalizer/normalizers/ruby" | ||
``` | ||
|
||
#### ActiveNormalizer::Normalizers::UNF - unf | ||
|
||
```ruby | ||
gem "unf" | ||
|
||
require "active_normalizer/normalizers/unf" | ||
``` | ||
|
||
#### ActiveNormalizer::Normalizers::Unicode - unicode | ||
|
||
```ruby | ||
gem "unicode" | ||
|
||
require "active_normalizer/normalizers/unicode" | ||
``` | ||
|
||
#### ActiveNormalizer::Normalizers::UnicodeUtils - unicode_utils | ||
|
||
```ruby | ||
gem "unicode_utils" | ||
|
||
require "active_normalizer/normalizers/unicode_utils" | ||
``` | ||
|
||
#### ActiveNormalizer::Normalizers::ActiveSupportMultibyte - activesupport | ||
|
||
```ruby | ||
gem "activesupport" | ||
|
||
require "active_normalizer/normalizers/active_support" | ||
``` | ||
|
||
## Development | ||
|
||
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/hack` for an interactive prompt that will allow you to experiment. | ||
|
||
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). | ||
|
||
## Contributing | ||
|
||
Bug reports and pull requests are welcome on GitHub at https://github.com/JuanitoFatas/active_normalizer. | ||
|
||
## License | ||
|
||
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT). | ||
|
||
[unicode-nf]: http://unicode.org/reports/tr15/#Norm_Forms |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
require "bundler/gem_tasks" | ||
require "rspec/core/rake_task" | ||
|
||
RSpec::Core::RakeTask.new(:spec) | ||
|
||
task default: :spec |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
#!/usr/bin/env ruby | ||
|
||
require "bundler/setup" | ||
require "active_normalizer" | ||
|
||
require "active_normalizer/normalizers/unf" | ||
require "active_normalizer/normalizers/unicode" | ||
require "active_normalizer/normalizers/ruby" | ||
require "active_normalizer/normalizers/unicode_utils" | ||
require "active_normalizer/normalizers/active_support" | ||
|
||
require "benchmark/ips" | ||
|
||
# Suite object handed to benchmark-ips through `x.config(suite: ...)`.
#
# It forces a full garbage collection and then switches the collector off
# whenever the `warming` or `running` hook fires, so that GC pauses do not
# land inside the timed work that follows. `warmup_stats` and `add_report`
# are intentionally empty.
#
# NOTE(review): the hook names and arities are presumably those of the
# benchmark-ips suite protocol -- confirm against the gem's documentation.
class GCSuite
  # Hook: collect garbage, then leave GC disabled.
  def warming(*)
    run_gc
  end

  # Hook: collect garbage, then leave GC disabled.
  def running(*)
    run_gc
  end

  # Hook: nothing to do.
  def warmup_stats(*); end

  # Hook: nothing to do.
  def add_report(*); end

  private

  # Re-enables the collector just long enough to run one full collection,
  # then disables it again. GC therefore stays off after this returns.
  def run_gc
    GC.enable
    GC.start
    GC.disable
  end
end
|
||
# Benchmarks every normalizer against +input+ using the :nfkc form.
#
# Two passes are run back to back with the same set of reports: a short one
# (5s measurement, 2s warmup) and a long one (30s measurement, 10s warmup).
# Each pass prints its own comparison via `compare!`.
#
# input - the String passed to each normalizer's `run`.
# suite - the suite object forwarded to benchmark-ips via `config(suite:)`.
#
# Returns whatever the second `Benchmark.ips` call returns.
def benchmark(input, suite)
  run_benchmark_pass(input, suite, time: 5, warmup: 2)
  run_benchmark_pass(input, suite, time: 30, warmup: 10)
end

# Runs a single benchmark-ips pass over all normalizers with the given timing.
def run_benchmark_pass(input, suite, time:, warmup:)
  # NOTE(review): the README calls the last class
  # ActiveNormalizer::Normalizers::ActiveSupportMultibyte -- confirm which
  # constant the library actually defines.
  normalizers = {
    "UNF" => ActiveNormalizer::Normalizers::UNF,
    "Unicode" => ActiveNormalizer::Normalizers::Unicode,
    "Ruby" => ActiveNormalizer::Normalizers::Ruby,
    "UnicodeUtils" => ActiveNormalizer::Normalizers::UnicodeUtils,
    "ActiveSupport" => ActiveNormalizer::Normalizers::ActiveSupport
  }

  Benchmark.ips do |x|
    x.config(time: time, warmup: warmup)
    x.config(suite: suite)

    normalizers.each do |label, normalizer|
      # A fresh normalizer is built on every iteration, so construction cost
      # is part of what gets measured.
      x.report(label) { normalizer.new(:nfkc).run(input) }
    end

    x.compare!
  end
end
|
||
suite = GCSuite.new

# Each sample is defined once so the string announced on stdout is always the
# exact string that gets benchmarked.
simple_input = "800ー12345"
longer_input = "㍻㍼㍽㍾㌀㌁㌂㌃㌄㌅㌆㌇㌈㌉㌊㌋㌌㌍㌎㌏㌐㌑㌒㌓㌔㌕㌖㌗㌘㌙㌚㌛㌜㌝㌞㌟㌠㌡㌢㌣㌤㌥㌦㌧㌨㌩㌪㌫㌬㌭㌮㌯㌰㌱㌲㌳㌴㌵㌶㌷㌸㌹㌺㌻㌼㌽㌾㌿㍀㍁㍂㍃㍄㍅㍆㍇㍈㍉㍊㍋㍌㍍㍎㍏㍐㍑㍒㍓㍔㍕㍖㍗"

puts "Benchmarking simple string: #{simple_input}"
benchmark(simple_input, suite)

puts "\nBenchmarking longer string: #{longer_input}"
benchmark(longer_input, suite)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#!/usr/bin/env ruby | ||
|
||
require "bundler/setup" | ||
require "active_normalizer" | ||
|
||
# You can add fixtures and/or initialization code here to make experimenting | ||
# with your gem easier. You can also use a different console, if you like. | ||
|
||
require "pry" | ||
Pry.start(__FILE__) |
Oops, something went wrong.