Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Optimize Canonical Ordering Algorithm.

Canonical Ordering Algorithm requires stable sorting of every sequence
of combining characters (characters with combining class > 0) inside the
string. Performing stable sorting (bubble sort) only on those sequences
instead of applying sorting to the whole string reduces the running time
of the normalization test (on the full suite from NormalizationTest.txt)
by half (on my machine it dropped from ~40 seconds to ~20 seconds). I
think the performance boost might be even more noticeable on longer
strings.
  • Loading branch information...
commit ea3f3eb3fe22e5fb85919cb25a260c6fb662ea02 1 parent 2c6ffe7
Kirill Lashuk authored May 11, 2012
45  lib/twitter_cldr/normalizers/canonical/nfd.rb
@@ -35,7 +35,6 @@ def normalize(string)
35 35
         def normalize_code_points(code_points)
36 36
           code_points = code_points.map { |code_point| decompose code_point }.flatten
37 37
           reorder(code_points)
38  
-          code_points
39 38
         end
40 39
 
41 40
         # Recursively replace the given code point with the values in its Decomposition_Mapping property.
@@ -79,16 +78,46 @@ def decompose_hangul(code_point)
79 78
 
80 79
         # Swap any two adjacent code points A & B if ccc(A) > ccc(B) > 0.
81 80
         def reorder(code_points)
82  
-          code_points.size.times do
83  
-            code_points.each_with_index do |cp, i|
84  
-              unless i == (code_points.size - 1)
85  
-                ccc_a, ccc_b = combining_class_for(cp), combining_class_for(code_points[i + 1])
86  
-                if (ccc_a > ccc_b) && (ccc_b > 0)
87  
-                  code_points[i], code_points[i + 1] = code_points[i + 1], code_points[i]
88  
-                end
  81
+          code_points_with_cc = code_points.map { |cp| [cp, combining_class_for(cp)] }
  82
+
  83
+          result = []
  84
+          accum  = []
  85
+
  86
+          code_points_with_cc.each do |cp_with_cc|
  87
+            if cp_with_cc[1] == 0
  88
+              unless accum.empty?
  89
+                result.concat(stable_sort(accum))
  90
+                accum = []
  91
+              end
  92
+              result << cp_with_cc
  93
+            else
  94
+              accum << cp_with_cc
  95
+            end
  96
+          end
  97
+
  98
+          result.concat(stable_sort(accum)) unless accum.empty?
  99
+
  100
+          result.map { |cp_with_cc| cp_with_cc[0] }
  101
+        end
  102
+
  103
+        def stable_sort(code_points_with_cc)
  104
+          n = code_points_with_cc.size - 2
  105
+
  106
+          code_points_with_cc.size.times do
  107
+            swapped = false
  108
+
  109
+            (0..n).each do |j|
  110
+              if code_points_with_cc[j][1] > code_points_with_cc[j + 1][1]
  111
+                code_points_with_cc[j], code_points_with_cc[j + 1] = code_points_with_cc[j + 1], code_points_with_cc[j]
  112
+                swapped = true
89 113
               end
90 114
             end
  115
+
  116
+            break unless swapped
  117
+            n -= 1
91 118
           end
  119
+
  120
+          code_points_with_cc
92 121
         end
93 122
 
94 123
         def combining_class_for(code_point)

0 notes on commit ea3f3eb

Please sign in to comment.
Something went wrong with that request. Please try again.