forked from twitter/twitter-cldr-rb
-
Notifications
You must be signed in to change notification settings - Fork 1
/
nfd.rb
133 lines (103 loc) · 4.04 KB
/
nfd.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# encoding: UTF-8
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0
module TwitterCldr
module Normalizers
class NFD < Base
# Constants for the arithmetic decomposition of precomposed Hangul syllables
# (Unicode Standard, Section 3.12). Names follow the specification.
# The hash is frozen so the shared table cannot be modified at runtime.
HANGUL_CONSTANTS = {
  :SBase  => 0xAC00, # first precomposed Hangul syllable
  :LBase  => 0x1100, # first leading consonant jamo
  :VBase  => 0x1161, # first vowel jamo
  :TBase  => 0x11A7, # one before the first trailing consonant jamo
  :LCount => 19,
  :VCount => 21,
  :TCount => 28,
  :NCount => 588,    # VCount * TCount
  :SCount => 11172,  # LCount * NCount
  :Scount => 11172   # legacy misspelling of :SCount, kept for existing callers
}.freeze
class << self
# Normalizes a string: splits it into characters, converts each one to a
# code point, runs normalize_code_points over the list and joins the
# resulting characters back into a single string.
#
# string - the String to normalize.
#
# Returns the normalized String.
# NOTE(review): char_to_code_point / code_point_to_char are not defined in
# this file (presumably inherited from Base) - confirm their contracts there.
def normalize(string)
  original_code_points = string.split('').map { |character| char_to_code_point(character) }

  normalize_code_points(original_code_points).
    map { |normalized| code_point_to_char(normalized) }.
    join
end
# Normalizes a list of code points: every code point is decomposed, the
# per-code-point results are flattened into one list, and that list is
# then passed through reorder.
#
# code_points - Array of code points in the form accepted by decompose.
#
# Returns the Array produced by reorder.
def normalize_code_points(code_points)
  fully_decomposed = code_points.map { |original| decompose(original) }.flatten
  reorder(fully_decomposed)
end
# Expands a code point using its Decomposition_Mapping property, recursing
# into every code point of the mapping.
#
# code_point - the code point to look up via UnicodeData.for_code_point
#              (a hex string, judging by the Hangul path which calls #hex on it).
#
# Returns the code point itself (not wrapped in an array) when no Unicode
# data is found, when the mapping is empty, or when the mapping starts with
# a <tag> (compatibility mapping). Returns an Array of code points otherwise.
def decompose(code_point)
  data = TwitterCldr::Shared::UnicodeData.for_code_point(code_point)
  return code_point unless data

  mapping = data.decomposition.split

  # Entries whose name contains "Hangul" are decomposed arithmetically.
  return decompose_hangul(code_point) if data.name.include?('Hangul')

  # A leading <tag> marks a compatibility mapping, which is not applied here;
  # an empty mapping means there is nothing to expand.
  return code_point if mapping.first =~ /<.*>/ || mapping.empty?

  mapping.map { |mapped_code_point| decompose(mapped_code_point) }.flatten
end
private
# Arithmetic decomposition of a precomposed Hangul syllable into its jamo.
# Documented in Section 3.12 at http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
#
# code_point - hex string of the syllable (converted with #hex).
#
# Returns an Array of upper-case hex strings: leading consonant, vowel and,
# only when the trailing index is greater than zero, trailing consonant.
def decompose_hangul(code_point)
  l_base, v_base, t_base, n_count, t_count =
    HANGUL_CONSTANTS.values_at(:LBase, :VBase, :TBase, :NCount, :TCount)

  syllable_index = code_point.hex - HANGUL_CONSTANTS[:SBase]
  leading_index, index_in_block = syllable_index.divmod(n_count)
  vowel_index = index_in_block / t_count
  trailing_index = syllable_index % t_count

  jamo = [l_base + leading_index, v_base + vowel_index]
  # A trailing index of zero means the syllable has no final consonant.
  jamo << (t_base + trailing_index) if trailing_index > 0

  jamo.map { |value| value.to_s(16).upcase }
end
# Puts code points into canonical order: any two adjacent code points A & B
# are swapped if ccc(A) > ccc(B) > 0.
#
# Code points with combining class 0 keep their position. Each run of
# consecutive code points with a non-zero combining class between them is
# sorted by combining class with stable_sort.
#
# code_points - Array of code points accepted by combining_class_for.
#
# Returns a new Array of the same code points in canonical order.
def reorder(code_points)
  reordered = []
  pending_marks = [] # [code_point, combining_class] pairs of the current run

  code_points.each do |code_point|
    combining_class = combining_class_for(code_point)

    if combining_class == 0
      # A class-0 code point ends the current run; emit the run sorted, then this one.
      unless pending_marks.empty?
        reordered.concat(stable_sort(pending_marks).map { |mark| mark.first })
        pending_marks = []
      end
      reordered << code_point
    else
      pending_marks << [code_point, combining_class]
    end
  end

  # Emit a run that reaches the end of the input.
  unless pending_marks.empty?
    reordered.concat(stable_sort(pending_marks).map { |mark| mark.first })
  end

  reordered
end
# Sorts [code_point, combining_class] pairs by ascending combining class,
# keeping pairs with equal combining classes in their original relative order.
#
# sort_by is not guaranteed to be stable on its own, so the original index is
# used as a tie-breaker. This replaces a hand-written bubble sort, which was
# quadratic on long runs of combining marks.
#
# code_points_with_cc - Array of two-element Arrays [code_point, combining_class].
#
# Returns the same Array object, sorted in place.
def stable_sort(code_points_with_cc)
  indexed = code_points_with_cc.each_with_index.sort_by { |pair, index| [pair[1], index] }
  code_points_with_cc.replace(indexed.map { |pair, _index| pair })
end
# Looks up the canonical combining class of a code point.
#
# code_point - the code point to look up via UnicodeData.for_code_point.
#
# Returns the combining class as an Integer, or 0 when no Unicode data is
# found for the code point. The missing-data case is checked explicitly
# instead of rescuing NoMethodError, so a NoMethodError raised for any other
# reason is no longer silently turned into 0.
def combining_class_for(code_point)
  unicode_data = TwitterCldr::Shared::UnicodeData.for_code_point(code_point)
  unicode_data ? unicode_data.combining_class.to_i : 0
end
end
end
end
end