-
-
Notifications
You must be signed in to change notification settings - Fork 60
/
ScriptUtils.kt
187 lines (182 loc) · 8.46 KB
/
ScriptUtils.kt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
/*
* Copyright (C) 2012 The Android Open Source Project
* modified
* SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only
*/
package helium314.keyboard.latin.utils
import java.util.Locale
/**
 * Helpers for handling different writing scripts.
 *
 * Scripts are identified by their four-letter ISO 15924 codes, see the `SCRIPT_*` constants.
 */
object ScriptUtils {
    // Unicode scripts (ISO 15924), incomplete
    const val SCRIPT_UNKNOWN = "" // Used for hardware keyboards
    const val SCRIPT_ARABIC = "Arab"
    const val SCRIPT_ARMENIAN = "Armn"
    const val SCRIPT_BENGALI = "Beng"
    const val SCRIPT_CYRILLIC = "Cyrl"
    const val SCRIPT_DEVANAGARI = "Deva"
    const val SCRIPT_GEORGIAN = "Geor"
    const val SCRIPT_GREEK = "Grek"
    const val SCRIPT_HEBREW = "Hebr"
    const val SCRIPT_KANNADA = "Knda"
    const val SCRIPT_KHMER = "Khmr"
    const val SCRIPT_LAO = "Laoo"
    const val SCRIPT_LATIN = "Latn"
    const val SCRIPT_MALAYALAM = "Mlym"
    const val SCRIPT_MYANMAR = "Mymr"
    const val SCRIPT_SINHALA = "Sinh"
    const val SCRIPT_TAMIL = "Taml"
    const val SCRIPT_TELUGU = "Telu"
    const val SCRIPT_THAI = "Thai"
    const val SCRIPT_HANGUL = "Hang"
    const val SCRIPT_GUJARATI = "Gujr"

    /**
     * Returns whether the script of [locale] distinguishes upper and lower case.
     *
     * The script is resolved through [Locale.script], so a locale without an explicit script
     * is classified by its language-based fallback script.
     */
    @JvmStatic
    fun scriptSupportsUppercase(locale: Locale): Boolean =
        // only Latin, Cyrillic, Greek and Armenian have upper/lower case
        // https://unicode.org/faq/casemap_charprop.html#3
        when (locale.script()) {
            SCRIPT_LATIN, SCRIPT_CYRILLIC, SCRIPT_GREEK, SCRIPT_ARMENIAN -> true
            else -> false
        }

    /**
     * Returns whether [codePoint] lies in the code point ranges this object associates with
     * [script].
     *
     * For most scripts this is a plain range check against the relevant Unicode blocks, so any
     * code point inside those ranges is accepted. Only [SCRIPT_LATIN] and [SCRIPT_CYRILLIC]
     * additionally require [Character.isLetter]. [SCRIPT_UNKNOWN] accepts every code point.
     *
     * @param codePoint the Unicode code point to test
     * @param script one of the `SCRIPT_*` constants of this object
     * @return `true` if [codePoint] is considered part of [script]
     * @throws RuntimeException if [script] is not one of the `SCRIPT_*` constants; note that
     * [Locale.script] returns an explicitly set locale script unchanged, which may be such a value
     */
    @JvmStatic
    fun isLetterPartOfScript(codePoint: Int, script: String): Boolean {
        return when (script) {
            SCRIPT_ARABIC ->
                // Arabic letters can be in any of the following blocks:
                // Arabic U+0600..U+06FF
                // Arabic Supplement, Thaana U+0750..U+077F, U+0780..U+07BF
                // Arabic Extended-A U+08A0..U+08FF
                // Arabic Presentation Forms-A U+FB50..U+FDFF
                // Arabic Presentation Forms-B U+FE70..U+FEFF
                codePoint in 0x600..0x6FF
                    || codePoint in 0x750..0x7BF
                    || codePoint in 0x8A0..0x8FF
                    || codePoint in 0xFB50..0xFDFF
                    || codePoint in 0xFE70..0xFEFF
            SCRIPT_ARMENIAN ->
                // Armenian letters are in the Armenian unicode block, U+0530..U+058F and
                // Alphabetic Presentation Forms block, U+FB00..U+FB4F, but only in the Armenian part
                // of that block, which is U+FB13..U+FB17.
                codePoint in 0x530..0x58F || codePoint in 0xFB13..0xFB17
            SCRIPT_BENGALI ->
                // Bengali unicode block is U+0980..U+09FF
                codePoint in 0x980..0x9FF
            SCRIPT_CYRILLIC ->
                // All Cyrillic characters are in the 400~52F block. There are some in the upper
                // Unicode range, but they are archaic characters that are not used in modern
                // Russian and are not used by our dictionary.
                codePoint in 0x400..0x52F && Character.isLetter(codePoint)
            SCRIPT_DEVANAGARI ->
                // Devanagari unicode block is U+0900..U+097F
                codePoint in 0x900..0x97F
            SCRIPT_GEORGIAN ->
                // Georgian letters are in the Georgian unicode block, U+10A0..U+10FF,
                // or Georgian supplement block, U+2D00..U+2D2F
                codePoint in 0x10A0..0x10FF || codePoint in 0x2D00..0x2D2F
            SCRIPT_GREEK ->
                // Greek letters are either in the 370~3FF range (Greek & Coptic), or in the
                // 1F00~1FFF range (Greek extended). Our dictionary contains both sort of characters.
                // Our dictionary also contains a few words with 0xF2; it would be best to check
                // if that's correct, but a web search does return results for these words so
                // they are probably okay.
                codePoint in 0x370..0x3FF || codePoint in 0x1F00..0x1FFF || codePoint == 0xF2
            SCRIPT_HEBREW ->
                // Hebrew letters are in the Hebrew unicode block, which spans from U+0590 to U+05FF,
                // or in the Alphabetic Presentation Forms block, U+FB00..U+FB4F, but only in the
                // Hebrew part of that block, which is U+FB1D..U+FB4F.
                codePoint in 0x590..0x5FF || codePoint in 0xFB1D..0xFB4F
            SCRIPT_KANNADA ->
                // Kannada unicode block is U+0C80..U+0CFF
                codePoint in 0xC80..0xCFF
            SCRIPT_KHMER ->
                // Khmer letters are in unicode block U+1780..U+17FF, and the Khmer symbols block
                // is U+19E0..U+19FF
                codePoint in 0x1780..0x17FF || codePoint in 0x19E0..0x19FF
            SCRIPT_LAO ->
                // The Lao block is U+0E80..U+0EFF
                codePoint in 0xE80..0xEFF
            SCRIPT_LATIN ->
                // Our supported latin script dictionaries (EFIGS) at the moment only include
                // characters in the C0, C1, Latin Extended A and B, IPA extensions unicode
                // blocks. As it happens, those are back-to-back in the code range 0x40 to 0x2AF,
                // so the below is a very efficient way to test for it. As for the 0-0x3F, it's
                // excluded from isLetter anyway.
                codePoint <= 0x2AF && Character.isLetter(codePoint)
            SCRIPT_MALAYALAM ->
                // Malayalam unicode block is U+0D00..U+0D7F
                codePoint in 0xD00..0xD7F
            SCRIPT_MYANMAR ->
                // Myanmar has three unicode blocks :
                // Myanmar U+1000..U+109F
                // Myanmar extended-A U+AA60..U+AA7F
                // Myanmar extended-B U+A9E0..U+A9FF
                codePoint in 0x1000..0x109F || codePoint in 0xAA60..0xAA7F || codePoint in 0xA9E0..0xA9FF
            SCRIPT_SINHALA ->
                // Sinhala unicode block is U+0D80..U+0DFF
                codePoint in 0xD80..0xDFF
            SCRIPT_TAMIL ->
                // Tamil unicode block is U+0B80..U+0BFF
                codePoint in 0xB80..0xBFF
            SCRIPT_TELUGU ->
                // Telugu unicode block is U+0C00..U+0C7F
                codePoint in 0xC00..0xC7F
            SCRIPT_THAI ->
                // Thai unicode block is U+0E00..U+0E7F
                codePoint in 0xE00..0xE7F
            SCRIPT_HANGUL ->
                // Ranges checked: U+AC00..U+D7A3, U+3131..U+318E, U+1100..U+11FF,
                // U+A960..U+A97C, U+D7B0..U+D7C6 and U+D7CB..U+D7FB
                codePoint in 0xAC00..0xD7A3
                    || codePoint in 0x3131..0x318E
                    || codePoint in 0x1100..0x11FF
                    || codePoint in 0xA960..0xA97C
                    || codePoint in 0xD7B0..0xD7C6
                    || codePoint in 0xD7CB..0xD7FB
            SCRIPT_GUJARATI ->
                // Gujarati unicode block is U+0A80..U+0AFF
                codePoint in 0xA80..0xAFF
            SCRIPT_UNKNOWN -> true
            else -> throw RuntimeException("Unknown value of script: $script")
        }
    }

    /**
     * Returns the script of this locale, falling back to a default script for the language.
     *
     * Resolution order:
     * - a script set explicitly on the locale is returned unchanged, even if it is not one of
     *   the `SCRIPT_*` constants
     * - a locale whose country is "ZZ" (compared case-insensitively) is logged as an old locale
     *   and treated as [SCRIPT_LATIN]
     * - otherwise the language code decides; every language not listed maps to [SCRIPT_LATIN]
     */
    @JvmStatic
    fun Locale.script(): String {
        if (script.isNotEmpty()) return script
        if (country.equals("ZZ", ignoreCase = true)) {
            Log.w("ScriptUtils", "old _ZZ locale found: $this")
            return SCRIPT_LATIN
        }
        return when (language) {
            "ar", "ur", "fa" -> SCRIPT_ARABIC
            "hy" -> SCRIPT_ARMENIAN
            "bn" -> SCRIPT_BENGALI
            "sr", "mk", "ru", "uk", "mn", "be", "kk", "ky", "bg", "xdq", "cv", "mhr", "mns" -> SCRIPT_CYRILLIC
            "ka" -> SCRIPT_GEORGIAN
            "el" -> SCRIPT_GREEK
            // java.util.Locale reports Hebrew as the legacy code "iw" or as "he", depending on
            // the runtime, so both have to be accepted
            "iw", "he" -> SCRIPT_HEBREW
            "km" -> SCRIPT_KHMER
            "lo" -> SCRIPT_LAO
            "ml" -> SCRIPT_MALAYALAM
            "my" -> SCRIPT_MYANMAR
            "si" -> SCRIPT_SINHALA
            "ta" -> SCRIPT_TAMIL
            "te" -> SCRIPT_TELUGU
            "th" -> SCRIPT_THAI
            "ko" -> SCRIPT_HANGUL
            "hi", "mr", "ne" -> SCRIPT_DEVANAGARI
            "kn" -> SCRIPT_KANNADA
            "gu" -> SCRIPT_GUJARATI
            else -> SCRIPT_LATIN // use as fallback
        }
    }
}