fix #2: add charwidth function

stevengj committed Mar 8, 2015
1 parent 50381b9 commit 63ba255
Showing 8 changed files with 10,584 additions and 10,087 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -9,6 +9,8 @@
*.dylib
*.dSYM
*.txt
*.ttf
*.sfd
*.out
bench/bench
bench/icu
5 changes: 5 additions & 0 deletions .travis.yml
@@ -4,6 +4,11 @@ compiler:
- clang
notifications:
email: false
before_install:
- sudo add-apt-repository ppa:staticfloat/julia-deps -y
- sudo add-apt-repository ppa:staticfloat/juliareleases -y
- sudo apt-get update -qq -y
- sudo apt-get install libpcre3-dev julia -y
script:
- make prefix=`pwd`/local install
- make check
12 changes: 8 additions & 4 deletions Makefile
@@ -7,6 +7,7 @@ PERL=perl
MAKE=make
AR=ar
INSTALL=install
JULIA=julia

# compiler settings
cflags = -O2 -std=c99 -pedantic -Wall -fpic -DUTF8PROC_EXPORTS $(CFLAGS)
@@ -41,16 +42,19 @@ all: c-library
c-library: libutf8proc.a libutf8proc.$(SHLIB_EXT)

clean:
rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_VERS_EXT) libutf8proc.$(SHLIB_EXT) test/normtest test/graphemetest data/UnicodeData.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt data/NormalizationTest.txt data/GraphemeBreakTest.txt
rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_VERS_EXT) libutf8proc.$(SHLIB_EXT) test/normtest test/graphemetest data/UnicodeData.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt data/NormalizationTest.txt data/GraphemeBreakTest.txt data/CharWidths.txt data/unifont*.ttf data/unifont*.sfd
$(MAKE) -C bench clean

update: utf8proc_data.c.new
cp -f utf8proc_data.c.new utf8proc_data.c

# real targets

utf8proc_data.c.new: data/data_generator.rb data/UnicodeData.txt data/GraphemeBreakProperty.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt
(cd data; $(RUBY) data_generator.rb < UnicodeData.txt) > utf8proc_data.c.new
utf8proc_data.c.new: data/data_generator.rb data/UnicodeData.txt data/GraphemeBreakProperty.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt data/CharWidths.txt
(cd data; $(RUBY) data_generator.rb < UnicodeData.txt) > $@

data/CharWidths.txt: data/charwidths.jl libutf8proc.$(SHLIB_EXT)
(cd data; $(JULIA) charwidths.jl) > $@

data/UnicodeData.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
@@ -85,7 +89,7 @@ libutf8proc.$(MAJOR).dylib: utf8proc.o
$(cc) -dynamiclib -o $@ $^ -install_name $(libdir)/$@ -Wl,-compatibility_version -Wl,$(MAJOR) -Wl,-current_version -Wl,$(MAJOR).$(MINOR).$(PATCH)

libutf8proc.dylib: libutf8proc.$(MAJOR).dylib
ln -s libutf8proc.$(MAJOR).dylib $@
ln -f -s libutf8proc.$(MAJOR).dylib $@

install: libutf8proc.a libutf8proc.$(SHLIB_EXT) libutf8proc.$(SHLIB_VERS_EXT)
mkdir -m 755 -p $(includedir)
151 changes: 151 additions & 0 deletions data/charwidths.jl
@@ -0,0 +1,151 @@
# Following work by @jiahao, we compute character widths using a combination of
# * advance widths from GNU Unifont (advance width 512 = 1 en)
# * UAX 11: East Asian Width
# * a few exceptions as needed
# Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734
#
# Requires Julia (obviously) and FontForge.

#############################################################################
# Widths from GNU Unifont

universion="7.0.06"
for fontfile in ["unifont-$universion", "unifont_upper-$universion"]
isfile("$fontfile.ttf") || download("http://unifoundry.com/pub/unifont-$universion/font-builds/$fontfile.ttf", "$fontfile.ttf")
isfile("$fontfile.sfd") || run(`fontforge -lang=ff -c "Open(\"$fontfile.ttf\");Save(\"$fontfile.sfd\");Quit(0);"`)
end

#Read sfdfile for character widths
function parsesfd(filename::String, CharWidths::Dict{Int,Int}=Dict{Int,Int}())
state=:seekchar
lineno = 0
for line in readlines(open(filename))
lineno += 1
if state==:seekchar #StartChar: nonmarkingreturn
if contains(line, "StartChar: ")
codepoint = nothing
width = nothing
state = :readdata
end
elseif state==:readdata #Encoding: 65538 -1 2, Width: 1024
contains(line, "Encoding:") && (codepoint = int(split(line)[3]))
contains(line, "Width:") && (width = int(split(line)[2]))
if codepoint!=nothing && width!=nothing && codepoint >= 0
CharWidths[codepoint]=width
state = :seekchar
end
end
end
CharWidths
end
CharWidths=parsesfd("unifont-$universion.sfd")
CharWidths=parsesfd("unifont_upper-$universion.sfd", CharWidths)

# convert from advance width (512 units to the en) to character width
for (c,v) in CharWidths
CharWidths[c] = div(v, 512)
end

#############################################################################
# Widths from UAX #11: East Asian Width

isfile("EastAsianWidth.txt") || download("http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt", "EastAsianWidth.txt")
for line in readlines(open("EastAsianWidth.txt"))
#Strip comments
line[1] == '#' && continue
precomment = split(line, '#')[1]
#Parse code point range and width code
tokens = split(precomment, ';')
length(tokens) >= 2 || continue
charrange = tokens[1]
width = strip(tokens[2])
#Parse code point range into Julia UnitRange
rangetokens = split(charrange, "..")
charstart = uint32("0x"*rangetokens[1])
charend = uint32("0x"*rangetokens[length(rangetokens)>1 ? 2 : 1])

#Assign widths
for c in charstart:charend
width=="N" && continue #Ignore neutral characters
CharWidths[c]=(width=="W" || width=="F") ? 2 : #Wide or full
(width=="Na"|| width=="H" || width=="A") ? 1 : #Narrow or half or ambiguous (default to narrow in non-East-Asian contexts, which we can assume to be the default)
error("Unknown East Asian width code: $width for code point: $c")
end
end

#############################################################################
# A few exceptions to the above cases, found by manual comparison
# to other wcwidth functions.

# Use ../libutf8proc for category codes, rather than the one in Julia,
# to minimize bootstrapping complexity when a new version of Unicode comes out.
function catcode(c)
uint(c) > 0x10FFFF && return 0x0000 # see utf8proc_get_property docs
return unsafe_load(ccall((:utf8proc_get_property,"../libutf8proc"), Ptr{UInt16}, (Int32,), c))
end


# use Base.UTF8proc module to get the category code constants, since
# we aren't going to change these in utf8proc.
import Base.UTF8proc

# make sure format control characters (category Cf) have width 0,
# except for the Arabic characters 0x06xx (see Unicode std 6.2, sec. 8.2)
for c in keys(CharWidths)
if catcode(c)==UTF8proc.UTF8PROC_CATEGORY_CF &&
c ∉ [0x0601,0x0602,0x0603,0x06dd]
CharWidths[c]=0
end
end

#By definition, should have zero width (on the same line)
#0x002028 category: Zl name: LINE SEPARATOR/
#0x002029 category: Zp name: PARAGRAPH SEPARATOR/
CharWidths[0x2028]=0
CharWidths[0x2029]=0

#By definition, should be narrow = width of 1 en space
#0x00202f ' ' category: Zs name: NARROW NO-BREAK SPACE/
CharWidths[0x202f]=1

#By definition, should be wide = width of 1 em space
#0x002001 ' ' category: Zs name: EM QUAD/
#0x002003 ' ' category: Zs name: EM SPACE/
CharWidths[0x2001]=2
CharWidths[0x2003]=2

#############################################################################
# Non-printable control characters will be assigned a width of zero
# (wcwidth returns -1 for such characters)

isprintable(c::Union(Char,Integer)) = c <= 0x10ffff && is_valid_char(c) && isprintable_category(catcode(c))
isprintable_category(category) =
!( category==UTF8proc.UTF8PROC_CATEGORY_CN # Unassigned
|| category==UTF8proc.UTF8PROC_CATEGORY_CS # Surrogate
|| category==UTF8proc.UTF8PROC_CATEGORY_CC # Control
|| category==0 # Invalid
)

# Question: should we just use Julia's isprint algorithm here? It is different,
# though it is also based on the character category.

#############################################################################
# Output (to a file or pipe) for processing by data_generator.rb
# ... don't bother to output zero widths since that will be the default.

firstc = 0x000000
lastv = 0
uhex(c) = uppercase(hex(c,4))
for c in 0x0000:0x110000
v = isprintable(c) ? get(CharWidths, c, 0) : 0
if v != lastv || c == 0x110000
v < 4 || error("invalid charwidth $v for $c")
if firstc+1 < c
println(uhex(firstc), "..", uhex(c-1), "; ", lastv)
else
println(uhex(firstc), "; ", lastv)
end
firstc = c
lastv = v
end
end
18 changes: 14 additions & 4 deletions data/data_generator.rb
@@ -85,14 +85,23 @@
end
end

$charwidth_list = File.read("CharWidths.txt")
$charwidth = Hash.new(0)
$charwidth_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([0-9]+)/
$1.hex.upto($2.hex) { |e2| $charwidth[e2] = $3.to_i }
elsif entry =~ /^([0-9A-F]+)\s*;\s*([0-9]+)/
$charwidth[$1.hex] = $2.to_i
end
end

$exclusions = File.read("CompositionExclusions.txt")[/# \(1\) Script Specifics.*?# Total code points:/m]
$exclusions = $exclusions.chomp.split("\n").collect { |e| e.hex }

$excl_version = File.read("CompositionExclusions.txt")[/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m]
$excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex }

$case_folding_string = File.open("CaseFolding.txt").read

$case_folding_string = File.open("CaseFolding.txt", :encoding => 'utf-8').read
$case_folding = {}
$case_folding_string.chomp.split("\n").each do |line|
next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i
@@ -172,7 +181,8 @@ def c_entry(comb1_indicies, comb2_indicies)
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
"#{$ignorable.include?(code)}, " <<
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
"#{$grapheme_boundclass[code]}},\n"
"#{$grapheme_boundclass[code]}, " <<
"#{$charwidth[code]}},\n"
end
end

@@ -295,7 +305,7 @@ def c_entry(comb1_indicies, comb2_indicies)
$stdout << "};\n\n"

$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER},\n"
$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER, 0},\n"
properties.each { |line|
$stdout << line
}
6 changes: 6 additions & 0 deletions utf8proc.c
@@ -223,6 +223,12 @@ DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2) {
utf8proc_get_property(c2)->boundclass);
}

/* return a character width analogous to wcwidth (except portable and
hopefully less buggy than most system wcwidth functions). */
DLLEXPORT int utf8proc_charwidth(int32_t c) {
return utf8proc_get_property(c)->charwidth;
}

#define utf8proc_decompose_lump(replacement_uc) \
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
options & ~UTF8PROC_LUMP, last_boundclass)
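For context, a minimal usage sketch (assumed, not part of this diff): it pairs the new utf8proc_charwidth with the library's existing utf8proc_iterate to estimate how many terminal columns a UTF-8 string occupies. The display_width helper and the sample string are illustrative, not library API.

#include <stdio.h>
#include <string.h>
#include "utf8proc.h"

/* Illustrative helper: sum per-codepoint widths over a UTF-8 buffer. */
static int display_width(const uint8_t *str, ssize_t len)
{
    int32_t codepoint;
    int columns = 0;
    while (len > 0) {
        ssize_t nbytes = utf8proc_iterate(str, len, &codepoint);
        if (nbytes < 0) break;              /* stop at invalid UTF-8 */
        columns += utf8proc_charwidth(codepoint);
        str += nbytes;
        len -= nbytes;
    }
    return columns;
}

int main(void)
{
    /* "a", U+00E9 (e-acute), U+4F60 (CJK, East Asian Wide) in UTF-8 */
    const char *s = "a\xC3\xA9\xE4\xBD\xA0";
    printf("%d\n", display_width((const uint8_t *)s, (ssize_t)strlen(s)));
    /* with the tables generated by this commit: 1 + 1 + 2 = 4 */
    return 0;
}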
4 changes: 4 additions & 0 deletions utf8proc.h
@@ -181,6 +181,7 @@ typedef struct utf8proc_property_struct {
unsigned ignorable:1;
unsigned control_boundary:1;
unsigned boundclass:4;
unsigned charwidth:2;
} utf8proc_property_t;

#define UTF8PROC_CATEGORY_CN 0
@@ -388,6 +389,9 @@ DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2);
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
*/

DLLEXPORT int utf8proc_charwidth(int32_t c);
/* Given a codepoint c, return a character width analogous to wcwidth(c). */

DLLEXPORT ssize_t utf8proc_map(
const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
);
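A small spot-check sketch (assumed, not from the commit) of the declaration above: because charwidth is stored in a 2-bit field of utf8proc_property_t, every table value lies in 0..3, and the exceptions set explicitly in data/charwidths.jl (U+2028, U+202F, U+2003) should come back as 0, 1, and 2.

#include <assert.h>
#include "utf8proc.h"

int main(void)
{
    /* charwidth is a 2-bit field, so any stored width is in 0..3 */
    int w = utf8proc_charwidth((int32_t)'A');
    assert(w >= 0 && w <= 3);

    /* exceptions assigned explicitly in data/charwidths.jl */
    assert(utf8proc_charwidth(0x2028) == 0);  /* LINE SEPARATOR */
    assert(utf8proc_charwidth(0x202F) == 1);  /* NARROW NO-BREAK SPACE */
    assert(utf8proc_charwidth(0x2003) == 2);  /* EM SPACE */
    return 0;
}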
