Skip to content

Commit

Permalink
fix #2: add charwidth function
Browse files Browse the repository at this point in the history
  • Loading branch information
stevengj committed Mar 9, 2015
1 parent 50381b9 commit f6d46cf
Show file tree
Hide file tree
Showing 10 changed files with 10,660 additions and 10,115 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Expand Up @@ -9,6 +9,8 @@
*.dylib
*.dSYM
*.txt
*.ttf
*.sfd
*.out
bench/bench
bench/icu
Expand Down
7 changes: 6 additions & 1 deletion .travis.yml
Expand Up @@ -4,7 +4,12 @@ compiler:
- clang
notifications:
email: false
before_install:
- sudo add-apt-repository ppa:staticfloat/julia-deps -y
- sudo add-apt-repository ppa:staticfloat/juliareleases -y
- sudo apt-get update -qq -y
- sudo apt-get install libpcre3-dev julia fontforge -y
script:
- make prefix=`pwd`/local install
- make check
- make utf8proc_data.c.new && (diff utf8proc_data.c.new utf8proc_data.c > /dev/null)
- make data && (diff data/utf8proc_data.c.new utf8proc_data.c > /dev/null)
43 changes: 14 additions & 29 deletions Makefile
@@ -1,9 +1,6 @@
# libutf8proc Makefile

# programs
CURL=curl
RUBY=ruby
PERL=perl
MAKE=make
AR=ar
INSTALL=install
Expand Down Expand Up @@ -36,36 +33,24 @@ includedir=$(prefix)/include

# meta targets

all: c-library
# Targets that name actions rather than files. The list is whitespace-separated:
# a comma would become part of the target name (e.g. "all,"), leaving the real
# target non-phony.
.PHONY: all clean update data

c-library: libutf8proc.a libutf8proc.$(SHLIB_EXT)
all: libutf8proc.a libutf8proc.$(SHLIB_EXT)

clean:
rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_VERS_EXT) libutf8proc.$(SHLIB_EXT) test/normtest test/graphemetest data/UnicodeData.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt data/NormalizationTest.txt data/GraphemeBreakTest.txt
rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_VERS_EXT) libutf8proc.$(SHLIB_EXT) test/normtest test/graphemetest
$(MAKE) -C bench clean
$(MAKE) -C data clean

update: utf8proc_data.c.new
cp -f utf8proc_data.c.new utf8proc_data.c
data: data/utf8proc_data.c.new

# real targets

utf8proc_data.c.new: data/data_generator.rb data/UnicodeData.txt data/GraphemeBreakProperty.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt
(cd data; $(RUBY) data_generator.rb < UnicodeData.txt) > utf8proc_data.c.new

data/UnicodeData.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
update: data/utf8proc_data.c.new
cp -f data/utf8proc_data.c.new utf8proc_data.c

data/GraphemeBreakProperty.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt

data/DerivedCoreProperties.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt

data/CompositionExclusions.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
# real targets

data/CaseFolding.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
data/utf8proc_data.c.new: libutf8proc.$(SHLIB_EXT)
$(MAKE) -C data utf8proc_data.c.new

utf8proc.o: utf8proc.h utf8proc.c utf8proc_data.c
$(cc) -c -o utf8proc.o utf8proc.c
Expand All @@ -85,7 +70,7 @@ libutf8proc.$(MAJOR).dylib: utf8proc.o
$(cc) -dynamiclib -o $@ $^ -install_name $(libdir)/$@ -Wl,-compatibility_version -Wl,$(MAJOR) -Wl,-current_version -Wl,$(MAJOR).$(MINOR).$(PATCH)

libutf8proc.dylib: libutf8proc.$(MAJOR).dylib
ln -s libutf8proc.$(MAJOR).dylib $@
ln -f -s libutf8proc.$(MAJOR).dylib $@

install: libutf8proc.a libutf8proc.$(SHLIB_EXT) libutf8proc.$(SHLIB_VERS_EXT)
mkdir -m 755 -p $(includedir)
Expand All @@ -98,10 +83,10 @@ install: libutf8proc.a libutf8proc.$(SHLIB_EXT) libutf8proc.$(SHLIB_VERS_EXT)
# Test programs

data/NormalizationTest.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
$(MAKE) -C data NormalizationTest.txt

data/GraphemeBreakTest.txt:
$(CURL) http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
$(MAKE) -C data GraphemeBreakTest.txt

test/normtest: test/normtest.c utf8proc.o utf8proc.h test/tests.h
$(cc) test/normtest.c utf8proc.o -o $@
Expand All @@ -112,6 +97,6 @@ test/graphemetest: test/graphemetest.c utf8proc.o utf8proc.h test/tests.h
test/printproperty: test/printproperty.c utf8proc.o utf8proc.h test/tests.h
$(cc) test/printproperty.c utf8proc.o -o $@

check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt
check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty
test/normtest data/NormalizationTest.txt
test/graphemetest data/GraphemeBreakTest.txt
59 changes: 59 additions & 0 deletions data/Makefile
@@ -0,0 +1,59 @@
# Unicode data generation rules.  Except for the test data files, most
# users will not use these Makefile rules, which are primarily to re-generate
# utf8proc_data.c when we get a new Unicode version or charwidth data; they
# require ruby, fontforge, and julia to be installed.

# programs
# (MAKE is deliberately not assigned here: no rule in this file uses it, and
# overriding make's built-in MAKE variable interferes with recursive
# invocations.)
CURL=curl
RUBY=ruby
PERL=perl
JULIA=julia
# --fail makes curl exit non-zero on an HTTP error status instead of saving
# the server's error page as if it were the requested data file.
CURLFLAGS = --fail --retry 5 --location

# use JuliaLang caching (https://github.com/staticfloat/cache.julialang.org)
# so that Travis builds do not depend on anyone's flaky servers but our own
URLCACHE=https://cache.e.ip.saba.us/

.PHONY: clean

# Remove a target whose recipe failed, so a partial file never looks up to date.
.DELETE_ON_ERROR:

# Generate the property tables.  data_generator.rb takes UnicodeData.txt on
# stdin and writes the C source to stdout; the remaining prerequisites are
# listed so that the table is regenerated when any of them changes.
utf8proc_data.c.new: data_generator.rb \
                     UnicodeData.txt GraphemeBreakProperty.txt \
                     DerivedCoreProperties.txt CompositionExclusions.txt \
                     CaseFolding.txt CharWidths.txt
	$(RUBY) $< < UnicodeData.txt > $@

# GNU Unifont version for font-metric calculations:
UNIFONT_VERSION=7.0.06

# Both font files are published in the same directory under their own names,
# so a single multi-target rule fetches whichever one is requested ($@).
unifont-$(UNIFONT_VERSION).ttf unifont_upper-$(UNIFONT_VERSION).ttf:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://unifoundry.com/pub/unifont-$(UNIFONT_VERSION)/font-builds/$@

# charwidths.jl reads EastAsianWidth.txt in addition to the two fonts.  It is
# declared (and fetched through the cache) here so that the dependency is
# visible to make; the script only downloads the file itself when it is absent.
CharWidths.txt: charwidths.jl unifont-$(UNIFONT_VERSION).ttf unifont_upper-$(UNIFONT_VERSION).ttf EastAsianWidth.txt
	UNIFONT_VERSION=$(UNIFONT_VERSION) $(JULIA) charwidths.jl > $@

EastAsianWidth.txt:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://www.unicode.org/Public/UNIDATA/$@

# Unicode Character Database files, each stored on the server under the same
# name as the local target.  Only -o is passed: one URL takes one output
# option, so the extra -O the rules used to pass had nothing to apply to.
UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://www.unicode.org/Public/UNIDATA/$@

GraphemeBreakProperty.txt:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/$@

# Download the grapheme-break test file, then rewrite the break markers
# (÷ -> /, × -> +).  The download and the rewrite are separate commands:
# in a "curl | perl" pipeline the shell reports only perl's exit status, so a
# failed download would silently leave an empty or truncated target behind.
GraphemeBreakTest.txt:
	$(CURL) $(CURLFLAGS) -o $@.raw $(URLCACHE)http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
	$(PERL) -pe 's,÷,/,g;s,×,+,g' $@.raw > $@; status=$$?; rm -f $@.raw; exit $$status

# Remove everything this Makefile (or charwidths.jl, which it runs) downloads
# or generates: Unicode data files, the width table, fonts, and the generated
# C source.
clean:
	rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd
	rm -f utf8proc_data.c.new
157 changes: 157 additions & 0 deletions data/charwidths.jl
@@ -0,0 +1,157 @@
# Following work by @jiahao, we compute character widths using a combination of
# * advance widths from GNU Unifont (advance width 512 = 1 en)
# * UAX 11: East Asian Width
# * a few exceptions as needed
# Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734
#
# Requires Julia (obviously) and FontForge.

#############################################################################
# Julia 0.3/0.4 compatibility (taken from Compat package)
# Before this development version the type is only available as `Uint16`;
# alias it so the rest of the script can use the `UInt16` spelling throughout.
if VERSION < v"0.4.0-dev+1419"
const UInt16 = Uint16
end

#############################################################################
# Widths from GNU Unifont

# Unifont release to use, taken from the UNIFONT_VERSION environment variable
# (default "7.0.06").
universion=get(ENV, "UNIFONT_VERSION", "7.0.06")
# For each of the two font files: download the .ttf if it is not already in
# the current directory, then have FontForge convert it to its text-based
# .sfd format, which parsesfd() below reads.  Each step is skipped when its
# output file already exists.
for fontfile in ["unifont-$universion", "unifont_upper-$universion"]
isfile("$fontfile.ttf") || download("http://unifoundry.com/pub/unifont-$universion/font-builds/$fontfile.ttf", "$fontfile.ttf")
isfile("$fontfile.sfd") || run(`fontforge -lang=ff -c "Open(\"$fontfile.ttf\");Save(\"$fontfile.sfd\");Quit(0);"`)
end

# Read a FontForge .sfd file and record the advance width of each glyph.
#
# Arguments:
#   filename   - path of the .sfd file to read
#   CharWidths - dictionary (code point => advance width) to add to; a fresh
#                one is created when omitted, and passing an existing one
#                lets several fonts be merged into the same table
# Returns the (mutated) CharWidths dictionary.
#
# The parser is a two-state machine: it looks for a "StartChar: " line, then
# collects the "Encoding:" and "Width:" values that follow.  Once both are
# known and the code point is non-negative, the pair is stored and the search
# for the next "StartChar: " resumes.
function parsesfd(filename::String, CharWidths::Dict{Int,Int}=Dict{Int,Int}())
    state = :seekchar
    # These must be declared outside the loop: they carry values from one
    # line to the next, and variables first assigned inside a `for` body are
    # not guaranteed to survive from one iteration to the next.
    codepoint = nothing
    width = nothing
    io = open(filename)
    try
        for line in eachline(io)
            if state==:seekchar         #StartChar: nonmarkingreturn
                if contains(line, "StartChar: ")
                    codepoint = nothing
                    width = nothing
                    state = :readdata
                end
            elseif state==:readdata     #Encoding: 65538 -1 2, Width: 1024
                # the second number on the Encoding line is taken as the code point
                contains(line, "Encoding:") && (codepoint = int(split(line)[3]))
                contains(line, "Width:") && (width = int(split(line)[2]))
                if codepoint!=nothing && width!=nothing && codepoint >= 0
                    CharWidths[codepoint]=width
                    state = :seekchar
                end
            end
        end
    finally
        close(io)   # do not leave the handle for the garbage collector
    end
    CharWidths
end
# Parse both font files into one table: the second call receives the
# dictionary produced by the first and adds its glyphs to it.
CharWidths=parsesfd("unifont-$universion.sfd")
CharWidths=parsesfd("unifont_upper-$universion.sfd", CharWidths)

# convert from advance width (512 units to the en) to character width
# (integer division, so any remainder is discarded)
for (c,v) in CharWidths
CharWidths[c] = div(v, 512)
end

#############################################################################
# Widths from UAX #11: East Asian Width

# Fetch the East Asian Width table on first use, then let its width classes
# override the font-derived widths for every code point it lists.
isfile("EastAsianWidth.txt") || download("http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt", "EastAsianWidth.txt")
for entry in readlines(open("EastAsianWidth.txt"))
    # Lines that are entirely a comment carry no data
    if entry[1] == '#'
        continue
    end

    # Drop any trailing comment; what remains is "<range>;<width code>"
    fields = split(split(entry, '#')[1], ';')
    if length(fields) < 2
        continue
    end
    width = strip(fields[2])

    # The range is either a single code point "XXXX" or a span "XXXX..YYYY"
    bounds = split(fields[1], "..")
    lastidx = length(bounds) > 1 ? 2 : 1
    charstart = uint32("0x"*bounds[1])
    charend = uint32("0x"*bounds[lastidx])

    # Assign the width to each code point in the range
    for c in charstart:charend
        if width == "N"
            # Neutral characters keep whatever width they already have
            continue
        elseif width == "W" || width == "F"
            # Wide or full
            CharWidths[c] = 2
        elseif width == "Na" || width == "H" || width == "A"
            # Narrow, half, or ambiguous (treated as narrow, i.e. the
            # non-East-Asian default)
            CharWidths[c] = 1
        else
            error("Unknown East Asian width code: $width for code point: $c")
        end
    end
end

#############################################################################
# A few exceptions to the above cases, found by manual comparison
# to other wcwidth functions.

# Category code of code point `c`, obtained from the freshly built
# ../libutf8proc shared library instead of Julia's own Unicode tables, so that
# a new Unicode version does not require a newer Julia first.
# Returns 0x0000 for anything above 0x10FFFF.
function catcode(c)
    if uint(c) > 0x10FFFF
        return 0x0000 # see utf8proc_get_property docs
    end
    # The returned pointer is read as a UInt16; presumably the category is
    # the first field of the property record -- confirm against utf8proc.h.
    propptr = ccall((:utf8proc_get_property,"../libutf8proc"), Ptr{UInt16}, (Int32,), c)
    return unsafe_load(propptr)
end


# use the Base.UTF8proc module to get the category code constants, since
# we aren't going to change these in utf8proc.
import Base.UTF8proc

# make sure format control characters (category Cf) have width 0,
# except for the Arabic characters 0x06xx (see unicode std 6.2, sec. 8.2)
for c in keys(CharWidths)
    # The exclusion is written as !(c in ...) in plain ASCII; without a
    # membership operator the second operand would index into c instead of
    # testing it against the list.
    if catcode(c)==UTF8proc.UTF8PROC_CATEGORY_CF &&
       !(c in [0x0601,0x0602,0x0603,0x06dd])
        CharWidths[c]=0
    end
end

#By definition, should have zero width (on the same line)
#0x002028 '\u2028' category: Zl name: LINE SEPARATOR/
#0x002029 '\u2029' category: Zp name: PARAGRAPH SEPARATOR/
CharWidths[0x2028]=0
CharWidths[0x2029]=0

#By definition, should be narrow = width of 1 en space
#0x00202f ' ' category: Zs name: NARROW NO-BREAK SPACE/
CharWidths[0x202f]=1

#By definition, should be wide = width of 1 em space
#0x002001 ' ' category: Zs name: EM QUAD/
#0x002003 ' ' category: Zs name: EM SPACE/
CharWidths[0x2001]=2
CharWidths[0x2003]=2

#############################################################################
# Non-printable control characters will be assigned a width of zero
# (wcwidth returns -1 for such characters)

# A code point is printable when it is within the Unicode range, is a valid
# character, and its category is not one of the excluded ones below.
function isprintable(c::Union(Char,Integer))
    c <= 0x10ffff || return false
    is_valid_char(c) || return false
    return isprintable_category(catcode(c))
end

# Categories treated as non-printable: unassigned (Cn), surrogate (Cs),
# control (Cc), and the invalid category code 0.
function isprintable_category(category)
    category == UTF8proc.UTF8PROC_CATEGORY_CN && return false # Unassigned
    category == UTF8proc.UTF8PROC_CATEGORY_CS && return false # Surrogate
    category == UTF8proc.UTF8PROC_CATEGORY_CC && return false # Control
    category == 0 && return false                             # Invalid
    return true
end

# Question: should we just use Julia's isprint algorithm here? It is different,
# though it is also based on the character category.

#############################################################################
# Output (to a file or pipe) for processing by data_generator.rb
# ... don't bother to output zero widths since that will be the default.

# Run-length encode the width table and print it, one run per line, as
# "XXXX..YYYY; w" (or "XXXX; w" for a single code point), in uppercase hex
# padded to at least four digits.
#   firstc - first code point of the run currently being accumulated
#   lastv  - width shared by every code point of that run
# NOTE(review): a run is printed whenever the width changes, whatever its
# value, so runs of width 0 are written out as well.
firstc = 0x000000
lastv = 0
uhex(c) = uppercase(hex(c,4))
# The loop runs one step past the last code point (0x110000) so that the
# final run is flushed by the `c == 0x110000` test below.
for c in 0x0000:0x110000
# non-printable code points, and those with no recorded width, get width 0
v = isprintable(c) ? get(CharWidths, c, 0) : 0
if v != lastv || c == 0x110000
# widths above 3 are rejected; presumably the generated table reserves only
# two bits for this field -- TODO confirm against utf8proc.h
v < 4 || error("invalid charwidth $v for $c")
if firstc+1 < c
println(uhex(firstc), "..", uhex(c-1), "; ", lastv)
else
println(uhex(firstc), "; ", lastv)
end
# start a new run at c with the new width
firstc = c
lastv = v
end
end
18 changes: 14 additions & 4 deletions data/data_generator.rb
Expand Up @@ -85,14 +85,23 @@
end
end

# Load the code point => width table from CharWidths.txt.  Each data line is
# either "XXXX..YYYY; w" (an inclusive range) or "XXXX; w" (one code point);
# lines matching neither form are ignored, and code points that are never
# listed read back as width 0 through the hash default.
$charwidth_list = File.read("CharWidths.txt")
$charwidth = Hash.new(0)
$charwidth_list.each_line do |entry|
  case entry
  when /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([0-9]+)/
    range_width = $3.to_i
    ($1.hex..$2.hex).each { |code| $charwidth[code] = range_width }
  when /^([0-9A-F]+)\s*;\s*([0-9]+)/
    $charwidth[$1.hex] = $2.to_i
  end
end

$exclusions = File.read("CompositionExclusions.txt")[/# \(1\) Script Specifics.*?# Total code points:/m]
$exclusions = $exclusions.chomp.split("\n").collect { |e| e.hex }

$excl_version = File.read("CompositionExclusions.txt")[/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m]
$excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex }

$case_folding_string = File.open("CaseFolding.txt").read

$case_folding_string = File.open("CaseFolding.txt", :encoding => 'utf-8').read
$case_folding = {}
$case_folding_string.chomp.split("\n").each do |line|
next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i
Expand Down Expand Up @@ -172,7 +181,8 @@ def c_entry(comb1_indicies, comb2_indicies)
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
"#{$ignorable.include?(code)}, " <<
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
"#{$grapheme_boundclass[code]}},\n"
"#{$grapheme_boundclass[code]}, " <<
"#{$charwidth[code]}},\n"
end
end

Expand Down Expand Up @@ -295,7 +305,7 @@ def c_entry(comb1_indicies, comb2_indicies)
$stdout << "};\n\n"

$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER},\n"
$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER, 0},\n"
properties.each { |line|
$stdout << line
}
Expand Down
6 changes: 4 additions & 2 deletions test/printproperty.c
Expand Up @@ -24,7 +24,8 @@ int main(int argc, char **argv)
" comp_exclusion = %d\n"
" ignorable = %d\n"
" control_boundary = %d\n"
" boundclass = %d\n",
" boundclass = %d\n"
" charwidth = %d\n",
argv[i],
p->category,
p->combining_class,
Expand All @@ -39,7 +40,8 @@ int main(int argc, char **argv)
p->comp_exclusion,
p->ignorable,
p->control_boundary,
p->boundclass);
p->boundclass,
p->charwidth);
}
return 0;
}

0 comments on commit f6d46cf

Please sign in to comment.