Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: charwidth function #27

Merged
merged 1 commit into from Mar 12, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Expand Up @@ -10,10 +10,13 @@
*.dSYM
*.out
data/*.txt
data/*.ttf
data/*.sfd
bench/bench
bench/icu
bench/unistring
normtest
graphemetest
utf8proc_data.c.new
printproperty
charwidth
17 changes: 8 additions & 9 deletions .travis.yml
Expand Up @@ -4,15 +4,14 @@ compiler:
- clang
notifications:
email: false
before_install:
- sudo add-apt-repository ppa:staticfloat/julia-deps -y
- sudo add-apt-repository ppa:staticfloat/juliareleases -y
- sudo apt-get update -qq -y
- sudo apt-get install libpcre3-dev julia fontforge -y
script:
- make prefix=`pwd`/local install
- make check
- make utf8proc_data.c.new && (diff utf8proc_data.c.new utf8proc_data.c > /dev/null)
- mkdir build_static
- cd build_static
- cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON
- make
- mkdir ../build_shared
- cd ../build_shared
- cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON -DBUILD_SHARED_LIBS=ON
- make
- make data && (diff data/utf8proc_data.c.new utf8proc_data.c > /dev/null)
- (mkdir build_static && cd build_static && cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON && make)
- (mkdir build_shared && cd build_shared && cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON -DBUILD_SHARED_LIBS=ON && make)
47 changes: 18 additions & 29 deletions Makefile
@@ -1,9 +1,6 @@
# libutf8proc Makefile

# programs
CURL=curl
RUBY=ruby
PERL=perl
MAKE=make
AR=ar
INSTALL=install
Expand Down Expand Up @@ -37,36 +34,24 @@ includedir=$(prefix)/include

# meta targets

all: c-library
.PHONY: all, clean, update, data

c-library: libutf8proc.a libutf8proc.$(SHLIB_EXT)
all: libutf8proc.a libutf8proc.$(SHLIB_EXT)

clean:
rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_VERS_EXT) libutf8proc.$(SHLIB_EXT) test/normtest test/graphemetest data/UnicodeData.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt data/NormalizationTest.txt data/GraphemeBreakTest.txt
rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_VERS_EXT) libutf8proc.$(SHLIB_EXT) test/normtest test/graphemetest test/printproperty test/charwidth
$(MAKE) -C bench clean
$(MAKE) -C data clean

update: utf8proc_data.c.new
cp -f utf8proc_data.c.new utf8proc_data.c
data: data/utf8proc_data.c.new

# real targets

utf8proc_data.c.new: data/data_generator.rb data/UnicodeData.txt data/GraphemeBreakProperty.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt
(cd data; $(RUBY) data_generator.rb < UnicodeData.txt) > utf8proc_data.c.new

data/UnicodeData.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/UnicodeData.txt

data/GraphemeBreakProperty.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
update: data/utf8proc_data.c.new
cp -f data/utf8proc_data.c.new utf8proc_data.c

data/DerivedCoreProperties.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt

data/CompositionExclusions.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
# real targets

data/CaseFolding.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
data/utf8proc_data.c.new: libutf8proc.$(SHLIB_EXT) data/data_generator.rb data/charwidths.jl
$(MAKE) -C data utf8proc_data.c.new

utf8proc.o: utf8proc.h utf8proc.c utf8proc_data.c
$(cc) -c -o utf8proc.o utf8proc.c
Expand All @@ -86,7 +71,7 @@ libutf8proc.$(MAJOR).dylib: utf8proc.o
$(cc) -dynamiclib -o $@ $^ -install_name $(libdir)/$@ -Wl,-compatibility_version -Wl,$(MAJOR) -Wl,-current_version -Wl,$(MAJOR).$(MINOR).$(PATCH)

libutf8proc.dylib: libutf8proc.$(MAJOR).dylib
ln -s libutf8proc.$(MAJOR).dylib $@
ln -f -s libutf8proc.$(MAJOR).dylib $@

install: libutf8proc.a libutf8proc.$(SHLIB_EXT) libutf8proc.$(SHLIB_VERS_EXT)
mkdir -m 755 -p $(includedir)
Expand All @@ -99,10 +84,10 @@ install: libutf8proc.a libutf8proc.$(SHLIB_EXT) libutf8proc.$(SHLIB_VERS_EXT)
# Test programs

data/NormalizationTest.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
$(MAKE) -C data NormalizationTest.txt

data/GraphemeBreakTest.txt:
$(CURL) http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
$(MAKE) -C data GraphemeBreakTest.txt

test/normtest: test/normtest.c utf8proc.o utf8proc.h test/tests.h
$(cc) test/normtest.c utf8proc.o -o $@
Expand All @@ -113,6 +98,10 @@ test/graphemetest: test/graphemetest.c utf8proc.o utf8proc.h test/tests.h
test/printproperty: test/printproperty.c utf8proc.o utf8proc.h test/tests.h
$(cc) test/printproperty.c utf8proc.o -o $@

check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt
test/charwidth: test/charwidth.c utf8proc.o utf8proc.h test/tests.h
$(cc) test/charwidth.c utf8proc.o -o $@

check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/charwidth
test/normtest data/NormalizationTest.txt
test/graphemetest data/GraphemeBreakTest.txt
test/charwidth
62 changes: 62 additions & 0 deletions data/Makefile
@@ -0,0 +1,62 @@
# Unicode data generation rules. Except for the test data files, most
# users will not use these Makefile rules, which are primarily to re-generate
# unicode_data.c when we get a new Unicode version or charwidth data; they
# require ruby, fontforge, and julia to be installed.

# programs
CURL=curl
RUBY=ruby
PERL=perl
MAKE=make
JULIA=julia
CURLFLAGS = --retry 5 --location

# use JuliaLang caching (https://github.com/staticfloat/cache.julialang.org)
# so that Travis builds do not depend on anyone's flaky servers but our own
URLCACHE=https://cache.e.ip.saba.us/

.PHONY: clean

.DELETE_ON_ERROR:

utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt
$(RUBY) data_generator.rb < UnicodeData.txt > $@

# GNU Unifont version for font-metric calculations:
UNIFONT_VERSION=7.0.06

unifont-$(UNIFONT_VERSION).ttf:
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://unifoundry.com/pub/unifont-$(UNIFONT_VERSION)/font-builds/unifont-$(UNIFONT_VERSION).ttf

unifont_upper-$(UNIFONT_VERSION).ttf:
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://unifoundry.com/pub/unifont-$(UNIFONT_VERSION)/font-builds/unifont_upper-$(UNIFONT_VERSION).ttf

CharWidths.txt: charwidths.jl unifont-$(UNIFONT_VERSION).ttf unifont_upper-$(UNIFONT_VERSION).ttf EastAsianWidth.txt
UNIFONT_VERSION=$(UNIFONT_VERSION) $(JULIA) charwidths.jl > $@

UnicodeData.txt:
$(CURL) $(CURLFLAGS) -o $@ -O $(URLCACHE)http://www.unicode.org/Public/UNIDATA/UnicodeData.txt

EastAsianWidth.txt:
$(CURL) $(CURLFLAGS) -o $@ -O $(URLCACHE)http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

GraphemeBreakProperty.txt:
$(CURL) $(CURLFLAGS) -o $@ -O $(URLCACHE)http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt

DerivedCoreProperties.txt:
$(CURL) $(CURLFLAGS) -o $@ -O $(URLCACHE)http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt

CompositionExclusions.txt:
$(CURL) $(CURLFLAGS) -o $@ -O $(URLCACHE)http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt

CaseFolding.txt:
$(CURL) $(CURLFLAGS) -o $@ -O $(URLCACHE)http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

NormalizationTest.txt:
$(CURL) $(CURLFLAGS) -o $@ -O $(URLCACHE)http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt

GraphemeBreakTest.txt:
$(CURL) $(CURLFLAGS) $(URLCACHE)http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@

clean:
rm -f UnicodeData.txt EastAsianWidth.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd
161 changes: 161 additions & 0 deletions data/charwidths.jl
@@ -0,0 +1,161 @@
# Following work by @jiahao, we compute character widths using a combination of
# * advance widths from GNU Unifont (advance width 512 = 1 en)
# * UAX 11: East Asian Width
# * a few exceptions as needed
# Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734
#
# Requires Julia (obviously) and FontForge.

#############################################################################
# Julia 0.3/0.4 compatibility (taken from Compat package)
if VERSION < v"0.4.0-dev+1419"
const UInt16 = Uint16
end

CharWidths = Dict{Int,Int}()

#############################################################################
# Widths from GNU Unifont

universion=get(ENV, "UNIFONT_VERSION", "7.0.06")
for fontfile in ["unifont-$universion", "unifont_upper-$universion"]
isfile("$fontfile.ttf") || download("http://unifoundry.com/pub/unifont-$universion/font-builds/$fontfile.ttf", "$fontfile.ttf")
isfile("$fontfile.sfd") || run(`fontforge -lang=ff -c "Open(\"$fontfile.ttf\");Save(\"$fontfile.sfd\");Quit(0);"`)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that -lang=ff seems to be needed for recent versions of fontforge, since they changed the -c option to accept Python scripts by default rather than FontForge's own scripting language.

end

#Read sfdfile for character widths
function parsesfd(filename::String, CharWidths::Dict{Int,Int}=Dict{Int,Int}())
state=:seekchar
lineno = 0
for line in readlines(open(filename))
lineno += 1
if state==:seekchar #StartChar: nonmarkingreturn
if contains(line, "StartChar: ")
codepoint = nothing
width = nothing
state = :readdata
end
elseif state==:readdata #Encoding: 65538 -1 2, Width: 1024
contains(line, "Encoding:") && (codepoint = int(split(line)[3]))
contains(line, "Width:") && (width = int(split(line)[2]))
if codepoint!=nothing && width!=nothing && codepoint >= 0
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jiahao, I added the codepoint>=0 check here since the sfd file seems to have some codepoint -1 entries in the beginning.

CharWidths[codepoint]=div(width, 512) # 512 units to the en
state = :seekchar
end
end
end
CharWidths
end
CharWidths=parsesfd("unifont-$universion.sfd", CharWidths)
CharWidths=parsesfd("unifont_upper-$universion.sfd", CharWidths)

#############################################################################
# Widths from UAX #11: East Asian Width
# .. these take precedence over the Unifont width for all codepoints
# listed explicitly as wide/full/narrow/half-width

isfile("EastAsianWidth.txt") || download("http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt", "EastAsianWidth.txt")
for line in readlines(open("EastAsianWidth.txt"))
#Strip comments
line[1] == '#' && continue
precomment = split(line, '#')[1]
#Parse code point range and width code
tokens = split(precomment, ';')
length(tokens) >= 2 || continue
charrange = tokens[1]
width = strip(tokens[2])
#Parse code point range into Julia UnitRange
rangetokens = split(charrange, "..")
charstart = uint32("0x"*rangetokens[1])
charend = uint32("0x"*rangetokens[length(rangetokens)>1 ? 2 : 1])

#Assign widths
for c in charstart:charend
if width=="W" || width=="F" # wide or full
CharWidths[c]=2
elseif width=="Na"|| width=="H" # narrow or half
CharWidths[c]=1
end
end
end

#############################################################################
# A few exceptions to the above cases, found by manual comparison
# to other wcwidth functions and similar checks.

# Use ../libutf8proc for category codes, rather than the one in Julia,
# to minimize bootstrapping complexity when a new version of Unicode comes out.
function catcode(c)
uint(c) > 0x10FFFF && return 0x0000 # see utf8proc_get_property docs
return unsafe_load(ccall((:utf8proc_get_property,"../libutf8proc"), Ptr{UInt16}, (Int32,), c))
end

# use Base.UTF8proc module to get category codes constants, since
# we aren't goint to change these in utf8proc.
import Base.UTF8proc

for c in keys(CharWidths)
cat = catcode(c)

# make sure format control character (category Cf) have width 0,
# except for the Arabic characters 0x06xx (see unicode std 6.2, sec. 8.2)
if cat==UTF8proc.UTF8PROC_CATEGORY_CF && c ∉ [0x0601,0x0602,0x0603,0x06dd]
CharWidths[c]=0
end

# Unifont has nonzero width for a number of non-spacing combining
# characters, e.g. (in 7.0.06): f84,17b4,17b5,180b,180d,2d7f, and
# the variation selectors
if cat==UTF8proc.UTF8PROC_CATEGORY_MN
CharWidths[c]=0
end

# We also assign width of zero to unassigned and private-use
# codepoints (Unifont includes ConScript Unicode Registry PUA fonts,
# but since these are nonstandard it seems questionable to recognize them).
if cat==UTF8proc.UTF8PROC_CATEGORY_CO || cat==UTF8proc.UTF8PROC_CATEGORY_CN
CharWidths[c]=0
end

# for some reason, Unifont has width-2 glyphs for ASCII control chars
if cat==UTF8proc.UTF8PROC_CATEGORY_CC
CharWidths[c]=0
end
end

#By definition, should have zero width (on the same line)
#0x002028 '
' category: Zl name: LINE SEPARATOR/
#0x002029 '
' category: Zp name: PARAGRAPH SEPARATOR/
CharWidths[0x2028]=0
CharWidths[0x2029]=0

#By definition, should be narrow = width of 1 en space
#0x00202f ' ' category: Zs name: NARROW NO-BREAK SPACE/
CharWidths[0x202f]=1

#By definition, should be wide = width of 1 em space
#0x002001 ' ' category: Zs name: EM QUAD/
#0x002003 ' ' category: Zs name: EM SPACE/
CharWidths[0x2001]=2
CharWidths[0x2003]=2

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jiahao, in a monospaced font, like in a terminal, aren't em and en the same?

#############################################################################
# Output (to a file or pipe) for processing by data_generator.rb
# ... don't bother to output zero widths since that will be the default.

firstc = 0x000000
lastv = 0
uhex(c) = uppercase(hex(c,4))
for c in 0x0000:0x110000
v = get(CharWidths, c, 0)
if v != lastv || c == 0x110000
v < 4 || error("invalid charwidth $v for $c")
if firstc+1 < c
println(uhex(firstc), "..", uhex(c-1), "; ", lastv)
else
println(uhex(firstc), "; ", lastv)
end
firstc = c
lastv = v
end
end
18 changes: 14 additions & 4 deletions data/data_generator.rb
Expand Up @@ -85,14 +85,23 @@
end
end

$charwidth_list = File.read("CharWidths.txt")
$charwidth = Hash.new(0)
$charwidth_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([0-9]+)/
$1.hex.upto($2.hex) { |e2| $charwidth[e2] = $3.to_i }
elsif entry =~ /^([0-9A-F]+)\s*;\s*([0-9]+)/
$charwidth[$1.hex] = $2.to_i
end
end

$exclusions = File.read("CompositionExclusions.txt")[/# \(1\) Script Specifics.*?# Total code points:/m]
$exclusions = $exclusions.chomp.split("\n").collect { |e| e.hex }

$excl_version = File.read("CompositionExclusions.txt")[/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m]
$excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex }

$case_folding_string = File.open("CaseFolding.txt").read

$case_folding_string = File.open("CaseFolding.txt", :encoding => 'utf-8').read
$case_folding = {}
$case_folding_string.chomp.split("\n").each do |line|
next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i
Expand Down Expand Up @@ -172,7 +181,8 @@ def c_entry(comb1_indicies, comb2_indicies)
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
"#{$ignorable.include?(code)}, " <<
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
"#{$grapheme_boundclass[code]}},\n"
"#{$grapheme_boundclass[code]}, " <<
"#{$charwidth[code]}},\n"
end
end

Expand Down Expand Up @@ -295,7 +305,7 @@ def c_entry(comb1_indicies, comb2_indicies)
$stdout << "};\n\n"

$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER},\n"
$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER, 0},\n"
properties.each { |line|
$stdout << line
}
Expand Down