[luadist-git] add slnunicode-1.1a-Windows-x86

LuaDist · Mar 6, 2013 · 8629877 · 8629877
commit 8629877
Show file tree

Hide file tree

Showing 3 changed files with 239 additions and 0 deletions.
diff --git a/dist.info b/dist.info
@@ -0,0 +1,24 @@
+name = "slnunicode"
+version = "1.1a"
+desc = "A Unicode support library for Lua, developed for the Selene database project."
+
+author = "Unknown (paul)"
+maintainer = "Peter Drahoš"
+license = "MIT/X11"
+url = "http://luaforge.net/projects/sln/"
+
+arch = "Windows"
+type = "x86"
+
+depends = {
+	[[lua ~> 5.1]],
+}
+
+files = {
+	Runtime = {
+		[[lib\lua\unicode.dll]],
+	},
+	Test = {
+		[[share\slnunicode\test\unitest]],
+	},
+}
diff --git a/lib/lua/unicode.dll b/lib/lua/unicode.dll
diff --git a/share/slnunicode/test/unitest b/share/slnunicode/test/unitest
@@ -0,0 +1,215 @@
+#!/opt/lua-5.0.2/bin/lua
+--	there are four string-like ctype closures:
+--	unicode.ascii, latin1, utf8 and grapheme
+--
+--	ascii and latin1 are single-byte like string,
+--	but use the unicode table for upper/lower and character classes
+--	ascii does not touch bytes > 127 on upper/lower
+--
+--	ascii or latin1 can be used as locale-independent string replacement.
+--	(There is a compile switch to do this automatically for ascii).
+--
+--	UTF-8 operates on UTF-8 sequences as of RFC 3629:
+--	1 byte 0-7F, 2 byte 80-7FF, 3 byte 800-FFFF, 4 byte 10000-10FFFF
+--	(not excluding UTF-16 surrogate characters)
+--	Any byte not part of such a sequence is treated as its (Latin-1) value.
+--
+--	Grapheme takes care of grapheme clusters, which are characters followed by
+--	"grapheme extension" characters (Mn+Me) like combining diacritical marks.
+--
+--	calls are:
+--	len(str)
+--	sub(str, start [,end=-1])
+--	byte(str, start [,end=-1])
+--	lower(str)
+--	upper(str)
+--	char(i [,j...])
+--	reverse(str)
+--
+--	same as in string: rep, format, dump
+--	TODO: use char count with %s in format? (sub does the job)
+--	TODO: grapheme.byte: only first code of any cluster?
+--
+--	find, gfind, gsub: done, but need thorough testing ...:
+--	ascii does not match them on any %class (but on ., literals and ranges)
+--	behaviour of %class with class not ASCII is undefined
+--	frontier %f currently disabled -- should we?
+--
+--	character classes are:
+--	%a L* (Lu+Ll+Lt+Lm+Lo)
+--	%c Cc
+--	%d 0-9
+--	%l Ll
+--	%n N* (Nd+Nl+No, new)
+--	%p P* (Pc+Pd+Ps+Pe+Pi+Pf+Po)
+--	%s Z* (Zs+Zl+Zp) plus the controls 9-13 (HT,LF,VT,FF,CR)
+--	%u Lu (also Lt ?)
+--	%w %a+%n+Pc (e.g. '_')
+--	%x 0-9A-Za-z
+--	%z the 0 byte
+--	c.f. http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
+--	http://unicode.org/Public/UNIDATA/UnicodeData.txt
+--
+--	NOTE: find positions are in bytes for all ctypes!
+--	use ascii.sub to cut found ranges!
+--	this is a) faster b) more reliable
+--
+--	UTF-8 behaviour: match is by codes, code ranges are supported
+--
+--	grapheme behaviour: any %class, '.' and range match includes
+--	any following grapheme extensions.
+--	Ranges apply to single code points only.
+--	If a [] enumeration contains a grapheme cluster,
+--	this matches only the exact same cluster.
+--	However, a literal single 'o' standalone or in an [] enumeration
+--	will match just that 'o', even if it has an extension in the string.
+--	Consequently, grapheme match positions are not always cluster positions.
+--
+
+local unicode = require("unicode")
+local utf8 = unicode.utf8
+unicode.string = string -- for tests unicode[ctype]
+local sprintf = string.format
+-- Format the arguments with string.format and print the result as one line.
+local function printf (fmt, ...)
+	local line = sprintf(fmt, ...)
+	return print(line)
+end
+
+-- Report the outcome of one test on stdout.
+--   test  label describing the call under test
+--   ok    expected value
+--   got   actual value (a nil or false value is shown as <nil> on failure)
+-- Prints an "ok" line when ok == got, otherwise a "NOK" line with the actual value.
+local function check (test, ok, got)
+	if ok ~= got then
+		return printf("NOK %s = %s GOT '%s'",test, ok, got or "<nil>")
+	end
+	return printf("ok  %s = %s",test,ok)
+end
+-- Variant of check for calls that return several values: the values are
+-- joined with "," and the joined string is compared against ok.
+-- A missing (nil or false) first value is replaced by the empty string.
+local function checka (test, ok, ...)
+	local results = {...}
+	if not results[1] then results[1] = "" end
+	local joined = table.concat(results, ",")
+	return check(test, ok, joined)
+end
+
+
+-- Compare the lengths reported by string.len, utf8.len and
+-- unicode.grapheme.len for str against the expected counts.
+-- codes defaults to bytes, and chars defaults to codes, when omitted.
+local function testlen (str,bytes,codes,chars)
+	if not codes then codes = bytes end
+	if not chars then chars = codes end
+	local fmt = "%d/%d/%d"
+	local expected = sprintf(fmt, bytes, codes, chars)
+	local actual = sprintf(fmt, string.len(str), utf8.len(str),
+		unicode.grapheme.len(str))
+	return check(sprintf("len '%s'", str), expected, actual)
+end
+
+-- 176 = 00B0;DEGREE SIGN -- UTF-8: C2,B0 = \194\176
+-- 196 = 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS
+-- 214 = 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS
+-- 776 = 0308;COMBINING DIAERESIS -- UTF-8: CC,88 = \204\136
+-- testlen(str, bytes [, codes [, chars]]): expected results of string.len,
+-- utf8.len and grapheme.len; an omitted count defaults to the one before it.
+testlen("A\tB",3) -- plain Latin-1
+testlen("\176\196\214",3) -- plain Latin-1
+testlen("\196\176\214",3,2) -- C4,B0 is valid seq 0130 I WITH DOT ABOVE
+testlen("\192\178",2) -- C0,B2 is bad seq for 2
+testlen("°ÄÖ",6,3) -- simple Latin-1 chars in UTF-8
+testlen("\204\136A\204\136O\204\136",8,5,3) -- decomposed (with broken lead)
+
+
+-- Check unicode[ctype].sub(str, start, e) against the expected substring ok.
+-- ctype is the name of a sub-table of unicode, e.g. "ascii" or "utf8".
+local function testsub (ctype,ok,str,start,e)
+	local label = sprintf("%s.sub('%s',%d,%d)", ctype, str, start, e)
+	local got = unicode[ctype].sub(str,start,e)
+	return check(label, ok, got)
+end
+-- testsub(ctype, expected, str, start, end)
+testsub("ascii","BCD","ABCDE",2,4)
+testsub("utf8","BCD","ABCDE",2,4)
+testsub("latin1","Ä","°ÄÖ",3,4)
+testsub("utf8","Ä","°ÄÖ",2,2)
+testsub("utf8","ÄÖ","°ÄÖ",2,-1)
+testsub("utf8","\204\136","A\204\136O\204\136",2,2) -- decomposed
+testsub("grapheme","O\204\136","A\204\136O\204\136",2,2) -- decomposed
+
+
+-- Check unicode[ctype].byte(str, ...) against ok, where ok is the
+-- expected list of returned values joined with ",".
+-- The extra arguments are passed through to byte unchanged.
+local function testbyte (ctype, ok, str, ...)
+	local positions = table.concat({...}, ",")
+	local label = sprintf("%s.byte('%s',%s)",ctype,str,positions)
+	return checka(label, ok, unicode[ctype].byte(str, ...))
+end
+-- testbyte(ctype, expected, str, start [, end]); expected is the
+-- comma-joined list of values returned by byte.
+testbyte("string","194,176","Ä°Ö",3,4) -- the UTF-8 seq for °
+testbyte("ascii","194,176","Ä°Ö",3,4)
+testbyte("utf8","176,214","Ä°Ö",2,3) -- code points for °,Ö
+testbyte("utf8","65,776","\204\136A\204\136O\204\136",2,3) -- decomposed
+testbyte("grapheme","65,776","\204\136A\204\136O\204\136",2) -- decomposed
+
+
+-- Check unicode[ctype].char(...) against the expected string ok.
+-- The varargs are the numeric codes handed to char.
+local function testchar (ctype, ok, ...)
+	local codes = table.concat({...}, ",")
+	local label = sprintf("%s.char(%s)",ctype,codes)
+	return check(label, ok, unicode[ctype].char(...))
+end
+-- testchar(ctype, expected, code [, code...])
+testchar("ascii", "AB", 65,66)
+testchar("ascii", "\176", 176)
+testchar("utf8", "\194\176", 176)
+
+
+-- Check both case conversions of str for one ctype:
+-- lower(str) must equal lo and upper(str) must equal up.
+-- The lower check is reported first, then the upper check.
+local function testcase (ctype,str,up,lo)
+	local lowerlabel = sprintf("%s.lower('%s')", ctype, str)
+	check(lowerlabel, lo, unicode[ctype].lower(str))
+	local upperlabel = sprintf("%s.upper('%s')", ctype, str)
+	check(upperlabel, up, unicode[ctype].upper(str))
+end
+-- testcase(ctype, str, expected upper, expected lower)
+-- upper/lower also fixes plain Latin
+testcase("utf8","Ab\196üo\204\136","ABÄÜO\204\136","abäüo\204\136")
+testcase("ascii","Ab\196üo\204\136","AB\196üO\204\136","ab\196üo\204\136")
+testcase("latin1","Ab\196","AB\196","ab\228")
+
+
+-- Check unicode[ctype].reverse(str) against the expected string ok.
+local function testrev (ctype,ok,str)
+	local label = sprintf("%s.reverse('%s')",ctype,str)
+	local got = unicode[ctype].reverse(str)
+	return check(label, ok, got)
+end
+-- testrev(ctype, expected, str): the same input string is reversed under
+-- each ctype, with a different expected result for each.
+testrev("ascii","b\136\204oa\176\194ba","ab°ao\204\136b");
+testrev("utf8","b\204\136oa°ba","ab°ao\204\136b");
+testrev("grapheme","bo\204\136a°ba","ab°ao\204\136b");
+
+
+
+-- Check unicode[ctype].find(str, pat) against ok, where ok is the
+-- expected list of results (positions, then any captures) joined with ",".
+-- An empty ok means no match is expected.
+local function testfind (ctype,ok,str,pat)
+	local label = sprintf("%s.find('%s','%s')",ctype,str,pat)
+	return checka(label, ok, unicode[ctype].find(str, pat))
+end
+-- testfind(ctype, expected, str, pattern); expected is "start,end[,capture...]"
+-- joined with ",", or "" when no match is expected.
+-- NOTE(review): the last two cases expect %w not to match '_', while the
+-- header comment of this file lists Pc (e.g. '_') under %w -- confirm which
+-- one is intended.
+testfind("ascii","1,1","e=mc2","%a")
+testfind("ascii","3,4","e=mc2","%a%a")
+testfind("ascii","5,5","e=mc2","%d")
+testfind("ascii","","Ä","%a")
+testfind("ascii","1,2","Ä","%A*")
+testfind("latin1","1,1","Ä","%a")
+testfind("utf8","1,2","Ä","%a")
+testfind("utf8","1,1","o\204\136","%a*")
+testfind("utf8","2,3","o\204\136","%A")
+testfind("utf8","1,1","o\204\136",".")
+testfind("grapheme","1,3","o\204\136","%a*")
+testfind("grapheme","2,3","o\204\136","%A") -- didn't expect this?
+testfind("grapheme","1,3","o\204\136",".")
+testfind("utf8","4,5","ÜHÄPPY","[À-Ö]")
+testfind("utf8","4,5","ÜHÄPPY","[Ä-]")
+testfind("utf8","7,7","ÜHÄP-PY","[ä-]")
+testfind("ascii","1,4","abcdef","%a*d")
+testfind("utf8","1,10","äöüßü","%a*ü")
+testfind("utf8","1,6","äöüß","%a*ü")
+testfind("utf8","4,5,Ä","ÜHÄPPY","([À-Ö])")
+testfind("utf8","1,5,ÜHÄ","ÜHÄ_PPY","([%w]+)")
+testfind("utf8","1,9,ÜHÄ_PPY","ÜHÄ_PPY","([%w_]+)")
+
+
+-- Check the string returned by unicode[ctype].gsub(str, pat, repl)
+-- against ok. Only the first gsub result takes part in the comparison.
+local function testgsub (ctype,ok,str,pat,repl)
+	local label = sprintf("%s.gsub('%s','%s','%s')",ctype,str,pat,repl)
+	local replaced = unicode[ctype].gsub(str,pat,repl)
+	return check(label, ok, replaced)
+end
+-- testgsub(ctype, expected, str, pattern, replacement)
+testgsub("ascii","hello hello world world","hello world", "(%w+)", "%1 %1")
+testgsub("ascii","world hello Lua from",
+	"hello world from Lua", "(%w+)%s*(%w+)", "%2 %1")
+testgsub("ascii","l helö wöfr rldöL müä",
+	"hellö wörld fröm Lüä", "(%w+)%s*(%w+)", "%2 %1")
+testgsub("utf8","wörld hellö Lüä fröm",
+	"hellö wörld fröm Lüä", "(%w+)%s*(%w+)", "%2 %1")
+testgsub("utf8","HÜppÄ","HÄppÜ","([À-Ö])(%l*)(%u)","%3%2%1")
+
+
+-- Round-trip every code 0..65535 through utf8.char and utf8.byte and count
+-- the codes that do not come back unchanged; zero failures are expected.
+-- The counter is declared local so the script does not create a global.
+local fail = 0
+for i=0,65535 do if i ~= utf8.byte(utf8.char(i)) then fail=fail+1 end end
+check("code-decode failures", 0, fail)
+
+--[[ print the table
+for i=192,65535,64 do
+	local k = i/64
+	io.write(sprintf("%04x\\%3d\\%3d ",i, 224+k/64, 128+math.mod(k,64)))
+	for j=i,i+63 do
+		io.write(utf8.char(j))
+	end
+	io.write("\n")
+end
+]]
+