This repository has been archived by the owner on Nov 20, 2020. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[luadist-git] add slnunicode-1.1a-Windows-x86
- Loading branch information
0 parents
commit 8629877
Showing
3 changed files
with
239 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
type = "x86" | ||
arch = "Windows" | ||
author = "Unknown (paul)" | ||
depends = { | ||
[[lua ~> 5.1]], | ||
} | ||
|
||
desc = "A Unicode support library for Lua, developed for the Selene database project." | ||
version = "1.1a" | ||
maintainer = "Peter Drahoš" | ||
files = { | ||
Runtime = { | ||
[[lib\lua\unicode.dll]], | ||
} | ||
, | ||
Test = { | ||
[[share\slnunicode\test\unitest]], | ||
} | ||
, | ||
} | ||
|
||
url = "http://luaforge.net/projects/sln/" | ||
name = "slnunicode" | ||
license = "MIT/X11" |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,215 @@ | ||
#!/opt/lua-5.0.2/bin/lua | ||
-- there are four string-like ctype closures: | ||
-- unicode.ascii, latin1, utf8 and grapheme | ||
-- | ||
-- ascii and latin1 are single-byte like string, | ||
-- but use the unicode table for upper/lower and character classes | ||
-- ascii does not touch bytes > 127 on upper/lower | ||
-- | ||
-- ascii or latin1 can be used as locale-independent string replacement. | ||
-- (There is a compile switch to do this automatically for ascii). | ||
-- | ||
-- UTF-8 operates on UTF-8 sequences as of RFC 3629: | ||
-- 1 byte 0-7F, 2 byte 80-7FF, 3 byte 800-FFFF, 4 byte 1000-10FFFF | ||
-- (not exclusing UTF-16 surrogate characters) | ||
-- Any byte not part of such a sequence is treated as it's (Latin-1) value. | ||
-- | ||
-- Grapheme takes care of grapheme clusters, which are characters followed by | ||
-- "grapheme extension" characters (Mn+Me) like combining diacritical marks. | ||
-- | ||
-- calls are: | ||
-- len(str) | ||
-- sub(str, start [,end=-1]) | ||
-- byte(str, start [,end=-1]) | ||
-- lower(str) | ||
-- upper(str) | ||
-- char(i [,j...]) | ||
-- reverse(str) | ||
-- | ||
-- same as in string: rep, format, dump | ||
-- TODO: use char count with %s in format? (sub does the job) | ||
-- TODO: grapheme.byte: only first code of any cluster? | ||
-- | ||
-- find, gfind, gsub: done, but need thorough testing ...: | ||
-- ascii does not match them on any %class (but on ., literals and ranges) | ||
-- behaviour of %class with class not ASCII is undefined | ||
-- frontier %f currently disabled -- should we? | ||
-- | ||
-- character classes are: | ||
-- %a L* (Lu+Ll+Lt+Lm+Lo) | ||
-- %c Cc | ||
-- %d 0-9 | ||
-- %l Ll | ||
-- %n N* (Nd+Nl+No, new) | ||
-- %p P* (Pc+Pd+Ps+Pe+Pi+Pf+Po) | ||
-- %s Z* (Zs+Zl+Zp) plus the controls 9-13 (HT,LF,VT,FF,CR) | ||
-- %u Lu (also Lt ?) | ||
-- %w %a+%n+Pc (e.g. '_') | ||
-- %x 0-9A-Za-z | ||
-- %z the 0 byte | ||
-- c.f. http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values | ||
-- http://unicode.org/Public/UNIDATA/UnicodeData.txt | ||
-- | ||
-- NOTE: find positions are in bytes for all ctypes! | ||
-- use ascii.sub to cut found ranges! | ||
-- this is a) faster b) more reliable | ||
-- | ||
-- UTF-8 behaviour: match is by codes, code ranges are supported | ||
-- | ||
-- grapheme behaviour: any %class, '.' and range match includes | ||
-- any following grapheme extensions. | ||
-- Ranges apply to single code points only. | ||
-- If a [] enumeration contains a grapheme cluster, | ||
-- this matches only the exact same cluster. | ||
-- However, a literal single 'o' standalone or in an [] enumeration | ||
-- will match just that 'o', even if it has a extension in the string. | ||
-- Consequently, grapheme match positions are not always cluster positions. | ||
-- | ||
|
||
local unicode = require("unicode") | ||
local utf8 = unicode.utf8 | ||
unicode.string = string -- for tests unicode[ctype] | ||
local sprintf = string.format | ||
local function printf (fmt, ...) return print(sprintf(fmt, ...)) end | ||
|
||
local function check (test, ok, got) | ||
if ok == got then return printf("ok %s = %s",test,ok) end | ||
return printf("NOK %s = %s GOT '%s'",test, ok, got or "<nil>") | ||
end | ||
local function checka (test, ok, ...) | ||
local arg = {...} | ||
arg[1] = arg[1] or "" | ||
return check(test, ok, table.concat(arg, ",")) | ||
end | ||
|
||
|
||
local function testlen (str,bytes,codes,chars) | ||
codes = codes or bytes | ||
chars = chars or codes | ||
return check(sprintf("len '%s'", str), | ||
sprintf("%d/%d/%d", bytes, codes, chars), | ||
sprintf("%d/%d/%d", string.len(str), utf8.len(str), unicode.grapheme.len(str))) | ||
end | ||
|
||
-- 176 = 00B0;DEGREE SIGN -- UTF-8: C2,B0 = \194\176 | ||
-- 196 = 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS | ||
-- 214 = 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS | ||
-- 776 = 0308;COMBINING DIAERESIS -- UTF-8: CC,88 = \204\136 | ||
testlen("A\tB",3) -- plain Latin-1 | ||
testlen("\176\196\214",3) -- plain Latin-1 | ||
testlen("\196\176\214",3,2) -- C4,B0 is valid seq 0130 I WITH DOT ABOVE | ||
testlen("\192\178",2) -- C0,B2 is bad seq for 2 | ||
testlen("°ÄÖ",6,3) -- simple Latin-1 chars in UTF-8 | ||
testlen("\204\136A\204\136O\204\136",8,5,3) -- decomposed (with broken lead) | ||
|
||
|
||
local function testsub (ctype,ok,str,start,e) | ||
return check(sprintf("%s.sub('%s',%d,%d)", ctype, str, start, e), ok, | ||
unicode[ctype].sub(str,start,e)) | ||
end | ||
testsub("ascii","BCD","ABCDE",2,4) | ||
testsub("utf8","BCD","ABCDE",2,4) | ||
testsub("latin1","Ä","°ÄÖ",3,4) | ||
testsub("utf8","Ä","°ÄÖ",2,2) | ||
testsub("utf8","ÄÖ","°ÄÖ",2,-1) | ||
testsub("utf8","\204\136","A\204\136O\204\136",2,2) -- decomposed | ||
testsub("grapheme","O\204\136","A\204\136O\204\136",2,2) -- decomposed | ||
|
||
|
||
local function testbyte (ctype, ok, str, ...) | ||
return checka(sprintf("%s.byte('%s',%s)",ctype,str,table.concat({...}, ",")), | ||
ok, unicode[ctype].byte(str, ...)) | ||
end | ||
testbyte("string","194,176","Ä°Ö",3,4) -- the UTF-8 seq for ° | ||
testbyte("ascii","194,176","Ä°Ö",3,4) | ||
testbyte("utf8","176,214","Ä°Ö",2,3) -- code points for °,Ö | ||
testbyte("utf8","65,776","\204\136A\204\136O\204\136",2,3) -- decomposed | ||
testbyte("grapheme","65,776","\204\136A\204\136O\204\136",2) -- decomposed | ||
|
||
|
||
local function testchar (ctype, ok, ...) | ||
return check(sprintf("%s.char(%s)",ctype,table.concat({...}, ",")), | ||
ok, unicode[ctype].char(...)) | ||
end | ||
testchar("ascii", "AB", 65,66) | ||
testchar("ascii", "\176", 176) | ||
testchar("utf8", "\194\176", 176) | ||
|
||
|
||
local function testcase (ctype,str,up,lo) | ||
check(sprintf("%s.lower('%s')", ctype, str), lo, unicode[ctype].lower(str)) | ||
check(sprintf("%s.upper('%s')", ctype, str), up, unicode[ctype].upper(str)) | ||
end | ||
-- upper/lower also fixes plain Latin | ||
testcase("utf8","Ab\196üo\204\136","ABÄÜO\204\136","abäüo\204\136") | ||
testcase("ascii","Ab\196üo\204\136","AB\196üO\204\136","ab\196üo\204\136") | ||
testcase("latin1","Ab\196","AB\196","ab\228") | ||
|
||
|
||
local function testrev (ctype,ok,str) | ||
return check(sprintf("%s.reverse('%s')",ctype,str), | ||
ok, unicode[ctype].reverse(str)) | ||
end | ||
testrev("ascii","b\136\204oa\176\194ba","ab°ao\204\136b"); | ||
testrev("utf8","b\204\136oa°ba","ab°ao\204\136b"); | ||
testrev("grapheme","bo\204\136a°ba","ab°ao\204\136b"); | ||
|
||
|
||
|
||
local function testfind (ctype,ok,str,pat) | ||
return checka(sprintf("%s.find('%s','%s')",ctype,str,pat), | ||
ok, unicode[ctype].find(str, pat)) | ||
end | ||
testfind("ascii","1,1","e=mc2","%a") | ||
testfind("ascii","3,4","e=mc2","%a%a") | ||
testfind("ascii","5,5","e=mc2","%d") | ||
testfind("ascii","","Ä","%a") | ||
testfind("ascii","1,2","Ä","%A*") | ||
testfind("latin1","1,1","Ä","%a") | ||
testfind("utf8","1,2","Ä","%a") | ||
testfind("utf8","1,1","o\204\136","%a*") | ||
testfind("utf8","2,3","o\204\136","%A") | ||
testfind("utf8","1,1","o\204\136",".") | ||
testfind("grapheme","1,3","o\204\136","%a*") | ||
testfind("grapheme","2,3","o\204\136","%A") -- didn't expect this? | ||
testfind("grapheme","1,3","o\204\136",".") | ||
testfind("utf8","4,5","ÜHÄPPY","[À-Ö]") | ||
testfind("utf8","4,5","ÜHÄPPY","[Ä-]") | ||
testfind("utf8","7,7","ÜHÄP-PY","[ä-]") | ||
testfind("ascii","1,4","abcdef","%a*d") | ||
testfind("utf8","1,10","äöüßü","%a*ü") | ||
testfind("utf8","1,6","äöüß","%a*ü") | ||
testfind("utf8","4,5,Ä","ÜHÄPPY","([À-Ö])") | ||
testfind("utf8","1,5,ÜHÄ","ÜHÄ_PPY","([%w]+)") | ||
testfind("utf8","1,9,ÜHÄ_PPY","ÜHÄ_PPY","([%w_]+)") | ||
|
||
|
||
local function testgsub (ctype,ok,str,pat,repl) | ||
return check(sprintf("%s.gsub('%s','%s','%s')",ctype,str,pat,repl), | ||
ok, unicode[ctype].gsub(str,pat,repl)) | ||
end | ||
testgsub("ascii","hello hello world world","hello world", "(%w+)", "%1 %1") | ||
testgsub("ascii","world hello Lua from", | ||
"hello world from Lua", "(%w+)%s*(%w+)", "%2 %1") | ||
testgsub("ascii","l helö wöfr rldöL müä", | ||
"hellö wörld fröm Lüä", "(%w+)%s*(%w+)", "%2 %1") | ||
testgsub("utf8","wörld hellö Lüä fröm", | ||
"hellö wörld fröm Lüä", "(%w+)%s*(%w+)", "%2 %1") | ||
testgsub("utf8","HÜppÄ","HÄppÜ","([À-Ö])(%l*)(%u)","%3%2%1") | ||
|
||
|
||
fail = 0 | ||
for i=0,65535 do if i ~= utf8.byte(utf8.char(i)) then fail=fail+1 end end | ||
check("code-decode failures", 0, fail) | ||
|
||
--[[ print the table | ||
for i=192,65535,64 do | ||
local k = i/64 | ||
io.write(sprintf("%04x\\%3d\\%3d ",i, 224+k/64, 128+math.mod(k,64))) | ||
for j=i,i+63 do | ||
io.write(utf8.char(j)) | ||
end | ||
io.write("\n") | ||
end | ||
]] | ||
|