Skip to content

Commit

Permalink
Merge pull request #64 from LettError/unicode11.0.0
Browse files Browse the repository at this point in the history
Unicode11.0.0
  • Loading branch information
LettError committed Dec 4, 2018
2 parents 7cdf5df + 10568ce commit df18d8e
Show file tree
Hide file tree
Showing 46 changed files with 141,944 additions and 327,641 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@

*.sublime-workspace
Lib/Glyph_Name_Formatter.egg-info
ArabicShaping.txt
CaseFolding.txt
16 changes: 8 additions & 8 deletions Lib/glyphNameFormatter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@

import unicodedata

from .data import unicodelist
from .data.scriptConflictNames import scriptConflictNames
from .data.preferredAGLNames import preferredAGLNames
from .data.scriptPrefixes import scriptPrefixes, addScriptPrefix, SCRIPTSEPARATOR, SCRIPTASPREFIX
from .data.mathUniNumbers import mathUniNumbers
from glyphNameFormatter.data import unicodelist, mathUniNumbers
from glyphNameFormatter.data.scriptConflictNames import scriptConflictNames
from glyphNameFormatter.data.preferredAGLNames import preferredAGLNames
from glyphNameFormatter.data.scriptPrefixes import scriptPrefixes, addScriptPrefix, SCRIPTSEPARATOR, SCRIPTASPREFIX
#from .data.mathUniNumbers import mathUniNumbers

from .unicodeRangeNames import getRangeName, getRangeProcessor, getRangeProcessorByRangeName
from glyphNameFormatter.unicodeRangeNames import getRangeName, getRangeProcessor, getRangeProcessorByRangeName

from .tools import unicodeToChar
from glyphNameFormatter.tools import unicodeToChar


__version__ = "0.28"
__version__ = "0.3"


def debug(uniNumber):
Expand Down
36 changes: 35 additions & 1 deletion Lib/glyphNameFormatter/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@
"unicodelist",
"unicodeVersion",
"unicodeRangeNames",
"unicodeCaseMap",
"upperToLower",
"lowerToUpper",
"mathUniNumbers",

]

path = os.path.dirname(__file__)
Expand Down Expand Up @@ -76,6 +81,9 @@

unicodelist = {}
unicodeCategories = {}
upperToLower = {}
lowerToUpper = {}
mathUniNumbers = []

flatUnicodePath = os.path.join(path, "flatUnicode.txt")

Expand All @@ -94,10 +102,35 @@
if not line:
# empty line
continue
uniNumber, uniName, uniCategory = line.split("\t")
# codepoint / tab / uppercase / tab / lowercase / tab / category / tab / math / tab / name
uniNumber, uniUppercase, uniLowercase, uniCategory, mathFlag, uniName, = line.split("\t")
uniNumber = int(uniNumber, 16)
#print(uniNumber, uniUppercase, uniLowercase, uniCategory, uniName)
if uniUppercase != '':
try:
uniUppercase = int(uniUppercase, 16)
except ValueError:
uniUppercase = None
else:
uniUppercase = None
if uniLowercase != '':
try:
if uniLowercase:
uniLowercase = int(uniLowercase, 16)
upperToLower[uniNumber] = uniUppercase
except ValueError:
uniLowercase = None
else:
uniLowercase = None

if uniUppercase == None and uniLowercase != None:
upperToLower[uniNumber] = uniLowercase
if uniUppercase != None and uniLowercase == None:
lowerToUpper[uniNumber] = uniUppercase
unicodelist[uniNumber] = uniName
unicodeCategories[uniNumber] = uniCategory
if mathFlag is not '':
mathUniNumbers.append(uniNumber)

unicodeVersion = lines[0].replace("#", "").strip()

Expand Down Expand Up @@ -130,3 +163,4 @@
end = int(end, 16)

unicodeRangeNames[(start, end)] = rangeName

25 changes: 19 additions & 6 deletions Lib/glyphNameFormatter/data/buildFlatUnicodeList.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
from urllib.request import urlopen

__doc__ = """
This will parse the large ucd xml from unicode.org
into a simple list that is workable and
This will parse the large ucd xml from unicode.org
into a simple list that is workable and
is fair enough for download and embedding.
starts with the # unicode description/version
Expand All @@ -23,14 +23,14 @@

URL = "http://www.unicode.org/Public/{version}/ucdxml/ucd.all.flat.zip"

UNICODE_VERSION = "10.0.0"
UNICODE_VERSION = "11.0.0"
UCD_ZIP_FILE = "ucd.all.flat.zip"
UCD_FILE = UCD_ZIP_FILE[:-3] + "xml"
FLAT_FILE = "flatUnicode.txt"

parser = ArgumentParser(description=__doc__,
formatter_class=RawDescriptionHelpFormatter)
group = parser.add_mutually_exclusive_group(required=True)
group = parser.add_mutually_exclusive_group(required=False)
group.add_argument("-u", "--unicode-version",
help="Unicode version to use for download and processing")
group.add_argument("-i", "--input", metavar="UCD_FILE",
Expand All @@ -44,11 +44,12 @@
else:
tempdir = tempfile.mkdtemp()
filename = os.path.join(tempdir, UCD_ZIP_FILE)
print(">> Downloading {} to {}".format(UCD_ZIP_FILE, filename))
if options.unicode_version:
version = options.unicode_version
else:
version = UNICODE_VERSION
print(">> Downloading {} to {} (version {})".format(UCD_ZIP_FILE, filename, version))
print(URL.format(version=version))
url = urlopen(URL.format(version=version))
with open(filename, "wb") as fp:
blocksize = 8192
Expand All @@ -73,7 +74,19 @@
if i.tag.endswith("char"):
n = i.attrib.get("na")
if n:
flat.append("%s\t%s\t%s" % (i.attrib.get("cp"), n, i.attrib.get("gc")))
uc = i.attrib.get("uc")
if uc == "#":
uc = ''
lc = i.attrib.get("lc")
if lc == "#":
lc = ''
mth = i.attrib.get('Math')
if mth != "N":
mth = "M"
else:
mth = ""
# codepoint / tab / uppercase / tab / lowercase / tab / category / tab / math / tab / name
flat.append("%s\t%s\t%s\t%s\t%s\t%s" % (i.attrib.get("cp"), uc, lc, i.attrib.get("gc"), mth, n))


f = open(FLAT_FILE, "w")
Expand Down
141 changes: 120 additions & 21 deletions Lib/glyphNameFormatter/data/buildJoiningTypesList.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import print_function
import os
import time
import sys
import tempfile
from argparse import ArgumentParser, RawDescriptionHelpFormatter
Expand All @@ -9,37 +10,135 @@
except ImportError:
from urllib.request import urlopen

parser = ArgumentParser(description=__doc__,
formatter_class=RawDescriptionHelpFormatter)
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("-u", "--unicode-version",
help="Unicode version to use for download and processing")
options = parser.parse_args()
UNICODE_VERSION = "11.0.0"

if options.unicode_version:
version = options.unicode_version
else:
version = UNICODE_VERSION
#parser = ArgumentParser(description=__doc__,
# formatter_class=RawDescriptionHelpFormatter)
#group = parser.add_mutually_exclusive_group(required=True)
#group.add_argument("-u", "--unicode-version",
# help="Unicode version to use for download and processing")
#options = parser.parse_args()

JOININGTYPES_FILE = "joiningTypes.txt"
#if options.unicode_version:
# version = options.unicode_version
#else:
version = UNICODE_VERSION

JOININGTYPES_FILE = "ArabicShaping.txt"
ASURL = "http://www.unicode.org/Public/{version}/ucd/ArabicShaping.txt"

print(ASURL.format(version=version))

tempdir = tempfile.mkdtemp()
filename = os.path.join(tempdir, JOININGTYPES_FILE)
print(">> Downloading {} to {}".format(os.path.basename(ASURL), filename))
if options.unicode_version:
version = options.unicode_version
else:
version = UNICODE_VERSION
print(">> Downloading {} to {}".format(os.path.basename(ASURL), JOININGTYPES_FILE))
#if options.unicode_version:
# version = options.unicode_version
#else:
version = UNICODE_VERSION
url = urlopen(ASURL.format(version=version))
with open(filename, "wb") as fp:
with open(JOININGTYPES_FILE, "wb") as fp:
blocksize = 8192
while True:
buffer = url.read(blocksize)
if not buffer:
break
fp.write(buffer)
fp.rewind()
print(len(fp))
# fp.rewind()
# print(len(fp))


# coding: utf-8


def readArabicShaping(path):
    """Convert a Unicode ArabicShaping.txt file into joiningTypes.txt.

    The input is read line by line.  A line containing "ArabicShaping-" is
    remembered as the source identification; other lines starting with "#"
    and blank lines are ignored.  Every remaining line is split on ";": the
    first field is parsed as a hexadecimal code point and the third field is
    stored as its joining type.

    Lines with fewer than three fields, or whose first field is not a
    hexadecimal number, are printed and skipped, so one malformed record
    can neither abort the run nor overwrite the joining type of the
    previously parsed code point.

    The result is written to "joiningTypes.txt" in the current working
    directory: a "#" comment header followed by one tab-separated
    "<codepoint> <joiningtype>" line per code point, sorted by code point.

    path: location of the ArabicShaping.txt file to read.
    Returns None.
    """
    header = (
        "# Joining types",
        "# R Right_Joining",
        "# L Left_Joining",
        "# D Dual_Joining",
        "# C Join_Causing",
        "# U Non_Joining",
        "# T Transparent",
    )
    categories = {}
    source = None
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # The version line is itself a comment, so look for it before
            # discarding comment lines.
            if "ArabicShaping-" in line:
                source = line
                continue
            if line.startswith("#"):
                continue
            parts = [p.strip() for p in line.split(";")]
            if len(parts) < 3:
                # No joining type field available: report and skip.
                print(parts)
                continue
            try:
                uni = int(parts[0], 16)
            except ValueError:
                # Not a code point: report and skip instead of falling
                # through with an unbound or stale `uni`.
                print(parts)
                continue
            categories[uni] = parts[2]

    txt = list(header)
    txt.append("# Generated on %s" % time.strftime("%Y %m %d %H:%M:%S"))
    txt.append("# Source: %s" % source)
    txt.append("# <unicode> <joiningtype>")
    for uni in sorted(categories):
        txt.append("%05X\t%s" % (uni, categories[uni]))
    with open("joiningTypes.txt", 'w') as f:
        f.write('\n'.join(txt))

def suggestExtension(names, categories, uni):
    """Suggest the positional glyph-name extensions for a code point.

    names: mapping that is queried with `uni` to obtain a name.
    categories: mapping from a joining-type letter to a container of code
        points; the extra key "_used" must hold a container of every code
        point that appears in any of them.
    uni: the code point to look up.

    Returns a 2-tuple (extensions, name).  When `uni` is not in "_used", or
    is in no joining-type container, the result is ([], None).  Otherwise
    `extensions` is the list registered for the first joining type whose
    container holds `uni` (None if that type is unknown) and `name` is
    names.get(uni).
    """
    if uni not in categories.get('_used'):
        return [], None

    # NOTE: the original author marked this table as unverified
    # ("no idea if it works this way").
    formsByJoiningType = {
        # D Dual_Joining      &----G----&
        'D': ['isol', 'ini', 'medi', 'fina'],
        # C Join_Causing      &----G----& ????
        'C': ['isol', 'ini', 'medi', 'fina'],
        # R Right_Joining     x    G----&
        'R': ['isol', 'fina'],
        # L Left_Joining      &----G    x
        'L': ['isol', 'ini', 'fina'],
        # U Non_Joining       x    G    x
        'U': [],
        # T Transparent       x    G    x
        'T': [],
    }

    for joiningType, members in categories.items():
        # "_used" is bookkeeping, not a joining type.
        if joiningType != "_used" and uni in members:
            return formsByJoiningType.get(joiningType), names.get(uni)
    return [], None


if __name__ == "__main__":
from pprint import pprint
arabicRange = 1536, 125251
path = "ArabicShaping.txt"
readArabicShaping(path)

path = "joiningTypes.txt"

Loading

0 comments on commit df18d8e

Please sign in to comment.