Merge pull request #64 from LettError/unicode11.0.0

Unicode11.0.0
LettError · Dec 4, 2018 · df18d8e · df18d8e
2 parents 7cdf5df + 10568ce
commit df18d8e
Show file tree

Hide file tree

Showing 46 changed files with 141,944 additions and 327,641 deletions.
diff --git a/.gitignore b/.gitignore
@@ -21,3 +21,5 @@
 
 *.sublime-workspace
 Lib/Glyph_Name_Formatter.egg-info
+ArabicShaping.txt
+CaseFolding.txt
diff --git a/Lib/glyphNameFormatter/__init__.py b/Lib/glyphNameFormatter/__init__.py
@@ -3,18 +3,18 @@
 
 import unicodedata
 
-from .data import unicodelist
-from .data.scriptConflictNames import scriptConflictNames
-from .data.preferredAGLNames import preferredAGLNames
-from .data.scriptPrefixes import scriptPrefixes, addScriptPrefix, SCRIPTSEPARATOR, SCRIPTASPREFIX
-from .data.mathUniNumbers import mathUniNumbers
+from glyphNameFormatter.data import unicodelist, mathUniNumbers
+from glyphNameFormatter.data.scriptConflictNames import scriptConflictNames
+from glyphNameFormatter.data.preferredAGLNames import preferredAGLNames
+from glyphNameFormatter.data.scriptPrefixes import scriptPrefixes, addScriptPrefix, SCRIPTSEPARATOR, SCRIPTASPREFIX
+#from .data.mathUniNumbers import mathUniNumbers
 
-from .unicodeRangeNames import getRangeName, getRangeProcessor, getRangeProcessorByRangeName
+from glyphNameFormatter.unicodeRangeNames import getRangeName, getRangeProcessor, getRangeProcessorByRangeName
 
-from .tools import unicodeToChar
+from glyphNameFormatter.tools import unicodeToChar
 
 
-__version__ = "0.28"
+__version__ = "0.3"
 
 
 def debug(uniNumber):

diff --git a/Lib/glyphNameFormatter/data/__init__.py b/Lib/glyphNameFormatter/data/__init__.py
@@ -10,6 +10,11 @@
     "unicodelist",
     "unicodeVersion",
     "unicodeRangeNames",
+    "unicodeCaseMap",
+    "upperToLower",
+    "lowerToUpper",
+    "mathUniNumbers",
+
 ]
 
 path = os.path.dirname(__file__)
@@ -76,6 +81,9 @@
 
 unicodelist = {}
 unicodeCategories = {}
+upperToLower = {}
+lowerToUpper = {}
+mathUniNumbers = []
 
 flatUnicodePath = os.path.join(path, "flatUnicode.txt")
 
@@ -94,10 +102,35 @@
         if not line:
             # empty line
             continue
-        uniNumber, uniName, uniCategory = line.split("\t")
+        # codepoint / tab / uppercase / tab / lowercase / tab / category / tab / math / tab / name
+        uniNumber, uniUppercase, uniLowercase, uniCategory, mathFlag, uniName, = line.split("\t")
         uniNumber = int(uniNumber, 16)
+        #print(uniNumber, uniUppercase, uniLowercase, uniCategory, uniName)
+        if uniUppercase != '':
+            try:
+                uniUppercase = int(uniUppercase, 16)
+            except ValueError:
+                uniUppercase = None
+        else:
+            uniUppercase = None
+        if uniLowercase != '':
+            try:
+                if uniLowercase:
+                    uniLowercase = int(uniLowercase, 16)
+                upperToLower[uniNumber] = uniUppercase
+            except ValueError:
+                uniLowercase = None
+        else:
+            uniLowercase = None
+
+        if uniUppercase == None and uniLowercase != None:
+            upperToLower[uniNumber] = uniLowercase
+        if uniUppercase != None and uniLowercase == None:
+            lowerToUpper[uniNumber] = uniUppercase
         unicodelist[uniNumber] = uniName
         unicodeCategories[uniNumber] = uniCategory
+        if mathFlag is not '':
+            mathUniNumbers.append(uniNumber)
 
     unicodeVersion = lines[0].replace("#", "").strip()
 
@@ -130,3 +163,4 @@
         end = int(end, 16)
 
         unicodeRangeNames[(start, end)] = rangeName
+
diff --git a/Lib/glyphNameFormatter/data/buildFlatUnicodeList.py b/Lib/glyphNameFormatter/data/buildFlatUnicodeList.py
@@ -11,8 +11,8 @@
     from urllib.request import urlopen
 
 __doc__ = """
-This will parse the large ucd xml from unicode.org 
-into a simple list that is workable and 
+This will parse the large ucd xml from unicode.org
+into a simple list that is workable and
 is fair enough for download and embedding.
 
 starts with the # unicode description/version
@@ -23,14 +23,14 @@
 
 URL = "http://www.unicode.org/Public/{version}/ucdxml/ucd.all.flat.zip"
 
-UNICODE_VERSION = "10.0.0"
+UNICODE_VERSION = "11.0.0"
 UCD_ZIP_FILE = "ucd.all.flat.zip"
 UCD_FILE = UCD_ZIP_FILE[:-3] + "xml"
 FLAT_FILE = "flatUnicode.txt"
 
 parser = ArgumentParser(description=__doc__,
                         formatter_class=RawDescriptionHelpFormatter)
-group = parser.add_mutually_exclusive_group(required=True)
+group = parser.add_mutually_exclusive_group(required=False)
 group.add_argument("-u", "--unicode-version",
                    help="Unicode version to use for download and processing")
 group.add_argument("-i", "--input", metavar="UCD_FILE",
@@ -44,11 +44,12 @@
 else:
     tempdir = tempfile.mkdtemp()
     filename = os.path.join(tempdir, UCD_ZIP_FILE)
-    print(">> Downloading {} to {}".format(UCD_ZIP_FILE, filename))
     if options.unicode_version:
         version = options.unicode_version
     else:
         version = UNICODE_VERSION
+    print(">> Downloading {} to {} (version {})".format(UCD_ZIP_FILE, filename, version))
+    print(URL.format(version=version))
     url = urlopen(URL.format(version=version))
     with open(filename, "wb") as fp:
         blocksize = 8192
@@ -73,7 +74,19 @@
     if i.tag.endswith("char"):
         n = i.attrib.get("na")
         if n:
-            flat.append("%s\t%s\t%s" % (i.attrib.get("cp"), n, i.attrib.get("gc")))
+            uc = i.attrib.get("uc")
+            if uc == "#":
+                uc = ''
+            lc = i.attrib.get("lc")
+            if lc == "#":
+                lc = ''
+            mth = i.attrib.get('Math')
+            if mth != "N":
+                mth = "M"
+            else:
+                mth = ""
+            # codepoint / tab / uppercase / tab / lowercase / tab / category / tab / math / tab / name
+            flat.append("%s\t%s\t%s\t%s\t%s\t%s" % (i.attrib.get("cp"), uc, lc, i.attrib.get("gc"), mth, n))
 
 
 f = open(FLAT_FILE, "w")

diff --git a/Lib/glyphNameFormatter/data/buildJoiningTypesList.py b/Lib/glyphNameFormatter/data/buildJoiningTypesList.py
@@ -1,5 +1,6 @@
 from __future__ import print_function
 import os
+import time
 import sys
 import tempfile
 from argparse import ArgumentParser, RawDescriptionHelpFormatter
@@ -9,37 +10,135 @@
 except ImportError:
     from urllib.request import urlopen
 
-parser = ArgumentParser(description=__doc__,
-                        formatter_class=RawDescriptionHelpFormatter)
-group = parser.add_mutually_exclusive_group(required=True)
-group.add_argument("-u", "--unicode-version",
-                   help="Unicode version to use for download and processing")
-options = parser.parse_args()
+UNICODE_VERSION = "11.0.0"
 
-if options.unicode_version:
-    version = options.unicode_version
-else:
-    version = UNICODE_VERSION
+#parser = ArgumentParser(description=__doc__,
+#                        formatter_class=RawDescriptionHelpFormatter)
+#group = parser.add_mutually_exclusive_group(required=True)
+#group.add_argument("-u", "--unicode-version",
+#                   help="Unicode version to use for download and processing")
+#options = parser.parse_args()
 
-JOININGTYPES_FILE = "joiningTypes.txt"
+#if options.unicode_version:
+#    version = options.unicode_version
+#else:
+version = UNICODE_VERSION
+
+JOININGTYPES_FILE = "ArabicShaping.txt"
 ASURL = "http://www.unicode.org/Public/{version}/ucd/ArabicShaping.txt"
 
 print(ASURL.format(version=version))
 
-tempdir = tempfile.mkdtemp()
-filename = os.path.join(tempdir, JOININGTYPES_FILE)
-print(">> Downloading {} to {}".format(os.path.basename(ASURL), filename))
-if options.unicode_version:
-    version = options.unicode_version
-else:
-    version = UNICODE_VERSION
+print(">> Downloading {} to {}".format(os.path.basename(ASURL), JOININGTYPES_FILE))
+#if options.unicode_version:
+#    version = options.unicode_version
+#else:
+version = UNICODE_VERSION
 url = urlopen(ASURL.format(version=version))
-with open(filename, "wb") as fp:
+with open(JOININGTYPES_FILE, "wb") as fp:
     blocksize = 8192
     while True:
         buffer = url.read(blocksize)
         if not buffer:
             break
         fp.write(buffer)
-fp.rewind()
-print(len(fp))
+# fp.rewind()
+# print(len(fp))
+
+
+# coding: utf-8
+
+
+def readArabicShaping(path):
+    # reads the ArabicShaping data at path and writes joiningTypes.txt (relative path, so into the current working directory)
+    names = {}
+    categories = dict()
+    explanation = """# Joining types
+# R Right_Joining
+# L Left_Joining
+# D Dual_Joining
+# C Join_Causing
+# U Non_Joining
+# T Transparent"""
+
+    f = open(path, 'r')
+    d = f.read()
+    f.close()
+    source = None
+    for l in d.split("\n"):
+        if not l: continue
+        if l.find("ArabicShaping-")!=-1:
+            source = l
+            continue
+        if l[0] == "#": continue
+        parts = l.split(";")
+        parts = [p.strip() for p in parts]
+        parts[0] = u"0x"+parts[0]
+        try:
+            uni = parts[0] = int(parts[0],0)
+        except ValueError:
+            print(parts)
+        categories[uni] = parts[2]
+    txt = []
+    txt.append(explanation)
+    txt.append("# Generated on %s" % time.strftime("%Y %m %d %H:%M:%S"))
+    txt.append("# Source: %s"%source)
+    txt.append("# <unicode> <joiningtype>")
+    k = list(categories.keys())
+    k.sort()
+    for uni in k:
+        txt.append("%05X\t%s"%(uni, categories.get(uni)))
+    path = "joiningTypes.txt"
+    f = open(path, 'w')
+    f.write('\n'.join(txt))
+    f.close()
+
+def suggestExtension(names, categories, uni):
+    # based on the category data, suggest which extensions are needed for a given unicode value
+    extensions = {
+
+    # XXXX no idea if it works this way...
+
+
+    #   D Dual_Joining
+    #       &----G----&
+    'D': ['isol', 'ini', 'medi', 'fina'],
+
+    #   C Join_Causing
+    #       &----G----&     ????
+    'C': ['isol', 'ini', 'medi', 'fina'],
+
+    #   R Right_Joining
+    #          x G----&
+    'R': ['isol',                'fina'],
+
+    #   L Left_Joining
+    #       &----G x
+    'L': ['isol', 'ini',         'fina'],
+
+    #   U Non_Joining
+    #          x G x
+    'U': [                             ],
+
+    #   T Transparent
+    #          x G x
+    'T': [                             ],
+
+    }
+    if not uni in categories.get('_used'):
+        return [], None
+    for k, v in categories.items():
+        if k == "_used": continue
+        if uni in v:
+            return extensions.get(k), names.get(uni)
+    return [], None
+
+
+if __name__ == "__main__":
+    from pprint import pprint
+    arabicRange = 1536, 125251
+    path = "ArabicShaping.txt"
+    readArabicShaping(path)
+
+    path = "joiningTypes.txt"
+