In [1]:
import xmltodict
import os,sys
sys.path.append("../..")
sys.path.append("../../xsdviz")
import treepath.treepath as tp
from treepath.treepath import TreePath

from functools import reduce

import operator

from collections import Counter
from itertools import chain

import pandas as pd

import json

In [2]:
#with open("customer.xsd", "r") as xsdf:
filename = "pacs.008.001.09.xsd"
#filename = "2012_04_XMLSchema.xsd"
#filename = "test copy.xsd"

with open(filename, "r") as xsdf:
    customer_xsd = xsdf.read()
cust_d = xmltodict.parse(customer_xsd)

In [3]:
def access_dict(d,p):
    """Follow the key/index sequence `p` down into nested container `d`.

    Each element of `p` is applied in turn with item access
    (``current[step]``); an empty `p` returns `d` itself.
    """
    node = d
    for step in p:
        node = node[step]
    return node

def isnumeric(v):
    """Return True for int, float or complex values, but never for bool."""
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(v, bool):
        return False
    return isinstance(v, (int, float, complex))

In [4]:
cust_tp = tp.TreePath(cust_d)

In [5]:
#import xsdparse

In [6]:
list(cust_d.keys())[0]

'xs:schema'

In [7]:
self_namespace_prefix = list(cust_d.keys())[0].split(":")[0]

def prefix_set(prefix, pset):
    """Return a set holding ``prefix + ":" + str(member)`` for every member of `pset`."""
    qualified = set()
    for member in pset:
        qualified.add(prefix + ":" + str(member))
    return qualified


# Define "static" data
xsprims = {'duration','dateTime','time','date','gYearMonth','gYear',
                'gMonthDay','gDay','gMonth','string','boolean','base64Binary',
                'hexBinary','float','decimal','integer','nonPositiveInteger',
                'negativeInteger','int','short','byte','nonNegativeInteger',
                'unsignedLong','positiveInteger','unsignedInt','unsignedShort',
                'unsignedByte','double','anyURI','QName','NOTATION'}



xsnameables = {"attribute", "attributeGroup","complexType", "element", 
                         "group", "key", "keyref", "notation", "simpleType", 
                         "unique", 'primitiveType'}

xsrefs = {"attribute", "attributeGroup", "element", "group", }
xstype_defs = {"complexType", "simpleType", "primitiveType" }

xscompound_defs = {"choice", "all", "sequence"}

primitives_set = prefix_set(self_namespace_prefix, xsprims)
nameable_xsd_elements = prefix_set(self_namespace_prefix, xsnameables)
ref_set = prefix_set(self_namespace_prefix, xsrefs)
type_definition_set = prefix_set(self_namespace_prefix, xstype_defs)

In [8]:
# Process XSD into ERD task steps
#1 Load XSD into malleable format

#2 Extract named objects

#3 Extract objects requiring references

#4 Loop over 3, updating object from #1 such that all named objects
#  are complete without any remaining reference updates. 

#5 Extract objects that define types, extensions and restrictions

#6 Reduce singular extensions and restrictions down to their singular/primitive types

#7 Identify objects that describe sequences and other multi-part objects 
#  and associate them with a named parent-type/element

#8 Create tree of joined sequences so as to result in a workable ERD with entities and joins
#  This tree would consist of nodes consisting of multi-part sequences, and elemental
#  types that themselves are pointers to other single-or-multi part sequences.



In [9]:
# Given a tree obj, and some child-tag used to identify nodes within that tree
# build an index used to link paths to their assigned names. 
# Since there's no guarantee of name-uniqueness, each name can refer to 
# one or more paths.

def build_path_index(obj, index_tag="@name"):
    """Index the parent paths of every leaf whose final path step is `index_tag`.

    Returns a dict mapping the value stored at such a leaf to a list of the
    paths leading to it, each with the trailing `index_tag` step removed.
    Several paths may share one value, so every entry is a list.
    """
    index_d = {}
    # presumably _iterLeaves yields one key/index path per leaf value -- TODO confirm
    for leaf_path in TreePath._iterLeaves(obj):
        if leaf_path[-1] != index_tag:
            continue
        tag_value = access_dict(obj, leaf_path)
        index_d.setdefault(tag_value, []).append(leaf_path[:-1])
    return index_d


# Querying the pre-built path_index (see build_path_index) on some key value, 
# choose the path that most closely matches a source_path provided. 
# In practice, this means returning the name most likely to be in scope, 
# when there's some ambiguity in terms of which name to pick.
def return_closest_path_match(path_index_d, key, source_path):
    """Pick the candidate indexed under `key` that shares the most with `source_path`.

    Parameters:
        path_index_d: mapping from a key to a list of candidate paths
            (see build_path_index).
        key: the key to look up.
        source_path: the path the lookup is made from.

    Returns:
        The candidate whose common path with `source_path`, as reported by
        TreePath._shortest_common_path, is longest. The first such candidate
        wins a tie.

    Raises:
        KeyError: if `key` is not in the index or has no candidates.
    """
    cand_paths = path_index_d.get(key)
    if not cand_paths:
        # Previously this surfaced as an opaque TypeError/IndexError.
        raise KeyError(f"no indexed path found for key {key!r}")

    best_candidate = None
    best_score = None
    for candidate in cand_paths:
        shared = TreePath._shortest_common_path([candidate, source_path])
        score = (len(shared), len(source_path) - len(shared))
        # Strict comparison keeps the first candidate among equal scores.
        if best_score is None or score > best_score:
            best_candidate, best_score = candidate, score
    return best_candidate
        

In [10]:
# 1) Get the things
#named_things = [p[:-1] for p in TreePath._iterPaths(cust_d) if p[-1]=="@name"]
def gen_labelled_path_dict(info_d, index_tag="@name"):
    """Label every indexed path with the nearest enclosing nameable XSD element.

    Returns a dict mapping each indexed value (see build_path_index) to a list
    of ``(thing_type, path)`` tuples. `thing_type` is the last step of `path`
    that is a member of `nameable_xsd_elements`. Paths containing no such step
    are left out, so a name may map to an empty list.
    """
    path_index = build_path_index(info_d, index_tag=index_tag)
    things = {}
    for name, paths in path_index.items():
        labelled = []
        for path in paths:
            # Search from the deepest step upward for the first nameable element.
            thing_type = next(
                (step for step in reversed(path) if step in nameable_xsd_elements),
                None,
            )
            if thing_type is not None:
                labelled.append((thing_type, path))
        things[name] = labelled
    return things

prims = [{"@name":t} for t in primitives_set]
cust_d[self_namespace_prefix+':schema'][self_namespace_prefix+':primitiveType']=prims

named_things=gen_labelled_path_dict(cust_d, index_tag="@name")
things_requiring_reference = build_path_index(cust_d, index_tag="@ref")

#for k in things_requiring_reference:
#    print (k, named_things[k], things_requiring_reference[k])

#referred_things = [p[:-1] for p in TreePath._iterPaths(cust_d) if p[-1]=="@ref"]
#len(named_things),len(referred_things)

In [11]:
# Split the labelled named things into type definitions and everything else.
# Each record is (name, thing_type, path-as-tuple) so that it is hashable.
all_named_types = set()
named_other_things = set()

for name, labelled_paths in named_things.items():
    for thing_type, thing_path in labelled_paths:
        record = (name, thing_type, tuple(thing_path))
        if thing_type in type_definition_set:
            all_named_types.add(record)
        else:
            named_other_things.add(record)

all_named_types # these will become terminal nodes for type-matching.

{('AccountIdentification4Choice',
  'xs:complexType',
  ('xs:schema', 'xs:complexType', 0)),
 ('AccountSchemeName1Choice',
  'xs:complexType',
  ('xs:schema', 'xs:complexType', 1)),
 ('ActiveCurrencyAndAmount',
  'xs:complexType',
  ('xs:schema', 'xs:complexType', 2)),
 ('ActiveCurrencyAndAmount_SimpleType',
  'xs:simpleType',
  ('xs:schema', 'xs:simpleType', 0)),
 ('ActiveCurrencyCode', 'xs:simpleType', ('xs:schema', 'xs:simpleType', 1)),
 ('ActiveOrHistoricCurrencyAndAmount',
  'xs:complexType',
  ('xs:schema', 'xs:complexType', 3)),
 ('ActiveOrHistoricCurrencyAndAmount_SimpleType',
  'xs:simpleType',
  ('xs:schema', 'xs:simpleType', 2)),
 ('ActiveOrHistoricCurrencyCode',
  'xs:simpleType',
  ('xs:schema', 'xs:simpleType', 3)),
 ('AddressType2Code', 'xs:simpleType', ('xs:schema', 'xs:simpleType', 4)),
 ('AddressType3Choice', 'xs:complexType', ('xs:schema', 'xs:complexType', 4)),
 ('AnyBICDec2014Identifier',
  'xs:simpleType',
  ('xs:schema', 'xs:simpleType', 5)),
 ('BICFIDec2014Ident

In [12]:
#cust_tp.data['xs:schema']['xs:element'][1]['xs:complexType'][ 'xs:sequence']['xs:element'][0]

In [13]:
cust_tp.data[self_namespace_prefix+':schema'][self_namespace_prefix+':primitiveType']=prims

In [14]:
cust_tp.data['xs:schema']['xs:element']

{'@name': 'Document', '@type': 'Document'}

In [15]:

named_things=gen_labelled_path_dict(cust_tp.data, index_tag="@name")
things_requiring_reference = build_path_index(cust_tp.data, index_tag="@ref")


all_named_types = set()
named_other_things = set()

for k,v in named_things.items():
    for vv in v:
        if vv[0] in type_definition_set:
            
            all_named_types.add((k, vv[0], tuple(vv[1])))
        else:
            named_other_things.add((k, vv[0], tuple(vv[1])))

            
Counter([t[1] for t in named_other_things]) # these will become terminal nodes for type-matching.

Counter({'xs:element': 368, 'xs:attribute': 2})

In [16]:
Counter([t[1] for t in all_named_types])

Counter({'xs:complexType': 94, 'xs:primitiveType': 31, 'xs:simpleType': 69})

In [17]:
def return_longest_matching_ancestor(q, paths):
    """Return the paths that are the closest ancestors of query path `q`.

    Each path is scored as ``(length of its common prefix with q) - len(path)``,
    so a path fully contained in `q` scores 0 and every unmatched trailing
    step costs one point. All paths sharing the highest score are returned in
    their original order.

    Parameters:
        q: the query path (sequence of steps).
        paths: sequence of candidate paths.

    Returns:
        A list of the best-scoring candidates; empty when `paths` is empty.
    """
    if not paths:
        return []

    scores = []
    for path in paths:
        # Start at 0 so that an empty path or an empty query is scored
        # instead of reading an unbound or stale counter.
        shared = 0
        for q_step, path_step in zip(q, path):
            if q_step != path_step:
                break
            shared += 1
        scores.append(shared - len(path))

    best = max(scores)
    return [path for path, score in zip(paths, scores) if score == best]
            

In [18]:
named_nodes = list(chain([(k, p[0], p[1], access_dict(cust_tp.data,p[1])) for k,t in named_things.items() for p in t]))

In [19]:
named_objects_df = pd.DataFrame(named_nodes, columns=["Name", "Type", "Path", "Obj"])

named_objects_df.groupby("Type").agg(len)

Unnamed: 0_level_0,Name,Path,Obj
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
xs:attribute,2,2,2
xs:complexType,94,94,94
xs:element,368,368,368
xs:primitiveType,31,31,31
xs:simpleType,69,69,69


In [20]:
q_df = named_objects_df.set_index(["Name", "Type"]).sort_index()
q_df['_resolved']=False



In [21]:
q_df#loc[("TCRMExtension", "xs:element")]

Unnamed: 0_level_0,Unnamed: 1_level_0,Path,Obj,_resolved
Name,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AccountIdentification4Choice,xs:complexType,"[xs:schema, xs:complexType, 0]","{'@name': 'AccountIdentification4Choice', 'xs:...",False
AccountSchemeName1Choice,xs:complexType,"[xs:schema, xs:complexType, 1]","{'@name': 'AccountSchemeName1Choice', 'xs:choi...",False
AccptncDtTm,xs:element,"[xs:schema, xs:complexType, 16, xs:sequence, x...","{'@maxOccurs': '1', '@minOccurs': '0', '@name'...",False
ActiveCurrencyAndAmount,xs:complexType,"[xs:schema, xs:complexType, 2]","{'@name': 'ActiveCurrencyAndAmount', 'xs:simpl...",False
ActiveCurrencyAndAmount_SimpleType,xs:simpleType,"[xs:schema, xs:simpleType, 0]",{'@name': 'ActiveCurrencyAndAmount_SimpleType'...,False
...,...,...,...,...
xs:time,xs:primitiveType,"[xs:schema, xs:primitiveType, 15]",{'@name': 'xs:time'},False
xs:unsignedByte,xs:primitiveType,"[xs:schema, xs:primitiveType, 28]",{'@name': 'xs:unsignedByte'},False
xs:unsignedInt,xs:primitiveType,"[xs:schema, xs:primitiveType, 5]",{'@name': 'xs:unsignedInt'},False
xs:unsignedLong,xs:primitiveType,"[xs:schema, xs:primitiveType, 9]",{'@name': 'xs:unsignedLong'},False


In [22]:
leaf_gen = cust_tp._iterLeaves(cust_tp.data)
named_leaves=set()
for p in leaf_gen:
    if p[-1]=="@name":
        named_leaves.add(tuple(p[:-1]))
len(named_leaves)

564

In [23]:
# A named thing exists at the end of a specific path. Some named things can be resolved as individual items 
# without referencing something else, other things require de-referencing before they can be worked on.
# Figuring out what does and does not need de-referencing can be done using the following algorithm:
# 1) Is it a primitive type?
# 2) Is it a simpleType
# 3) Is it an element consisting of simpleContent
### ... add more here ...
## https://learn.microsoft.com/en-us/previous-versions/dotnet/netframework-4.0/ms256067(v=vs.100)
    
    
def simpleContent_proc(ix_dummy, vals, resolver):
    """Placeholder for xs:simpleContent handling; not implemented.

    Does nothing and returns None.
    """
    pass

def complexContent_proc(ix_dummy, vals, resolver):
    """Placeholder for xs:complexContent handling; not implemented.

    Does nothing and returns None.
    """
    pass


def complexType_proc(ix, vals, resolver):
    """Stub handler for xs:complexType rows: always returns ``(False, 0)``.

    The first element of the result is the "is resolvable" flag, so rows
    handled by this function are never added to the resolver.
    """
    # Child constructs a full implementation would have to cover:
    #simpleContent, complexContent, group, all, choice, sequence, attribute, attributeGroup, anyAttribute
    
    return False,0

# This is an anonymous type, so it should always be called in the context of some parent that will
# harvest and process the returned content.
# The vals parameter should contain a Series whose 'Obj' entry holds the specific restriction
# to process.
def _resolve_base(base_name, possible_base_types, resolver):
    """Look up `base_name` among the already-resolved (name, type) keys.

    Returns ``(True, pointer)`` where `pointer` lists every
    ``(base_name, base_type)`` key present in `resolver`, checked in the order
    of `possible_base_types`; returns ``(False, None)`` when none is present.
    """
    pointer = [(base_name, bt) for bt in possible_base_types if (base_name, bt) in resolver]
    if pointer:
        return True, pointer
    return False, None


def restriction_proc(ix, vals, resolver):
    """Resolve the ``@base`` of an xs:restriction.

    Parameters:
        ix: index key of the row being processed (unused here).
        vals: mapping/Series whose 'Obj' entry is the restriction dict.
        resolver: dict keyed by (name, type) of already-resolved objects.

    Returns:
        ``(is_resolvable, pointer)`` -- see _resolve_base. A restriction may
        only build on a primitive or simple type.

    Raises:
        KeyError: if the restriction has no '@base' attribute.
    """
    obj = vals['Obj']
    return _resolve_base(obj['@base'], ("xs:primitiveType", "xs:simpleType"), resolver)


def extension_proc(ix, vals, resolver):
    """Resolve the ``@base`` of an xs:extension.

    Same contract as restriction_proc, except that the base may also be a
    complex type. The extension fetches its base content from elsewhere; it
    likely appears inside a complexType but could sit under simpleContent too.
    """
    obj = vals['Obj']
    return _resolve_base(
        obj['@base'],
        ("xs:primitiveType", "xs:simpleType", "xs:complexType"),
        resolver,
    )


def simpleType_proc(ix, vals, resolver):
    """Resolve a simpleType row through its xs:restriction.

    A simpleType should consist only of references to primitives, or
    restrictions of them.

    Parameters:
        ix: index key of the row being processed.
        vals: mapping/Series with at least 'Obj' (the simpleType dict) and
            'Path' (the path of that dict in the schema tree).
        resolver: dict keyed by (name, type) of already-resolved objects.

    Returns:
        ``(is_resolvable, pointer)`` as produced by restriction_proc, or
        ``(False, None)`` for a simpleType that (illegally) holds an extension.

    Raises:
        AssertionError: for xs:union, xs:list or any unrecognised content.
    """
    obj = vals['Obj']
    is_resolvable = False
    pointer = None
    if 'xs:restriction' in obj:
        # Build a separate row for the nested restriction. The path is copied
        # before the extra step is added: list.extend() on the shared list
        # would corrupt the caller's path in place and return None.
        restriction_vals = dict(vals.items())
        restriction_vals['Obj'] = obj['xs:restriction']
        restriction_vals['Path'] = list(vals['Path']) + ['xs:restriction']
        is_resolvable, pointer = restriction_proc(ix, pd.Series(restriction_vals), resolver)
    elif 'xs:extension' in obj:
        # Reported but not fatal: processing continues with (False, None).
        print(obj)
        print("Simple Type shouldn't contain an extension, quitting.")
    elif 'xs:union' in obj:
        print(obj)
        print("Union element not yet implemented")
        raise AssertionError("Union element not yet implemented")
    elif 'xs:list' in obj:
        print(obj)
        print("List element not yet implemented")
        raise AssertionError("List element not yet implemented")
    else:
        print(obj.keys())
        raise AssertionError("simpleType has no recognised content")
    return is_resolvable, pointer

def element_proc(ix, vals,resolver):
    """Stub handler for xs:element rows: always reports "not resolvable"."""
    unresolved = (False, 0)
    return unresolved

def attribute_proc(ix, vals, resolver):
    """Resolve an xs:attribute row against the resolved primitive types.

    Prints the attribute object, then checks whether its '@type' (None when
    absent) is registered in `resolver` as an "xs:primitiveType".

    Returns:
        ``(True, [(type_name, "xs:primitiveType")])`` when it is,
        ``(False, None)`` otherwise.
    """
    obj = vals['Obj']
    print(obj)
    type_name = obj['@type'] if "@type" in obj.keys() else None
    matches = [
        (type_name, datatype)
        for datatype in ("xs:primitiveType",)
        if (type_name, datatype) in resolver.keys()
    ]
    if not matches:
        return False, None
    return True, matches

def primitiveType_proc(ix, vals,resolver):
    """Primitive types need no lookup: always ``(True, "primitive")``."""
    return True, "primitive"
    
def unknown_proc(ix, vals,resolver):
    """Fallback handler: print the offending row and abort with an AssertionError."""
    print ("Unknown!", ix, vals)
    assert False


type_function_lookup =  {"xs:complexType" : complexType_proc, 
                         "xs:simpleType" : simpleType_proc, 
                         "xs:element" : element_proc, 
                         "xs:attribute" : attribute_proc, 
                         "xs:primitiveType" : primitiveType_proc, 
                     }


# Repeatedly run every unresolved row of q_df through the handler for its type.
# A row becomes resolvable once the objects it points at are already in
# `resolver`, so later passes can resolve rows that depend on earlier ones.
# The loop stops when every row is resolved, when a full pass changes nothing
# (further passes would repeat the same work), or after the pass limit.
max_resolution_passes = 6  # upper bound on passes, matching the previous hard limit

resolver = {}
finished = False
ii = 0
while not finished:
    ii += 1
    print("Iteration ", ii)
    resolved_before = dict(resolver)
    for row_key, row in q_df.iterrows():
        if row['_resolved']:
            continue
        p_func = type_function_lookup.get(row_key[1], unknown_proc)
        # Separate names for the handler result: reusing the row variable for
        # the flag made the check below depend on stale state.
        is_resolvable, pointer = p_func(row_key, row, resolver)
        if is_resolvable is True:
            resolver[row_key] = is_resolvable, pointer
    resolved_count = len([v for v in resolver.values() if v[0]])
    print(resolved_count)
    if (resolved_count == len(q_df)
            or resolver == resolved_before
            or ii >= max_resolution_passes):
        finished = True
resolver_df = pd.DataFrame(resolver).T

Iteration  1
('AccountIdentification4Choice', 'xs:complexType')
('AccountSchemeName1Choice', 'xs:complexType')
('AccptncDtTm', 'xs:element')
('ActiveCurrencyAndAmount', 'xs:complexType')
('ActiveCurrencyAndAmount_SimpleType', 'xs:simpleType')
('ActiveCurrencyCode', 'xs:simpleType')
('ActiveOrHistoricCurrencyAndAmount', 'xs:complexType')
('ActiveOrHistoricCurrencyAndAmount_SimpleType', 'xs:simpleType')
('ActiveOrHistoricCurrencyCode', 'xs:simpleType')
('AddressType2Code', 'xs:simpleType')
('AddressType3Choice', 'xs:complexType')
('AddtlInf', 'xs:element')
('AddtlInf', 'xs:element')
('AddtlRmtInf', 'xs:element')
('AdjstmntAmtAndRsn', 'xs:element')
('AdjstmntAmtAndRsn', 'xs:element')
('AdmstnZone', 'xs:element')
('AdmstnZone', 'xs:element')
('Adr', 'xs:element')
('AdrLine', 'xs:element')
('AdrTp', 'xs:element')
('Agt', 'xs:element')
('Amt', 'xs:element')
('Amt', 'xs:element')
('Amt', 'xs:element')
('Amt', 'xs:element')
('Amt', 'xs:element')
('Amt', 'xs:element')
('Amt', 'xs:element')
('An

In [24]:
resolver_df[85:]

Unnamed: 0,Unnamed: 1,0,1
Max4Text,xs:simpleType,True,"[(xs:string, xs:primitiveType)]"
Max70Text,xs:simpleType,True,"[(xs:string, xs:primitiveType)]"
NamePrefix2Code,xs:simpleType,True,"[(xs:string, xs:primitiveType)]"
Number,xs:simpleType,True,"[(xs:decimal, xs:primitiveType)]"
PercentageRate,xs:simpleType,True,"[(xs:decimal, xs:primitiveType)]"
PhoneNumber,xs:simpleType,True,"[(xs:string, xs:primitiveType)]"
PreferredContactMethod1Code,xs:simpleType,True,"[(xs:string, xs:primitiveType)]"
Priority2Code,xs:simpleType,True,"[(xs:string, xs:primitiveType)]"
Priority3Code,xs:simpleType,True,"[(xs:string, xs:primitiveType)]"
RegulatoryReportingType1Code,xs:simpleType,True,"[(xs:string, xs:primitiveType)]"


In [25]:
assert False

AssertionError: 

In [None]:
from datetime import datetime, timedelta

datetime.now()-timedelta(days=41)

In [None]:
import uuid
str(uuid.uuid1())

In [None]:
pd.DataFrame(named_other_things).groupby(1).agg(len)

In [None]:
pd.DataFrame(named_other_things)[2][0]

In [None]:
assert False

In [None]:
pd.DataFrame(all_named_types)

In [None]:
# Identify both named and anonymous type definitions.
# Those definitions that contain multi-part objects (sequences, choices, alls)
# can be considered object definitions.
# How they nest will describe how those objects (entities) are linked to one another.
#
# type_objs maps a label to (path, node):
#   * "<parent name>__local" for a type nested directly under a named parent,
#   * the type's own '@name' for an entry of a list of type definitions,
#   * a generated uuid1 string (suffix "a" or "b") when no name is available.
# NOTE(review): labels are plain dict keys, so two parents sharing a name
# overwrite each other's "__local" entry -- confirm this is acceptable.
type_objs={}
# presumably _iterPaths yields the path of every node, not only leaves -- TODO confirm
for p in TreePath._iterPaths(cust_tp.data):
    
    if p[-1] in type_definition_set:
        # Path ends on a type-definition tag: the type hangs directly off its parent.
        if "@name" in access_dict(cust_tp.data,p[:-1]).keys():
            # Some parent element or thing is referencing this local type
            # for the type's name, use the name of the referring object + _local

            type_objs[access_dict(cust_tp.data,p[:-1]+["@name"])+f"__local"]=(p,access_dict(cust_tp.data,p))
        else:

            # Unnamed parent: fall back to a generated label (suffix "b").
            tname=str(uuid.uuid1())+"b"
            type_objs[tname]=(p,access_dict(cust_tp.data,p))
    elif isnumeric(p[-1]) and p[-2] in type_definition_set:
            # Path ends on a numeric index under a type-definition tag: one entry of a list of types.
            if "@name" in access_dict(cust_tp.data,p).keys():

                type_objs[access_dict(cust_tp.data,p+["@name"])]=(p,access_dict(cust_tp.data,p))
            else:

                # Anonymous list entry: generated label (suffix "a").
                tname=str(uuid.uuid1())+"a"
                type_objs[tname]=(p,access_dict(cust_tp.data,p))
            # Not read again within this cell.
            thing_type = p[-1]


In [None]:
type_objs.keys()

In [None]:
for k,v in type_objs.items():
    print(k)
    print("\t", v[0])
    print( len(v[1]))
    if len(v[1])<=2:
        print (json.dumps(v[1], indent=4))
    print()
    print()
    print()

In [None]:
access_dict(cust_tp.data,['xs:schema', 'xs:element', 1])

In [None]:
return_longest_matching_ancestor(['xs:schema', 'xs:element', 1, 'xs:complexType'],  named_nodes)

In [None]:
assert False

In [None]:


    referred_things=gen_labelled_path_dict(cust_tp.data, index_tag="@ref")

    for n, paths in referred_things.items():
        for p in paths:
            #print(p)
            name = n
            obj=access_dict(cust_tp.data, p[1])
            org=access_dict(cust_d, p[1])
            print(n, p[0], p[1], obj, org)
        

In [None]:
cust_tp.data

In [None]:
referred_things

In [None]:
assert False

In [None]:
access_dict(cust_d, named_things.get('test_by_reference')[0][1][:-2])

In [None]:
cust_d


In [None]:
access_dict(cust_d, named_things.get( 'Document')[0])

named_things.get( 'Document')

In [None]:
named_things.get('root')[0]
#access_dict(cust_d, named_things.get( 'root')[0][:-1])

In [None]:
build_path_index(cust_d, index_tag="@ref")

In [None]:


for k,v_list in things_requiring_reference.items():
    print(k,v_list)
    for e,v in enumerate(v_list):
        print(e)
        ref_path = return_closest_path_match(named_things,k,v)
        #print(v, ref_path)
        cust_tp.set(v, access_dict(cust_d, ref_path))
    print()


In [None]:
cust_tp.data

In [None]:

t_i = build_path_index(cust_d,"@name")

{k:[access_dict(cust_d, q) for q in v] for k,v in t_i.items()}

test_path = ['xs:schema', 'xs:complexType', 1,'xs:element', 1, 1, 1, 2, 'xs:complexType', 'xs:complexContent']

return_closest_path_match(t_i, "element5", test_path)

In [None]:
TreePath._shortest_common_path([t_i.get("element5")[0], test_path])

In [None]:
named_things

In [None]:
t_i['element5']

In [None]:
# 2) Classify the things:
# For every named node record its declared type, the base it restricts or
# extends (directly or via simpleContent/complexContent), whether it holds a
# collection construct, and the kind of XSD expression it is declared as.
# named_things maps name -> [(thing_type, path), ...] (see
# gen_labelled_path_dict), so the paths have to be unpacked from its values.
_base_tags = ('xs:restriction', 'xs:extension')
_content_tags = ('xs:simpleContent', 'xs:complexContent')
_collection_tags = ('xs:all', 'xs:group', 'xs:choice', 'xs:sequence')

c_dict={}
df_records=[]
for labelled_paths in named_things.values():
    for _, path in labelled_paths:
        node = access_dict(cust_d, path)
        thing_name = node["@name"]
        # Does this thing have a type?
        thing_type = node.get("@type")

        # Does this thing reference a type via an extension or restriction "base"?
        # Reset per node so no value leaks in from the previous iteration.
        content_type = None
        base_context = None
        thing_base = None
        direct_bases = [n for n in node.keys() if n in _base_tags]
        content_nodes = [n for n in node.keys() if n in _content_tags]
        if direct_bases:
            base_context = direct_bases[0]
            thing_base = node[base_context]['@base']
        elif content_nodes:
            content_type = content_nodes[0]
            nested_bases = [n for n in node[content_type].keys() if n in _base_tags]
            if nested_bases:
                base_context = nested_bases[0]
                thing_base = node[content_type][base_context]['@base']

        # Does this thing contain one of the container types that might point to multiple elements?
        thing_contains_collection = any(n in _collection_tags for n in node.keys())

        # What is this thing's parent object class?
        # A numeric last step is a list index; the tag is the step before it.
        thing_expression_type = path[-2] if isnumeric(path[-1]) else path[-1]

        # Build a counter of parent expression types
        c_dict[thing_expression_type] = c_dict.get(thing_expression_type, 0) + 1

        df_records.append((thing_name, thing_type, thing_expression_type, thing_base, base_context, thing_contains_collection, content_type))

xsd_df = pd.DataFrame(df_records, columns=["Name", "Type", "expression_type", "base", "base_context", "has_collection", "content_type"])
xsd_df

In [None]:
xsd_df[xsd_df[["expression_type", "content_type"]].isnull().apply(lambda x : not any([y for y in x]), axis=1)]

In [None]:
c_dict

In [None]:
# 2) Classify the things:
# Group the paths of all named nodes by the XSD expression they are declared
# as. named_things maps name -> [(thing_type, path), ...] (see
# gen_labelled_path_dict), so the paths are unpacked from its values.
typed_collection = {}
for labelled_paths in named_things.values():
    for _, path in labelled_paths:
        thing_name = access_dict(cust_d, list(path) + ["@name"])
        # A numeric last step is a list index; the tag is the step before it.
        thing_type = path[-2] if isnumeric(path[-1]) else path[-1]
        typed_collection.setdefault(thing_type, []).append(path)
        print (path,  thing_name, thing_type)

In [None]:
path=['xs:schema', 'xs:complexType', 2, 'xs:complexContent', 'xs:extension', 'xs:attribute']
access_dict(cust_d, path)

In [None]:
[access_dict(cust_d, path) for path in chain(*typed_collection.values())]

In [None]:
# Of the named things extracted, what are their parent types?
{k:len(v) for k,v in typed_collection.items()}

In [None]:
# Of the things extracted, which are terminal items?
# A node is terminal when its '@type', or the '@base' of its restriction, is
# already a known terminal; the set starts out as the primitive types.
terminals = set(primitives_set)
log = []
for expression_type, member_paths in typed_collection.items():
    print(expression_type)
    for member_path in member_paths:
        node = access_dict(cust_d, member_path)
        if "@type" in node:
            if node['@type'] not in terminals:
                print(expression_type, "no terminal")
            else:
                log.append(expression_type)
                terminals.add(node["@name"])
        if "xs:restriction" in node:
            if node["xs:restriction"]["@base"] not in terminals:
                print(expression_type, "no terminal")
            else:
                terminals.add(node["@name"])
                log.append(expression_type)

log

In [None]:
# For every named complexType holding an xs:sequence, print its name and the
# name/type of each element directly inside that sequence.
for k,v in typed_collection.items():
    if k != "xs:complexType":
        continue
    for path in v:
        node = access_dict(cust_d, path)
        if "xs:sequence" in node.keys():
            print(node["@name"])
            sequence = node["xs:sequence"]
            # An empty <xs:sequence/> parses to None; only a dict can hold elements.
            if isinstance(sequence, dict) and 'xs:element' in sequence:
                elements = sequence['xs:element']
                print (len(elements))
                print (type(elements))
                # xmltodict yields a dict for one child and a list for several;
                # normalise so only xs:element entries are walked, not every
                # key of the sequence.
                if not isinstance(elements, list):
                    elements = [elements]
                for si in elements:
                    # Elements may use @ref instead of @name, and may carry an
                    # anonymous inline type instead of @type.
                    label = si.get("@name", si.get("@ref"))
                    print("\t", str(label), "<" + str(si.get("@type")) + ">")

        print()
        print()
        

In [None]:
class TypeDefinition(object):
    """Base wrapper around one xmltodict-style XSD type definition dict.

    Attributes:
        name: value of the definition's '@name' attribute.
        id: value of the '@id' attribute, or None when absent.
        children: names of the types this definition refers to, in the order
            the ``parse_*`` methods discovered them.
    """

    def __init__(self, type_dict):
        """Read '@name' (required) and '@id' (optional) from `type_dict`.

        Raises:
            ValueError: if `type_dict` has no '@name' entry.
        """
        if "@name" not in type_dict:
            raise ValueError("type definition has no '@name' attribute")
        self.name = type_dict['@name']
        self.id = type_dict.get('@id')
        self.children=[]

    def parse_restriction(self, type_dict):
        """Record the '@base' of an xs:restriction child, if there is one."""
        if "xs:restriction" in type_dict.keys():
            if "@base" in type_dict['xs:restriction'].keys():
                self.children.append ( type_dict['xs:restriction']['@base'])

    def parse_extension(self, type_dict):
        """Record the '@base' of an xs:extension child, if there is one."""
        if "xs:extension" in type_dict.keys():
            if "@base" in type_dict['xs:extension'].keys():
                self.children.append ( type_dict['xs:extension']['@base'])

    def parse_all(self, type_dict):
        """Pass every item of an xs:all child to parse_element."""
        if "xs:all" in type_dict.keys():
            # NOTE(review): if 'xs:all' is a dict this iterates its keys, not
            # the element definitions -- confirm once parse_element exists.
            for e in type_dict['xs:all']:
                self.parse_element(e)

    def parse_group(self, type_dict):
        """Not implemented yet."""
        raise NotImplementedError("parse_group is not implemented")

    def parse_sequence(self, type_dict):
        """Not implemented yet."""
        raise NotImplementedError("parse_sequence is not implemented")

    def parse_choice(self, type_dict):
        """Not implemented yet."""
        raise NotImplementedError("parse_choice is not implemented")

    def parse_attribute(self, type_dict):
        """Not implemented yet."""
        raise NotImplementedError("parse_attribute is not implemented")

    def parse_attribute_group(self, type_dict):
        """Not implemented yet."""
        raise NotImplementedError("parse_attribute_group is not implemented")

    def parse_simple_content(self, type_dict):
        """Not implemented yet."""
        raise NotImplementedError("parse_simple_content is not implemented")

    def parse_complex_content(self, type_dict):
        """Not implemented yet."""
        raise NotImplementedError("parse_complex_content is not implemented")

    def parse_union(self, type_dict):
        """Record every member type of an xs:union child.

        '@memberTypes' is a whitespace-separated list of type names; each
        name becomes its own entry in `children`.
        """
        if "xs:union" in type_dict.keys():
            if "@memberTypes" in type_dict['xs:union'].keys():
                # Read from 'xs:union' itself (the lookup used to go to 'xs:list').
                self.children.extend(type_dict['xs:union']["@memberTypes"].split())

    def parse_list(self, type_dict):
        """Record the '@itemType' of an xs:list child, if there is one."""
        if "xs:list" in type_dict.keys():
            if "@itemType" in type_dict['xs:list'].keys():
                self.children.append ( type_dict['xs:list']["@itemType"] )

    def parse_element(self, type_dict):
        """Not implemented yet."""
        raise NotImplementedError("parse_element is not implemented")
        
class SimpleType(TypeDefinition):
    """A TypeDefinition representing an xs:simpleType."""

    def __init__(self, type_dict):
        """Initialise from the simpleType's definition dict."""
        super().__init__(type_dict)

    def __repr__(self):
        """Render as ``name:<base>(primitive:flag)``.

        `base` and `primitive` are not assigned anywhere in this class, so
        they are read defensively and shown as None until something sets them.
        """
        base = getattr(self, "base", None)
        primitive = getattr(self, "primitive", None)
        return str(self.name) + ":<" + str(base) + ">(primitive:" + str(primitive) + ")"
    
class ComplexType(TypeDefinition):
    """A TypeDefinition representing an xs:complexType."""

    def __init__(self, type_dict):
        """Initialise from the complexType's definition dict."""
        super().__init__(type_dict)

    def __repr__(self):
        """Render as ``name:<None>``; no base is tracked for complex types."""
        return f"{self.name!s}:<{None}>"
    
    
    

In [None]:
[(p, access_dict(cust_d, p)) for p in typed_collection['xs:element']]

In [None]:
[(p, access_dict(cust_d, p)) for p in typed_collection['xs:element']]

In [None]:
k=['xs:schema', 'xs:complexType', 20]
reduce(operator.getitem, k, cust_d)

In [None]:
k=['xs:schema', 'xs:complexType', 88]
reduce(operator.getitem, k, cust_d)

In [None]:
Counter({tuple(k):reduce(operator.getitem, k, cust_d)  for k in leaves if k[-1]=="@name"}.values())


In [None]:
cust_d

In [None]:
raw_names_d, raw_bases_d, raw_types_d, raw_primitives_d, raw_primitives_labels_d, ref_keys, specification_d = xsdparse.parse_xsd(customer_xsd)

In [None]:
reduce(operator.getitem, ['xs:schema', 'xs:complexType', 2], cust_d) 

In [None]:
xsdparse.build_(specification_d.get("customerinfo")[0],specification_d)

In [None]:
ref_keys

In [None]:
"""
xsdparse
===========

A utility for parsing an xsd and producing a model specification.

"""
import xmltodict
import zipfile
from collections import Counter
import os, sys
sys.path.append(os.path.abspath("../treepath"))
import treepath.treepath as tp

import pandas as pd
import networkx as nx


def key_scan_obj(obj, search=None, path=None, results=None):
    """Recursively collect every place in `obj` where `search` occurs.

    Parameters:
        obj: nested dicts/lists (or a bare scalar) to scan.
        search: value to look for; matched against dict keys, dict values and
            list items.
        path: path accumulated so far (used by the recursion).
        results: list to append hits to; a new list is created when omitted.

    Returns:
        `results`, a list of ``{"path": [...], "value": ...}`` dicts. A dict
        or list value that matches `search` itself is recorded and not
        descended into. A scalar `obj` is recorded when it equals `search` or
        when `search` is None.
    """
    if results is None:
        results = []
    if path is None:
        path = []
    if isinstance(obj, dict):
        for k,v in obj.items():
            if (k==search or v==search):
                results.append({"path" : path+[k], "value" : v})
            elif isinstance(v, (dict, list)):
                results.extend(key_scan_obj(v, search=search, path=path + [k]))
    elif isinstance(obj, list):
        for e,v in enumerate(obj):
            if v==search:
                results.append({"path" : path+[e], "value" : v})
            elif isinstance(v, (dict, list)):
                results.extend(key_scan_obj(v, search=search, path = path + [e]))
    elif obj==search or search is None:
        # Scalar input: compare the object itself (no loop variable exists here).
        results.append({"path" : path, "value" : obj})
    return results

prims=[ {'@name': 'xs:duration'},
        {'@name': 'xs:dateTime'},
        {'@name': 'xs:time'},
        {'@name': 'xs:date'},
        {'@name': 'xs:gYearMonth'},
        {'@name': 'xs:gYear'},
        {'@name': 'xs:gMonthDay'},
        {'@name': 'xs:gDay'},
        {'@name': 'xs:gMonth'},
        {'@name': 'xs:string'},
        {'@name': 'xs:boolean'},
        {'@name': 'xs:base64Binary'},
        {'@name': 'xs:hexBinary'},
        {'@name': 'xs:float'},
        {'@name': 'xs:decimal'},
        {'@name': 'xs:integer'},
        {'@name': 'xs:nonPositiveInteger'},
        {'@name': 'xs:negativeInteger'},
        {'@name': 'xs:int'},
        {'@name': 'xs:short'},
        {'@name': 'xs:byte'},
        {'@name': 'xs:nonNegativeInteger'},
        {'@name': 'xs:unsignedLong'},
        {'@name': 'xs:positiveInteger'},
        {'@name': 'xs:unsignedInt'},
        {'@name': 'xs:unsignedShort'},
        {'@name': 'xs:unsignedByte'},
        {'@name': 'xs:double'},
        {'@name': 'xs:anyURI'},
        {'@name': 'xs:QName'},
       {'@name': 'xs:NOTATION'},
]
def extract_paths(tree, search_val, path_offset, filter_offset=None, filter_value=None):
    """Return ``(truncated path, value)`` pairs for every hit of `search_val` in `tree`.

    Parameters:
        tree: nested dict/list structure to scan with key_scan_obj.
        search_val: key or value to search for.
        path_offset: each hit's path is cut with ``path[:path_offset]`` and
            returned as a tuple.
        filter_offset, filter_value: when both are given, only hits whose path
            step at `filter_offset` equals `filter_value`, ignoring case, are
            kept.

    Returns:
        list of ``(tuple(path[:path_offset]), value)``.
    """
    hits = key_scan_obj(tree, search_val)
    if filter_offset is not None and filter_value is not None:
        # Lower-case both sides: comparing a lower-cased step with a
        # mixed-case filter value could never match.
        wanted = filter_value.lower() if isinstance(filter_value, str) else filter_value

        def step_matches(hit_path):
            # Paths shorter than the offset, and integer (list index) steps,
            # simply do not match.
            try:
                step = hit_path[filter_offset]
            except IndexError:
                return False
            return isinstance(step, str) and step.lower() == wanted

        hits = [hit for hit in hits if step_matches(hit['path'])]
    return [(tuple(hit['path'][:path_offset]), hit['value']) for hit in hits]


def parse_xsd(xsd_bytes):
    """Parse an XSD document into the lookup tables used to build a model.

    The document is parsed with xmltodict and the module-level `prims` list is
    injected under ``['xs:schema']['xs:primitiveType']``. All tables are keyed
    by path tuples relative to the 'xs:schema' node.

    Returns a 7-tuple:
        raw_names_d: path (without the trailing '@name') -> name.
        raw_bases_d: path -> '@base' value of a restriction or extension.
        raw_types_d: path (without the trailing '@type') -> type name.
        raw_primitives_d: '@base' hits found under 'xs:primitiveType'.
        raw_primitives_labels_d: same keys, every value "_terminal_".
        ref_keys: name -> (path, type, clue) for every '_ref_' entry.
        specification_d: path -> (name, type, clue).

    NOTE(review): the 'xs' prefix is hard-coded; schemas using another prefix
    would presumably fail here -- confirm.
    """
    xsd_d = xmltodict.parse(xsd_bytes)
    xsd_d['xs:schema']['xs:primitiveType']=prims
    raw_names_d = dict(extract_paths(xsd_d['xs:schema'], "@name", -1))
    # Restriction bases drop the last two path steps, extension bases the last
    # three; both filter on the step just before '@base'.
    raw_bases_d = dict(extract_paths(xsd_d['xs:schema'], "@base", -2, -2, 'xs:restriction') + extract_paths(xsd_d['xs:schema'], "@base", -3, -2, 'xs:extension'))
    raw_types_d = dict(extract_paths(xsd_d['xs:schema'], "@type", -1))
    # NOTE(review): the injected `prims` entries carry only '@name', so no
    # '@base' hit can sit under 'xs:primitiveType' -- this looks always empty.
    raw_primitives_d = dict(extract_paths(xsd_d['xs:schema'], "@base", -1, 0, 'xs:primitiveType'))
    raw_primitives_labels_d = {k:"_terminal_" for k,v in raw_primitives_d.items()}

    # Names whose last two path steps are neither xs:element nor xs:attribute.
    refs_d = dict([(k,v) for k,v in raw_names_d.items() if not any([p in ['xs:element','xs:attribute'] for p in k[-2:]])])

    # This contains all the specifications required to build the model, keys, (names, types, contextual_clues) TBD: Cardinalities
    # The type falls back from '@type' to '@base' to "_terminal_" to "_container_".
    specification_d = {k:(v,
                      raw_types_d.get(k, raw_bases_d.get(k,raw_primitives_labels_d.get(k,"_container_"))),
                      "_ref_" if k in refs_d.keys() else "_spec_") for k,v in raw_names_d.items()}

    # Reverse lookup by name; a later entry overwrites an earlier one of the same name.
    ref_keys = {v[0]:(k,v[1],v[2]) for k,v in specification_d.items() if v[2]=='_ref_'}
    return raw_names_d, raw_bases_d, raw_types_d, raw_primitives_d, raw_primitives_labels_d, ref_keys, specification_d


def build_(root, spec_d, obj=None):
    """Expand the specification entry at `root` into a nested structure.

    Parameters:
        root: key (path tuple) of the entry in `spec_d` to expand.
        spec_d: mapping path -> (name, type, clue) as produced by parse_xsd,
            where clue is '_ref_' or '_spec_'.
        obj: optional dict to store the result in; a new dict is created when
            omitted.

    Returns:
        `obj`, with ``obj[name]`` set to the expanded content of `root`:
        a list of child structures for a '_ref_' container, ``{type: {}}``
        when the referenced type is terminal, the name itself for a terminal,
        and otherwise the expansion of the referenced definition.

    Raises:
        KeyError: if `root`, or a type it refers to, is not in the specification.
        ValueError: if an entry carries an unknown clue.

    NOTE(review): a definition that refers back to itself would recurse
    without bound; no cycle guard is present.
    """
    # Reverse lookup name -> (path, type, clue), built once per top-level call.
    ref_keys = {v[0]:(k,v[1],v[2]) for k,v in spec_d.items() if v[2]=='_ref_'}
    if obj is None:
        obj={}
    obj.update(_build_node(root, spec_d, ref_keys))
    return obj


def _build_node(root, spec_d, ref_keys):
    """Return ``{name: content}`` for the specification entry at `root`."""
    spec = spec_d.get(root)
    if spec is None:
        raise KeyError(f"no specification entry for {root!r}")
    name, v_type, clue = spec

    if clue == '_ref_':
        if v_type == "_container_":
            # A container's content is every entry whose path extends its own.
            content = [
                _build_node(k, spec_d, ref_keys)
                for k in spec_d.keys()
                if k[0:len(root)]==root and k!=root
            ]
        elif v_type != "_terminal_":
            lookahead = _lookup_type(ref_keys, v_type, name)
            if lookahead[1] == "_terminal_":
                content={v_type:{}}
            else:
                content=_build_node(lookahead[0], spec_d, ref_keys)
        else:
            content=name
    elif clue == '_spec_':
        target = _lookup_type(ref_keys, v_type, name)
        content=_build_node(target[0], spec_d, ref_keys)
    else:
        raise ValueError(f"unknown clue {clue!r} for {name!r}")
    return {name: content}


def _lookup_type(ref_keys, v_type, name):
    """Return the ref_keys entry for `v_type`, raising a descriptive KeyError if absent."""
    entry = ref_keys.get(v_type)
    if entry is None:
        raise KeyError(f"type {v_type!r} referenced by {name!r} is not defined")
    return entry


In [None]:
# Resolve References
# Loop over the data, taking references where they exist and replacing them with their
# de-referenced lookup values.
# Each pass rebuilds the index of nodes carrying '@ref' and overwrites each
# such node with its own non-'@ref' content merged with the referenced
# definition (on a key clash the referenced definition wins).
# NOTE(review): the loop ends only when no '@ref' is left; there is no pass
# limit, so an unresolvable or self-referencing ref would keep it running.
i=0
finished=False

cust_tp = tp.TreePath(cust_d)

while not finished:
    i = i + 1
    things_requiring_reference = build_path_index(cust_tp.data, index_tag="@ref")
    # NOTE(review): whether this is an independent copy depends on the
    # TreePath constructor -- confirm that reads below see the pre-pass state.
    copy_tp = tp.TreePath(cust_tp.data)
    if len(things_requiring_reference)==0:
        finished=True
        break

    for k,v_list in things_requiring_reference.items():
#        print(k,v_list)
        for e,v in enumerate(v_list):
#            print("\t", e)
            # NOTE(review): named_things holds (type, path) tuples, so the
            # candidates being scored are tuples, not bare paths -- confirm
            # the scoring behaves as intended. [1] below takes the path.
            ref_path = return_closest_path_match(named_things,k,v)
#            print("`",v,"`", ref_path)
            retain_content_d={ki:vi for ki,vi in access_dict(copy_tp.data, v).items() if ki != "@ref"}
            cust_tp.set(v, {**retain_content_d, **access_dict(copy_tp.data, ref_path[1])})
    # Progress line: pass number and count of distinct referenced names.
    print(i,len(things_requiring_reference))
#k,access_dict(cust_d, things_requiring_reference[k])




In [None]:
xsd_construct = parse_xsd(customer_xsd)

In [None]:
q = build_(('xs:element',), xsd_construct[6])

In [None]:
paths = traverse_object(q)

In [None]:
for p in paths[0:25]:
    print (p[-8:], len(p))