## Importer la version 5.0.4 du CIDOC CRM base

Ce carnet prépare le fichier XML à importer dans OntoME.


In [1]:
import lxml.etree as et
import lxml.html as ht
from datetime import datetime
import re
from itertools import chain

### Initialisation du parseur

In [2]:
### Load the XML Schema and build the validator object
#  Documentation: https://lxml.de/validation.html#xmlschema
f = 'references/schemaImportXmlwithReferences.xml'
xmlschema_doc = et.parse(f)
xmlschema = et.XMLSchema(xmlschema_doc)
# Display the type to confirm that the schema object was created
type(xmlschema)

lxml.etree.XMLSchema

### Importation du document à traiter

In [3]:
# Path of the source document to process (version 5.0.4 according to the file name)
file = 'data/cidoc_crm_v5.0.4.xml'

In [4]:
### Open and parse the source file
try:
    xmlf = et.parse(file)
    print(type(xmlf))
except Exception as e:
    # Any failure is only reported; this cell then leaves `xmlf` unassigned
    print('Error: ' + str(e))

<class 'lxml.etree._ElementTree'>


In [5]:
### Go from the _ElementTree to its root _Element
#  so that the element-level etree methods can be used
crm = xmlf.getroot()
type(crm)

lxml.etree._Element

In [6]:
### Check that the 'class' elements can be retrieved
classes_orig = crm.xpath('./classes/class')
print(type(classes_orig[0]))
# Show the attributes of the first five classes only
for c in classes_orig[:5]:
    print(c.attrib)


<class 'lxml.etree._Element'>
{'id': 'E1'}
{'id': 'E2'}
{'id': 'E3'}
{'id': 'E4'}
{'id': 'E5'}


In [7]:
### Show the attributes and the scope note of two sample classes
for c in classes_orig[7:9]:
    print(c.attrib)
    # The inner loop uses its own variable name so that it no longer
    # rebinds the outer loop variable `c`
    for note in c.iterchildren(tag='scopeNote'):
        print(note.tag, note.text)
    print('----')

{'id': 'E8'}
scopeNote <p>This class comprises transfers of legal ownership from one or more instances of E39 Actor to one or more other instances of E39 Actor. </p><p>The class also applies to the establishment or loss of ownership of instances of E18 Physical Thing. It does not, however, imply changes of any other kinds of right. The recording of the donor and/or recipient is optional. It is possible that in an instance of E8 Acquisition there is either no donor or no recipient. Depending on the circumstances, it may describe:</p><p>the beginning of ownership</p><p>the end of ownership</p><p>the transfer of ownership</p><p>the acquisition from an unknown source </p><p>the loss of title due to destruction of the item</p><p>It may also describe events where a collector appropriates legal title, for example by annexation or field collection. The interpretation of the museum notion of "accession" differs between institutions. The CRM therefore models legal ownership (E8 Acquisition) and 

### Création du document à produire 

Création du document à produire en vue de l'importation dans OntoME

In [11]:
### Create the root element of the output document: namespace
namespace = et.Element("namespace")
# Display the tag and the type as a quick check
namespace.tag, type(namespace)

('namespace', lxml.etree._Element)

In [12]:
### Add the standardLabel element
# NOTE: et.SubElement appends a new child on every call, so running this
# cell a second time adds a second standardLabel to `namespace`
standardLabel = et.SubElement(namespace, 'standardLabel', lang = 'en')
standardLabel.text = '5.0.4 (December 2011)'
print(et.tostring(namespace, pretty_print=True).decode('utf-8'))

<namespace>
  <standardLabel lang="en">5.0.4 (December 2011)</standardLabel>
</namespace>



In [13]:
### Add the version element
# NOTE: et.SubElement appends a new child on every call, so running this
# cell a second time adds a second version element to `namespace`
version = et.SubElement(namespace, 'version')
version.text = '5.0.4 (December 2011)'
print(et.tostring(namespace, pretty_print=True).decode('utf-8'))

<namespace>
  <standardLabel lang="en">5.0.4 (December 2011)</standardLabel>
  <version>5.0.4 (December 2011)</version>
</namespace>



In [14]:
### Add the container elements for classes and properties
# Both are created empty here; `classes` and `properties` are the handles
# through which children are added afterwards
classes = et.SubElement(namespace, 'classes')
properties = et.SubElement(namespace, 'properties')
print(et.tostring(namespace, pretty_print=True).decode('utf-8'))

<namespace>
  <standardLabel lang="en">5.0.4 (December 2011)</standardLabel>
  <version>5.0.4 (December 2011)</version>
  <classes/>
  <properties/>
</namespace>



In [15]:
### Two ways of inspecting the children of an element
# list(element) replaces the deprecated element.getchildren(); both forms
# below rely on the element being iterable over its direct children
list(namespace), [t for t in namespace]

([<Element standardLabel at 0x7fb17c232640>,
  <Element version at 0x7fb17c232e40>,
  <Element classes at 0x7fb17c23aa00>,
  <Element properties at 0x7fb17c23acc0>],
 [<Element standardLabel at 0x7fb17c232640>,
  <Element version at 0x7fb17c232e40>,
  <Element classes at 0x7fb17c23aa00>,
  <Element properties at 0x7fb17c23acc0>])

In [16]:
### Validate the document produced so far
# Validation is expected to fail here because the classes and properties
# elements are still empty
try:
    xmlschema.assert_(namespace)
except Exception as e:
    # Only report the validation error so the notebook keeps running
    print(e)

Element 'classes': Missing child element(s). Expected is ( class ).


### Ajouter les enfants des classes

In [17]:
### Reset the content of the 'classes' element
classes.clear()

In [18]:
### Reset the content of the 'classes' element, then fill it with one
#  <class> child per class of the source document
classes.clear()
for c in classes_orig: #[7:9]:
    class_id = c.get('id')

    # 'class' is only reserved as a Python identifier, not as an XML tag
    # name, so the element is created with its final tag right away
    _class = et.SubElement(classes, 'class')

    identifierInNamespace = et.SubElement(_class, 'identifierInNamespace')
    identifierInNamespace.text = class_id

    # The label is taken from the first className child of the source class
    standardLabel = et.SubElement(_class, 'standardLabel', lang="en")
    standardLabel.text = list(c.iterchildren(tag='className'))[0].text

    # One subClassOf element per parent class, holding the parent's id
    for scl in c.iterchildren(tag='subClassOf'):
        subClassOf = et.SubElement(_class, 'subClassOf')
        subClassOf.text = scl.get('id')

    textProperties = et.SubElement(_class, 'textProperties')

    # The scope note text is copied as is from the first scopeNote child,
    # so any HTML markup it contains stays in the text (a variant that
    # appended the <p> elements as real children was tried and dropped)
    scopeNote = et.SubElement(textProperties, 'scopeNote', lang="en")
    first_note = next(c.iterchildren(tag='scopeNote'), None)
    if first_note is not None:
        scopeNote.text = first_note.text

    # Examples: the text of the 'examples' child is itself parsed as XML and
    # each <li> it contains becomes one <example> element.
    # A missing element or an empty text is reported instead of raising.
    examples_l = next(c.iterchildren(tag='examples'), None)
    if examples_l is None or not examples_l.text:
        print(f'{class_id}: no examples')
    else:
        try:
            exam_li = et.fromstring(examples_l.text)
        except (et.XMLSyntaxError, ValueError) as err:
            # The examples text is not parseable: report it and move on
            print(class_id, err)
        else:
            for li in exam_li.iterdescendants(tag='li'):
                # Serialise the item once and keep what lies between
                # '<li>' and the first '</li>'
                serialized = et.tostring(li).decode('utf-8')
                end = serialized.find('</li>')
                if end == -1:
                    # No closing tag (empty item serialised as '<li/>'):
                    # there is no content to keep
                    continue
                example = et.SubElement(textProperties, 'example', lang="en")
                example.text = serialized[:end].replace('<li>', '')

In [19]:
### Test output: print the whole document built so far
print(et.tostring(namespace, pretty_print=True).decode('utf-8'))

<namespace>
  <standardLabel lang="en">5.0.4 (December 2011)</standardLabel>
  <version>5.0.4 (December 2011)</version>
  <classes>
    <class>
      <identifierInNamespace>E1</identifierInNamespace>
      <standardLabel lang="en">CRM Entity</standardLabel>
      <textProperties>
        <scopeNote lang="en">&lt;p&gt;This class comprises all things in the universe of discourse of the CIDOC Conceptual Reference Model. &lt;/p&gt;&lt;p&gt;It is an abstract concept providing for three general properties:&lt;/p&gt;&lt;p&gt;Identification by name or appellation, and in particular by a preferred identifier&lt;/p&gt;&lt;p&gt;Classification by type, allowing further refinement of the specific subclass an instance belongs to &lt;/p&gt;&lt;p&gt;Attachment of free text for the expression of anything not captured by formal properties&lt;/p&gt;&lt;p&gt;With the exception of E59 Primitive Value, all other classes within the CRM are directly or indirectly specialisations of E1 CRM Entity. &lt;/p&gt;</

In [20]:
### Validate the document produced so far
# Validation is still expected to fail at this point: the classes element
# has been filled, but the properties element is still empty
try:
    xmlschema.assert_(namespace)
except Exception as e:
    # Only report the validation error so the notebook keeps running
    print(e)

Element 'properties': Missing child element(s). Expected is ( property ).


### Ajouter les enfants des propriétés

#### Test sur les cardinalités

In [21]:
# Accepted single-character cardinality values.
# Inside a character class a comma is a literal member, so the former
# '[0,1,2,3,4,5]' also accepted ','; ranges state the intent exactly.
pr1 = re.compile('[0-5]')   # minimum: a digit from 0 to 5
pr2 = re.compile('[1-5n]')  # maximum: a digit from 1 to 5, or 'n'

In [22]:
# Try the cardinality check on a sample pair: `a` is tested against pr1
# and `b` against pr2
a = '0'
b = 'n'
# Each value must be exactly one character long and match its pattern
len(a) == 1 and len(pr1.findall(a)) > 0 \
and len(b) == 1 and len(pr2.findall(b)) > 0

True

#### Récupérer les propriétés

In [23]:
### Check that the 'property' elements can be retrieved
properties_orig = crm.xpath('./properties/property')
print(type(properties_orig[0]))
# Show the attributes of the first five properties only.
# NOTE: after this loop `c` stays bound to the last property shown.
for c in properties_orig[:5]:
    print(c.attrib)


<class 'lxml.etree._Element'>
{'id': 'P1'}
{'id': 'P2'}
{'id': 'P3'}
{'id': 'P4'}
{'id': 'P5'}


In [24]:
### Preparing the handling of quantifiers
for p in properties_orig[7:12]:
    txt = list(p.iterchildren('quantification'))[0].text
    # Raw string: '\(' and '\)' are regex escapes, not Python string escapes
    x = re.findall(r"\(.{,10}\)", txt)
    if len(x):
        # str.strip() takes a set of characters, not a regex, so the
        # parentheses and the whitespace characters are listed explicitly
        a = x[0].strip('() \t\n')
        # Characters 0, 2, 4 and 6 hold the four cardinality values
        # (presumably laid out like '0,n:0,1' — TODO confirm the separators)
        print(a[0])
        print(a[2])
        print(a[4])
        print(a[6])
        

0
n
0
1
0
n
0
n
0
n
0
n
1
n
0
n
1
n
0
1


In [25]:
### Reset the content of the 'properties' element, then fill it with one
#  <property> child per property of the source document
properties.clear()
for p in properties_orig: #[7:12]:
    p_id = p.get('id')

    # The element is created with its final tag right away: 'property' is
    # a perfectly valid XML tag name, only the Python variable needs care
    _property = et.SubElement(properties, 'property')

    identifierInNamespace = et.SubElement(_property, 'identifierInNamespace')
    identifierInNamespace.text = p_id

    # Label block: direct name, then inverse name
    label = et.SubElement(_property, 'label', lang="en")
    standardLabel = et.SubElement(label, 'standardLabel')
    standardLabel.text = list(p.iterchildren(tag='directName'))[0].text

    # The inverse label is always emitted; it is left empty when the source
    # property has no inverseName child
    inverseLabel = et.SubElement(label, 'inverseLabel')
    inverse_names = list(p.iterchildren(tag='inverseName'))
    inverseLabel.text = inverse_names[0].text if inverse_names else ''

    for spo in p.iterchildren(tag='subPropertyOf'):
        subPropertyOf = et.SubElement(_property, 'subPropertyOf')
        # Every 'i' is removed from the referenced id (presumably the suffix
        # that marks an inverse property, e.g. 'P9i' -> 'P9' — TODO confirm)
        subPropertyOf.text = spo.get('id').replace('i', '')

    hasDomain = et.SubElement(_property, 'hasDomain')
    hasDomain.text = list(p.iterchildren(tag='domain'))[0].get('id')

    hasRange = et.SubElement(_property, 'hasRange')
    hasRange.text = list(p.iterchildren(tag='range'))[0].get('id')

    # Cardinalities: taken from the first parenthesised group of the
    # quantification text (raw string so that '\(' is a regex escape)
    txt = list(p.iterchildren('quantification'))[0].text
    x = re.findall(r"\(.{,10}\)", txt or '')
    if x:
        # str.strip() takes a set of characters, not a regex
        a = x[0].strip('() \t\n')
        # One-character slices instead of indexing: a group that is too
        # short yields '' and is reported below instead of raising IndexError
        a_0, a_2, a_4, a_6 = a[0:1], a[2:3], a[4:5], a[6:7]

        # Characters 4 and 6 feed the domain quantifiers
        if pr1.findall(a_4) and pr2.findall(a_6):
            domainInstancesMinQuantifier = et.SubElement(
                _property, 'domainInstancesMinQuantifier')
            domainInstancesMinQuantifier.text = a_4
            domainInstancesMaxQuantifier = et.SubElement(
                _property, 'domainInstancesMaxQuantifier')
            domainInstancesMaxQuantifier.text = a_6
        else:
            print(f'domain cardinality issue property {p_id}')

        # Characters 0 and 2 feed the range quantifiers
        if pr1.findall(a_0) and pr2.findall(a_2):
            rangeInstancesMinQuantifier = et.SubElement(
                _property, 'rangeInstancesMinQuantifier')
            rangeInstancesMinQuantifier.text = a_0
            rangeInstancesMaxQuantifier = et.SubElement(
                _property, 'rangeInstancesMaxQuantifier')
            rangeInstancesMaxQuantifier.text = a_2
        else:
            print(f'range cardinality issue property {p_id}')

    textProperties = et.SubElement(_property, 'textProperties')

    # The scope note is read from the current property `p`. It used to be
    # read from `c`, a variable left over from an earlier cell, which gave
    # every property the same, unrelated scope note.
    # The text is copied as is, so any HTML markup it contains stays in it.
    scopeNote = et.SubElement(textProperties, 'scopeNote', lang="en")
    first_note = next(p.iterchildren(tag='scopeNote'), None)
    if first_note is not None:
        scopeNote.text = first_note.text

    # Examples: the text of the 'examples' child is itself parsed as XML and
    # each <li> it contains becomes one <example> element.
    # A missing element or an empty text is reported instead of raising.
    examples_l = next(p.iterchildren(tag='examples'), None)
    if examples_l is None or not examples_l.text:
        print(f'{p_id}: no examples')
    else:
        try:
            exam_li = et.fromstring(examples_l.text)
        except (et.XMLSyntaxError, ValueError) as err:
            # The examples text is not parseable: report it and move on
            print(p_id, err)
        else:
            for li in exam_li.iterdescendants(tag='li'):
                # Serialise the item once and keep what lies between
                # '<li>' and the first '</li>'
                serialized = et.tostring(li).decode('utf-8')
                end = serialized.find('</li>')
                if end == -1:
                    # No closing tag (empty item serialised as '<li/>'):
                    # there is no content to keep
                    continue
                example = et.SubElement(textProperties, 'example', lang="en")
                example.text = serialized[:end].replace('<li>', '')

In [None]:
# print(et.tostring(namespace, pretty_print=True).decode('utf-8'))

In [26]:
### Validate the complete document
# Both the classes and the properties elements have been filled by now.
# Nothing is printed when validation succeeds; otherwise the error is shown.
try:
    xmlschema.assert_(namespace)
except Exception as e:
    print(e)

In [27]:
### Write the document
# The file name carries a timestamp of the current date and time
dt = datetime.now()
tmsp = dt.strftime("%Y%m%d_%H%M%S")
# tmsp = ''
filename = f'data/output_{tmsp}.xml'
### Prepare the XML tree and write it to a file
# the write() method is available on _ElementTree, not on _Element
tree = namespace.getroottree()
### xml_declaration=True, encoding="utf-8"
tree.write(filename, pretty_print=True, xml_declaration=True, encoding="utf-8")

In [28]:
### Check that the written file is well-formed by parsing it back.
# The file is parsed directly from disk, so lxml reads the XML declaration
# and decodes the content with the encoding declared there. The former
# approach read the file as text with the platform default encoding and
# dropped the first line to hide the declaration from et.fromstring().
try:
    test_xmlf = et.parse(filename).getroot()
    print(type(test_xmlf))
except (OSError, et.XMLSyntaxError) as e:
    # Unreadable file or malformed XML: report it without stopping the notebook
    print('Error: ' + str(e))
    

<class 'lxml.etree._Element'>


In [29]:
# Print the document parsed back from the output file for visual inspection
print(et.tostring(test_xmlf, pretty_print=True).decode('utf-8'))

<namespace>
  <standardLabel lang="en">5.0.4 (December 2011)</standardLabel>
  <version>5.0.4 (December 2011)</version>
  <classes>
    <class>
      <identifierInNamespace>E1</identifierInNamespace>
      <standardLabel lang="en">CRM Entity</standardLabel>
      <textProperties>
        <scopeNote lang="en">&lt;p&gt;This class comprises all things in the universe of discourse of the CIDOC Conceptual Reference Model. &lt;/p&gt;&lt;p&gt;It is an abstract concept providing for three general properties:&lt;/p&gt;&lt;p&gt;Identification by name or appellation, and in particular by a preferred identifier&lt;/p&gt;&lt;p&gt;Classification by type, allowing further refinement of the specific subclass an instance belongs to &lt;/p&gt;&lt;p&gt;Attachment of free text for the expression of anything not captured by formal properties&lt;/p&gt;&lt;p&gt;With the exception of E59 Primitive Value, all other classes within the CRM are directly or indirectly specialisations of E1 CRM Entity. &lt;/p&gt;</