## Importer la version 7.1.1 du CIDOC CRM base

Ce carnet prépare le fichier à importer dans OntoME


In [None]:
import lxml.etree as et
from datetime import datetime
import re

In [None]:
# Path of the source XML document. The second assignment overrides the
# first, so only 'data/cidoc_crm_v7.1.1.xml' is actually parsed.
file = 'data/cidoc_crm_v7.1.1_with_tags.xml'
file = 'data/cidoc_crm_v7.1.1.xml'

In [None]:
### Open and parse the source file
# On success the type of the parsed object is printed; on failure the
# error message is printed instead of raising.
try:
    xmlf = et.parse(file)
except Exception as e:
    print('Error: ' + str(e))
else:
    print(type(xmlf))

In [None]:
### Go from the parsed _ElementTree to its root _Element
#  so that the element-level etree methods can be used
crm = xmlf.getroot()
type(crm)

In [None]:
### Check that the 'class' elements can be retrieved
# Collect every <class> under <classes>, show the type of the first hit
# and the attributes of the first five.
classes_orig = crm.xpath('./classes/class')
first_class = classes_orig[0]
print(type(first_class))
for class_el in classes_orig[:5]:
    print(class_el.attrib)


In [None]:
### Create the root element of the output document: <namespace>
namespace = et.Element("namespace")
# Displayed as the cell result: the tag name and the Python type of the root.
namespace.tag, type(namespace)

In [None]:
### Add the standardLabel element to the namespace
# The 'lang' attribute is given at creation time; the text is set afterwards.
standardLabel = et.SubElement(namespace, 'standardLabel', lang="en")
standardLabel.text = 'CIDOC CRM version 7.1.1'
# Show the document built so far.
serialized = et.tostring(namespace, pretty_print=True)
print(serialized.decode('utf-8'))

In [None]:
### Add the version element to the namespace
version = et.SubElement(namespace, 'version')
version.set("lang", "en")
version.text = '7.1.1 (May 2021)'
# Show the document built so far.
serialized = et.tostring(namespace, pretty_print=True)
print(serialized.decode('utf-8'))

In [None]:
### Add the (still empty) container elements: classes, then properties
classes, properties = (
    et.SubElement(namespace, container_tag)
    for container_tag in ('classes', 'properties')
)
# Show the document built so far.
serialized = et.tostring(namespace, pretty_print=True)
print(serialized.decode('utf-8'))

In [None]:
### Two ways of inspecting the direct children of an element
# getchildren() is deprecated in lxml; list(element) is the documented
# replacement and returns the same children in document order.
# The second form iterates over the element explicitly.
list(namespace), [t for t in namespace]

In [None]:
### Reset the content of the 'classes' element
classes.clear()

In [None]:
### Reset the content of the 'classes' element, then fill it
#  with one <class> entry per class of the source document.
#  Each entry gets: identifierInNamespace, standardLabel, zero or more
#  subClassOf, and a textProperties element holding the scope note
#  paragraphs and the examples.
classes.clear()
for c in classes_orig:  # [:5]:
    # 'class' is a reserved word in Python, hence the underscore in the
    # variable name; the XML tag itself can be created as 'class' directly.
    _class = et.SubElement(classes, "class")

    identifierInNamespace = et.SubElement(_class, 'identifierInNamespace')
    identifierInNamespace.text = c.get('id')

    # The label is the text of the first <className> child.
    standardLabel = et.SubElement(_class, 'standardLabel', lang="en")
    standardLabel.text = list(c.iterchildren(tag='className'))[0].text

    for scl in c.iterchildren(tag='subClassOf'):
        subClassOf = et.SubElement(_class, 'subClassOf')
        subClassOf.text = scl.get('id')

    textProperties = et.SubElement(_class, 'textProperties')

    # The scope note text is parsed as HTML and its <p> elements are moved
    # under the output <scopeNote>.
    scopeNote = et.SubElement(textProperties, 'scopeNote', lang="en")
    for sn in c.iterchildren(tag='scopeNote'):
        if not (sn.text and sn.text.strip()):
            # Nothing to parse for an empty scope note.
            continue
        # Materialise the matches first: append() moves each <p> out of the
        # parsed HTML tree, which must not happen while iterating over it.
        paragraphs = list(et.HTML(sn.text).iterdescendants(tag='p'))
        for para in paragraphs:
            scopeNote.append(para)

    # Only the first <examples> child is used. A class without examples
    # is skipped instead of raising IndexError (same guard as for the
    # properties).
    examples_el = next(c.iterchildren(tag='examples'), None)
    if examples_el is not None and examples_el.text and examples_el.text.strip():
        for li in et.HTML(examples_el.text).xpath('./body/ul/li'):
            # Each <li> of the parsed HTML becomes an <example lang="en">.
            li.tag = "example"
            li.set("lang", "en")
            textProperties.append(li)

In [None]:
# print(et.tostring(namespace, pretty_print=True).decode('utf-8'))

In [None]:
### Check that the 'property' elements can be retrieved
# Collect every <property> under <properties>, show the type of the first
# hit and the attributes of the first five.
properties_orig = crm.xpath('./properties/property')
first_property = properties_orig[0]
print(type(first_property))
for property_el in properties_orig[:5]:
    print(property_el.attrib)


In [None]:
### Preparation of the quantifier handling
# The quantification text contains a parenthesised group of the form
# (a,b:c,d). The four bounds are captured by name-independent groups
# instead of fixed character positions, so bounds longer than one
# character are handled too. A raw string is used so that the
# backslashes reach the regex engine unchanged.
quantifier_re = re.compile(
    r"\(\s*(\w+)\s*,\s*(\w+)\s*:\s*(\w+)\s*,\s*(\w+)\s*\)"
)
for p in properties_orig[7:12]:
    # Empty string when the element is missing or has no text.
    txt = p.findtext('quantification') or ''
    match = quantifier_re.search(txt)
    if match:
        # Print the four bounds, one per line, in order of appearance.
        for bound in match.groups():
            print(bound)
        

In [None]:
### Reset the content of the 'properties' element, then fill it
#  with one <property> entry per property of the source document.
#  Each entry gets: identifierInNamespace, label (standardLabel and
#  inverseLabel), zero or more subPropertyOf, hasDomain, hasRange, the
#  four instance quantifiers when they can be parsed, and a textProperties
#  element holding the scope note paragraphs and the examples.
properties.clear()

# Parenthesised quantifier of the form (a,b:c,d). A raw string is used so
# that the backslashes reach the regex engine unchanged; the bounds are
# captured as groups rather than read from fixed character positions.
quantifier_re = re.compile(
    r"\(\s*(\w+)\s*,\s*(\w+)\s*:\s*(\w+)\s*,\s*(\w+)\s*\)"
)

for p in properties_orig:  # [7:12]:
    # The underscore only avoids shadowing the 'property' builtin; the XML
    # tag itself can be created as 'property' directly.
    _property = et.SubElement(properties, "property")

    identifierInNamespace = et.SubElement(_property, 'identifierInNamespace')
    identifierInNamespace.text = p.get('id')

    label = et.SubElement(_property, 'label', lang="en")
    standardLabel = et.SubElement(label, 'standardLabel')
    standardLabel.text = list(p.iterchildren(tag='directName'))[0].text

    # The inverse label is the text of the first <inverseName> child, or an
    # empty string when the property has none.
    inverseLabel = et.SubElement(label, 'inverseLabel')
    inverse_name = next(p.iterchildren(tag='inverseName'), None)
    inverseLabel.text = inverse_name.text if inverse_name is not None else ''

    for spo in p.iterchildren(tag='subPropertyOf'):
        subPropertyOf = et.SubElement(_property, 'subPropertyOf')
        # Every 'i' is removed from the referenced identifier.
        # NOTE(review): presumably this strips the inverse-property suffix
        # (e.g. 'P1i' -> 'P1') - confirm against the source identifiers.
        subPropertyOf.text = spo.get('id').replace('i', '')

    hasDomain = et.SubElement(_property, 'hasDomain')
    hasDomain.text = list(p.iterchildren(tag='domain'))[0].get('id')

    hasRange = et.SubElement(_property, 'hasRange')
    hasRange.text = list(p.iterchildren(tag='range'))[0].get('id')

    # Empty string when the element is missing or has no text; in that
    # case, or when no quantifier group is found, no quantifier is written.
    match = quantifier_re.search(p.findtext('quantification') or '')
    if match:
        # The first pair of bounds feeds the rangeInstances* elements and
        # the second pair the domainInstances* elements.
        range_min, range_max, domain_min, domain_max = match.groups()
        quantifiers = (
            ('domainInstancesMinQuantifier', domain_min),
            ('domainInstancesMaxQuantifier', domain_max),
            ('rangeInstancesMinQuantifier', range_min),
            # Previously emitted as a second 'rangeInstancesMinQuantifier'.
            ('rangeInstancesMaxQuantifier', range_max),
        )
        for quantifier_tag, bound in quantifiers:
            et.SubElement(_property, quantifier_tag).text = bound

    textProperties = et.SubElement(_property, 'textProperties')

    # The scope note text is parsed as HTML and its <p> elements are moved
    # under the output <scopeNote>.
    scopeNote = et.SubElement(textProperties, 'scopeNote', lang="en")
    for sn in p.iterchildren(tag='scopeNote'):
        if not (sn.text and sn.text.strip()):
            # Nothing to parse for an empty scope note.
            continue
        # Materialise the matches first: append() moves each <p> out of the
        # parsed HTML tree, which must not happen while iterating over it.
        paragraphs = list(et.HTML(sn.text).iterdescendants(tag='p'))
        for para in paragraphs:
            scopeNote.append(para)

    # Only the first <examples> child is used; properties without examples
    # are skipped.
    examples_el = next(p.iterchildren(tag='examples'), None)
    if examples_el is not None and examples_el.text and examples_el.text.strip():
        for li in et.HTML(examples_el.text).xpath('./body/ul/li'):
            # Each <li> of the parsed HTML becomes an <example lang="en">.
            li.tag = "example"
            li.set("lang", "en")
            textProperties.append(li)

In [None]:
# print(et.tostring(namespace, pretty_print=True).decode('utf-8'))

In [None]:
### Write the document
# The output name carries a timestamp of the current local time.
tmsp = f"{datetime.now():%Y%m%d_%H%M%S}"
# tmsp = ''
filename = f'data/output_{tmsp}.xml'
### Get the XML tree and write it to the file
# write() is available on _ElementTree, not on _Element, hence the
# getroottree() call on the root element.
namespace.getroottree().write(filename, pretty_print=True)

In [None]:
### Check that the output document is well-formed
# The file just written is read back and parsed from its text; a parse
# failure is reported as a printed message instead of raising.
with open(filename, 'r') as f:
    txt = f.read()

try:
    test_xmlf = et.fromstring(txt)
except Exception as e:
    print('Error: ' + str(e))
else:
    print(type(test_xmlf))
    