## Importer la version 5.0.4 du CIDOC CRM base

Ce carnet prépare, à partir de la version 5.0.4 du CIDOC CRM, le fichier XML à importer dans OntoME.


In [107]:
import calendar
import json
import re
from datetime import datetime
from itertools import chain

import lxml.etree as et
import lxml.html as ht

### Initialisation du parseur

In [108]:
### Load the XML Schema used further down to validate the generated document
#  lxml reference: https://lxml.de/validation.html#xmlschema
f = 'references/schemaImportXmlwithReferences.xml'
xmlschema_doc = et.parse(f)
xmlschema = et.XMLSchema(etree=xmlschema_doc)
type(xmlschema)

lxml.etree.XMLSchema

### Importation du document à traiter

In [109]:
### Release being converted; the same name is reused for the output files
current_version = "cidoc_crm_v5.0.4"
file = 'data/' + current_version + '.xml'

In [110]:
### Open and parse the source file; a failure is reported, not raised
try:
    xmlf = et.parse(file)
except Exception as err:
    print(f'Error: {err}')
else:
    print(type(xmlf))

<class 'lxml.etree._ElementTree'>


In [111]:
### Go from the parsed _ElementTree to its root _Element,
#  so that the element-level etree methods can be used below
crm = xmlf.getroot()
type(crm)

lxml.etree._Element

In [112]:
### Check that the 'class' elements can be retrieved:
#  show the type of the first match and the attributes of the first five
classes_orig = crm.xpath('./classes/class')
print(type(classes_orig[0]))
for c in classes_orig[:5]:
    print(c.attrib)


<class 'lxml.etree._Element'>
{'id': 'E1'}
{'id': 'E2'}
{'id': 'E3'}
{'id': 'E4'}
{'id': 'E5'}


In [113]:
### Preview the scope note(s) of two sample classes (indices 7 and 8).
#  Distinct loop variables are used: the previous version rebound the outer
#  loop variable `c` inside the inner loop.
for cls in classes_orig[7:9]:
    print(cls.attrib)
    for note in cls.iterchildren(tag='scopeNote'):
        print(note.tag, note.text)
    print('----')

{'id': 'E8'}
scopeNote <p>This class comprises transfers of legal ownership from one or more instances of E39 Actor to one or more other instances of E39 Actor. </p><p>The class also applies to the establishment or loss of ownership of instances of E18 Physical Thing. It does not, however, imply changes of any other kinds of right. The recording of the donor and/or recipient is optional. It is possible that in an instance of E8 Acquisition there is either no donor or no recipient. Depending on the circumstances, it may describe:</p><p>the beginning of ownership</p><p>the end of ownership</p><p>the transfer of ownership</p><p>the acquisition from an unknown source </p><p>the loss of title due to destruction of the item</p><p>It may also describe events where a collector appropriates legal title, for example by annexation or field collection. The interpretation of the museum notion of "accession" differs between institutions. The CRM therefore models legal ownership (E8 Acquisition) and 

### Création du document à produire 

Le document XML construit ci-dessous est celui qui sera importé dans OntoME.

In [114]:
### Create the root element of the output document: namespace
namespace = et.Element("namespace")
namespace.tag, type(namespace)

('namespace', lxml.etree._Element)

In [115]:
### Add the standardLabel element (English label of the namespace)
#  and print the document built so far
standardLabel = et.SubElement(namespace, 'standardLabel', lang = 'en')
standardLabel.text = '5.0.4 (December 2011)'
print(et.tostring(namespace, pretty_print=True).decode('utf-8'))

<namespace>
  <standardLabel lang="en">5.0.4 (December 2011)</standardLabel>
</namespace>



In [116]:
### Add the version element (same text as the standard label)
#  and print the document built so far
version = et.SubElement(namespace, 'version')
version.text = '5.0.4 (December 2011)'
print(et.tostring(namespace, pretty_print=True).decode('utf-8'))

<namespace>
  <standardLabel lang="en">5.0.4 (December 2011)</standardLabel>
  <version>5.0.4 (December 2011)</version>
</namespace>



In [117]:
### Add the publishedAt element

### Convert the release date (month name + year, e.g. "December 2011")
#   to xs:dateTime, using the last second of the release month.
month_numbers = {"January": 1, "February": 2, "March": 3, "April": 4,
                 "May": 5, "June": 6, "July": 7, "August": 8,
                 "September": 9, "October": 10, "November": 11, "December": 12}

release_date = crm.get('releaseDate')
year_match = re.search(r"\d{4}", release_date or "")
month_match = re.search(r"[^\d\s]+", release_date or "")

if year_match is None or month_match is None \
        or month_match.group() not in month_numbers:
    # Fail loudly: merely printing would leave `date` undefined (or stale
    # from an earlier run) when the element is written below.
    raise ValueError("I don't understand " + str(release_date))

year = year_match.group()
month = month_match.group()

# calendar.monthrange() gives the month length for any year, leap years
# included; the previous hard-coded leap-year list only started at 2016.
last_day = calendar.monthrange(int(year), month_numbers[month])[1]
date = f"{year}-{month_numbers[month]:02d}-{last_day:02d}T23:59:59"

### Create and append the element.
# NOTE(review): the schema validation cells report 'publishedAt' as
# unexpected at this position — check the element order against the XSD.
released = et.SubElement(namespace, 'publishedAt')
released.text = date
print(et.tostring(namespace, pretty_print=True).decode('utf-8'))

<namespace>
  <standardLabel lang="en">5.0.4 (December 2011)</standardLabel>
  <version>5.0.4 (December 2011)</version>
  <publishedAt>2011-12-31T23:59:59</publishedAt>
</namespace>



In [118]:
### Add the (still empty) container elements 'classes' and 'properties'
#  and print the document built so far
classes = et.SubElement(namespace, 'classes')
properties = et.SubElement(namespace, 'properties')
print(et.tostring(namespace, pretty_print=True).decode('utf-8'))

<namespace>
  <standardLabel lang="en">5.0.4 (December 2011)</standardLabel>
  <version>5.0.4 (December 2011)</version>
  <publishedAt>2011-12-31T23:59:59</publishedAt>
  <classes/>
  <properties/>
</namespace>



In [119]:
### Two ways of listing the children of an element
#  list(element) replaces getchildren(), which lxml marks as deprecated
list(namespace), [t for t in namespace]

([<Element standardLabel at 0x7fa79c0489c0>,
  <Element version at 0x7fa79c0486c0>,
  <Element publishedAt at 0x7fa79c038e40>,
  <Element classes at 0x7fa79c019f80>,
  <Element properties at 0x7fa79c019280>],
 [<Element standardLabel at 0x7fa79c0489c0>,
  <Element version at 0x7fa79c0486c0>,
  <Element publishedAt at 0x7fa79c038e40>,
  <Element classes at 0x7fa79c019f80>,
  <Element properties at 0x7fa79c019280>])

In [120]:
### Validate the document produced so far
# Validation is expected to fail here, the 'classes' and 'properties' elements being still empty
# NOTE(review): the recorded output reports 'publishedAt' as unexpected instead — check the element order against the schema
try:
    xmlschema.assert_(namespace)
except Exception as e:
    print(e)

Element 'publishedAt': This element is not expected. Expected is one of ( contributors, referenceNamespace, classes, properties ).


### Récupérer les autres langues des labels.

In [121]:
### RDFS release of the same version, read for its labels in other languages
filerdf = 'data/cidoc_crm_v5.0.4_official_release.rdfs.rdf'
# Register the prefixes of the namespaces used in that file
et.register_namespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
et.register_namespace("rdfs", "http://www.w3.org/2000/01/rdf-schema#")
et.register_namespace('xml','http://www.w3.org/XML/1998/namespace')

In [122]:
### Open and parse the RDFS file; a failure is reported, not raised
try:
    xmlrdf = et.parse(filerdf)
except Exception as err:
    print(f'Error: {err}')
else:
    print(type(xmlrdf))

<class 'lxml.etree._ElementTree'>


In [123]:
### Go from the parsed _ElementTree to its root _Element,
#  so that the element-level etree methods can be used below
crm_rdfs = xmlrdf.getroot()
type(crm_rdfs)

lxml.etree._Element

In [124]:
### Collect, for every class/property, its labels in languages other than
#  English: {identifier: {lang: label}}. The identifier is the part of
#  rdf:about that precedes the first underscore.
rdf_about = "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about"
rdfs_label = './{http://www.w3.org/2000/01/rdf-schema#}label'
xml_lang = "{http://www.w3.org/XML/1998/namespace}lang"

lang_versions = {}

for child in crm_rdfs.findall('./*'):
    qname = child.get(rdf_about).split("_")[0]
    lang_versions[qname] = {
        label.get(xml_lang): label.text
        for label in child.findall(rdfs_label)
        if label.get(xml_lang) != "en"
    }
print(lang_versions)

{'E1': {'el': 'Οντότητα CIDOC CRM', 'de': 'CRM Entität', 'ru': 'CRM Сущность', 'fr': 'Entité CRM', 'pt': 'Entidade CRM'}, 'E2': {'fr': 'Entité temporelle', 'ru': 'Временная Сущность', 'el': 'Έγχρονη  Οντότητα', 'de': 'Geschehendes', 'pt': 'Entidade Temporal'}, 'E3': {'ru': 'Состояние', 'de': 'Zustandsphase', 'fr': 'État matériel', 'el': 'Κατάσταση', 'pt': 'Estado Material'}, 'E4': {'de': 'Phase', 'fr': 'Période', 'ru': 'Период', 'el': 'Περίοδος', 'pt': 'Período'}, 'E5': {'el': 'Συμβάν', 'fr': 'Événement', 'ru': 'Событие', 'de': 'Ereignis', 'pt': 'Evento'}, 'E6': {'ru': 'Разрушение', 'fr': 'Destruction', 'de': 'Zerstörung', 'el': 'Καταστροφή', 'pt': 'Destruição'}, 'E7': {'fr': 'Activité', 'de': 'Handlung', 'ru': 'Деятельность', 'el': 'Δράση', 'pt': 'Atividade'}, 'E8': {'fr': 'Acquisition', 'el': 'Απόκτηση', 'ru': 'Событие Приобретения', 'de': 'Erwerb', 'pt': 'Aquisição'}, 'E9': {'el': 'Μετακίνηση', 'de': 'Objektbewegung', 'ru': 'Перемещение', 'fr': 'Déplacement', 'pt': 'Locomoção'}, 'E1

In [141]:
### Sanity check: number of identifiers, then total number of labels collected
print(len(lang_versions))
count = sum(len(labels) for labels in lang_versions.values())
print(count)

344
1628


In [125]:
### Export the collected labels to a timestamped JSON file
dtjson = datetime.now()
tmspjson = dtjson.strftime("%Y%m%d_%H%M%S")
jsonfilen = f'data/output_{current_version}_{tmspjson}.json'
# ensure_ascii=False writes the non-ASCII labels (Greek, Cyrillic, ...)
# verbatim, so the file is opened explicitly as UTF-8 instead of relying
# on the platform default encoding.
with open(jsonfilen, 'w', encoding='utf-8') as jsf:
    json.dump(lang_versions, jsf, ensure_ascii = False)

### Ajouter les enfants des classes

In [126]:
### Reset the content of the 'classes' element
classes.clear()

In [127]:
### Reset the content of the 'classes' element, then fill it with one
#  'class' child per class of the source document.
classes.clear()
for c in classes_orig: #[7:9]:
    # the Python name carries an underscore because `class` is a keyword;
    # the XML tag itself can be given directly
    _class = et.SubElement(classes, "class")

    identifierInNamespace = et.SubElement(_class, 'identifierInNamespace')
    identifierInNamespace.text = c.get(key='id')

    standardLabel = et.SubElement(_class, 'standardLabel', lang="en")
    standardLabel.text = list(c.iterchildren(tag='className'))[0].text

    for scl in c.iterchildren(tag='subClassOf'):
        subClassOf = et.SubElement(_class, 'subClassOf')
        subClassOf.text = scl.get(key='id')

    textProperties = et.SubElement(_class, 'textProperties')

    scopeNote = et.SubElement(textProperties, 'scopeNote', lang="en")
    ### keeping HTML entities as in original: only the first scope note is
    #   copied, as text (alternative, not used: append the parsed <p> elements)
    for sn in list(c.iterchildren(tag='scopeNote'))[:1]:
        scopeNote.text = sn.text

    ### Examples are optional: a missing or unparsable block is reported and
    #   the loop goes on. The lookup itself is now guarded as well; it used to
    #   sit outside the try block and raised IndexError when absent.
    examples_l = c.find('examples')
    if examples_l is None:
        print(c.get(key='id'), 'no examples element', 0)
    else:
        try:
            exam_li = et.fromstring(examples_l.text)
            for li in exam_li.iterdescendants(tag='li'):
                example = et.SubElement(textProperties, 'example', lang="en")
                # keep the inner markup of the <li> as text: serialise once,
                # cut at the first closing tag (this also drops the tail)
                markup = et.tostring(li).decode('utf-8')
                end = markup.find('</li>')
                # an empty <li/> has no closing tag: store an empty example
                # rather than a truncated tag
                example.text = markup[:end].replace('<li>', '') if end != -1 else ''
        except Exception as e:
            print(c.get(key='id'), e, len(examples_l))

In [128]:
### Test output: print the whole document built so far
print(et.tostring(namespace, pretty_print=True).decode('utf-8'))

<namespace>
  <standardLabel lang="en">5.0.4 (December 2011)</standardLabel>
  <version>5.0.4 (December 2011)</version>
  <publishedAt>2011-12-31T23:59:59</publishedAt>
  <classes>
    <class>
      <identifierInNamespace>E1</identifierInNamespace>
      <standardLabel lang="en">CRM Entity</standardLabel>
      <textProperties>
        <scopeNote lang="en">&lt;p&gt;This class comprises all things in the universe of discourse of the CIDOC Conceptual Reference Model. &lt;/p&gt;&lt;p&gt;It is an abstract concept providing for three general properties:&lt;/p&gt;&lt;p&gt;Identification by name or appellation, and in particular by a preferred identifier&lt;/p&gt;&lt;p&gt;Classification by type, allowing further refinement of the specific subclass an instance belongs to &lt;/p&gt;&lt;p&gt;Attachment of free text for the expression of anything not captured by formal properties&lt;/p&gt;&lt;p&gt;With the exception of E59 Primitive Value, all other classes within the CRM are directly or indirect

In [129]:
### Validate the document produced so far
# NOTE(review): the original comment expected a failure because 'classes' and 'properties' are empty,
# but 'classes' has been filled by now and the recorded message is about 'publishedAt' — check against the schema
try:
    xmlschema.assert_(namespace)
except Exception as e:
    print(e)

Element 'publishedAt': This element is not expected. Expected is one of ( contributors, referenceNamespace, classes, properties ).


### Ajouter les enfants des propriétés

#### Test sur les cardinalités

In [130]:
### Patterns for a single cardinality character:
#  pr1 (minimum) accepts a digit 0-5, pr2 (maximum) a digit 1-5 or 'n'.
#  Ranges are used instead of comma-separated lists: inside a character
#  class the commas are literal, so '[0,1,2,3,4,5]' also matched ','.
pr1 = re.compile('[0-5]')
pr2 = re.compile('[1-5n]')

In [131]:
### Try both patterns on a sample pair: minimum '0', maximum 'n'
a = '0'
b = 'n'
all(len(value) == 1 and len(pattern.findall(value)) > 0
    for value, pattern in ((a, pr1), (b, pr2)))

True

#### Récupérer les propriétés

In [132]:
### Check that the 'property' elements can be retrieved:
#  show the type of the first match and the attributes of the first five
properties_orig = crm.xpath('./properties/property')
print(type(properties_orig[0]))
for c in properties_orig[:5]:
    print(c.attrib)


<class 'lxml.etree._Element'>
{'id': 'P1'}
{'id': 'P2'}
{'id': 'P3'}
{'id': 'P4'}
{'id': 'P5'}


In [133]:
### Preparing the handling of the quantifiers: for a few sample properties,
#  print the characters at positions 0, 2, 4 and 6 of the first
#  parenthesised group found in the quantification text.
for p in properties_orig[7:12]:
    txt = list(p.iterchildren('quantification'))[0].text
    # raw string: "\(" is an invalid escape sequence in a normal literal
    x = re.findall(r"\(.{,10}\)", txt)
    if len(x):
        # remove the enclosing parentheses, then surrounding whitespace
        # (str.strip takes a set of characters, not a regex: the former
        # '\s\(\)' argument stripped backslashes and the letter 's')
        a = x[0].strip('()').strip()
        for position in (0, 2, 4, 6):
            print(a[position])
        

0
n
0
1
0
n
0
n
0
n
0
n
1
n
0
n
1
n
0
1


In [134]:
### Reset the content of the 'properties' element, then fill it with one
#  'property' child per property of the source document.
properties.clear()

# First parenthesised group of the quantification text; raw string because
# "\(" is an invalid escape sequence in a normal literal.
quantifier_re = re.compile(r"\(.{,10}\)")

for p in properties_orig: #[7:12]:
    # the Python name carries an underscore because `property` is a builtin;
    # the XML tag itself can be given directly
    _property = et.SubElement(properties, "property")

    identifierInNamespace = et.SubElement(_property, 'identifierInNamespace')
    p_id = p.get(key='id')
    identifierInNamespace.text = p_id

    label = et.SubElement(_property, 'label', lang="en")
    standardLabel = et.SubElement(label, 'standardLabel')
    standardLabel.text = list(p.iterchildren(tag='directName'))[0].text

    # the inverse label is always emitted, empty when the source has none
    inverseLabel = et.SubElement(label, 'inverseLabel')
    inverseName = p.find('inverseName')
    inverseLabel.text = inverseName.text if inverseName is not None else ''

    for spo in p.iterchildren(tag='subPropertyOf'):
        subPropertyOf = et.SubElement(_property, 'subPropertyOf')
        # removes every 'i' from the id (presumably the suffix of an
        # inverse-property id — TODO confirm)
        subPropertyOf.text = spo.get(key='id').replace('i', '')

    hasDomain = et.SubElement(_property, 'hasDomain')
    hasDomain.text = list(p.iterchildren(tag='domain'))[0].get(key='id')

    hasRange = et.SubElement(_property, 'hasRange')
    hasRange.text = list(p.iterchildren(tag='range'))[0].get(key='id')

    ### Quantifiers: characters 0 and 2 of the group give the range min/max,
    #   characters 4 and 6 the domain min/max.
    quantification = p.find('quantification')
    txt = ''
    if quantification is not None and quantification.text:
        txt = quantification.text
    x = quantifier_re.findall(txt)
    if len(x):
        # remove the enclosing parentheses, then surrounding whitespace
        # (str.strip takes a set of characters, not a regex)
        a = x[0].strip('()').strip()
        # slices instead of indexes: a too-short group yields '' and is
        # reported below instead of raising IndexError
        a_0, a_2, a_4, a_6 = (a[i:i + 1] for i in (0, 2, 4, 6))

        if len(a_4) == 1 and len(pr1.findall(a_4)) > 0 \
                and len(a_6) == 1 and len(pr2.findall(a_6)) > 0:
            domainInstancesMinQuantifier = et.SubElement(
                _property, 'domainInstancesMinQuantifier')
            domainInstancesMinQuantifier.text = a_4
            domainInstancesMaxQuantifier = et.SubElement(
                _property, 'domainInstancesMaxQuantifier')
            domainInstancesMaxQuantifier.text = a_6
        else:
            print(f'domain cardinality issue property {p_id}')

        if len(a_0) == 1 and len(pr1.findall(a_0)) > 0 \
                and len(a_2) == 1 and len(pr2.findall(a_2)) > 0:
            rangeInstancesMinQuantifier = et.SubElement(
                _property, 'rangeInstancesMinQuantifier')
            rangeInstancesMinQuantifier.text = a_0
            rangeInstancesMaxQuantifier = et.SubElement(
                _property, 'rangeInstancesMaxQuantifier')
            rangeInstancesMaxQuantifier.text = a_2
        else:
            print(f'range cardinality issue property {p_id}')

    textProperties = et.SubElement(_property, 'textProperties')

    scopeNote = et.SubElement(textProperties, 'scopeNote', lang="en")
    ### keeping HTML entities as in original: only the first scope note is
    #   copied, as text. It is read from the current property `p`; the
    #   previous code read it from `c`, a variable left over from an earlier
    #   cell, so every property received the same scope note.
    for sn in list(p.iterchildren(tag='scopeNote'))[:1]:
        scopeNote.text = sn.text

    ### Examples are optional: a missing or unparsable block is reported and
    #   the loop goes on (alternative, not used: append the <li> elements
    #   themselves to keep the HTML tags).
    examples_l = p.find('examples')
    if examples_l is None:
        print(p_id, 'no examples element', 0)
    else:
        try:
            exam_li = et.fromstring(examples_l.text)
            for li in exam_li.iterdescendants(tag='li'):
                example = et.SubElement(textProperties, 'example', lang="en")
                # keep the inner markup of the <li> as text: serialise once,
                # cut at the first closing tag (this also drops the tail)
                markup = et.tostring(li).decode('utf-8')
                end = markup.find('</li>')
                # an empty <li/> has no closing tag: store an empty example
                # rather than a truncated tag
                example.text = markup[:end].replace('<li>', '') if end != -1 else ''
        except Exception as e:
            print(p_id, e, len(examples_l))

In [None]:
# print(et.tostring(namespace, pretty_print=True).decode('utf-8'))

In [135]:
### Validate the document produced so far
# NOTE(review): the original comment expected a failure because 'classes' and 'properties' are empty,
# but both have been filled by now and the recorded message is about 'publishedAt' — check against the schema
try:
    xmlschema.assert_(namespace)
except Exception as e:
    print(e)

Element 'publishedAt': This element is not expected. Expected is one of ( contributors, referenceNamespace, classes, properties ).


In [136]:
### Write the document to a timestamped file
tmsp = datetime.now().strftime("%Y%m%d_%H%M%S")
# tmsp = ''
filename = f'data/output_{current_version}_{tmsp}.xml'
### write() exists on _ElementTree, not on _Element: fetch the tree that
#   owns the root element, then serialise it with an XML declaration
tree = namespace.getroottree()
tree.write(
    filename,
    pretty_print=True,
    xml_declaration=True,
    encoding="utf-8",
)

In [28]:
### Check that the output document is well-formed by parsing it back.
# et.parse() reads the file itself and honours the XML declaration and its
# encoding, so the text no longer has to be read with the platform default
# encoding and stripped of its first line before parsing.
try:
    test_xmlf = et.parse(filename).getroot()
    print(type(test_xmlf))
except Exception as e:
    print('Error: ' + str(e))
    

<class 'lxml.etree._Element'>


In [137]:
### Display the re-parsed output document
# NOTE(review): needs test_xmlf, which is only defined once the previous cell has run successfully
print(et.tostring(test_xmlf, pretty_print=True).decode('utf-8'))

NameError: name 'test_xmlf' is not defined