In [1]:
#import des librairies
from collatex import *
from lxml import etree
from xml.etree import ElementTree as et
import os
import json

In [2]:
# Parse the three witness files (A, B and C) into lxml element trees
A1, B1, C1 = map(
    etree.parse,
    (
        'textes/AsansLemmes.xml',
        'textes/BsansLemmes.xml',
        'textes/CsansLemmes.xml',
    ),
)

In [3]:
# Conversion of one XML witness into the pre-tokenised JSON structure that
# CollateX takes as input; an XSLT stylesheet performs the tokenisation.
def XMLtoJson(id,xmlInput):
    """Build the CollateX witness dictionary for one XML document.

    Parameters:
        id: siglum stored under the 'id' key of the witness.
        xmlInput: parsed XML document (as returned by ``etree.parse``) that
            is handed to the tokenising stylesheet.

    Returns:
        A dictionary ``{'id': id, 'tokens': [{'t': word}, ...]}`` with one
        entry per ``<w>`` element produced by the tokenisation, in document
        order.
    """
    # The witness is a plain dictionary; its first entry is the siglum
    witness = {}
    witness['id'] = id
    # Tokenising stylesheet: every text node is cut into <w> elements, one per
    # word; commas and full stops are discarded and a word containing a
    # typographic apostrophe (’) is split into the parts before and after it.
    # The text node is normalised before being tokenised (punctuation turned
    # into spaces, whitespace collapsed): the recursion advances on literal
    # single spaces, so un-normalised input (leading blanks, newlines, a comma
    # glued to the next word) would otherwise repeat or silently drop words.
    tokenization = etree.XML("""
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="xml" indent="yes" encoding="UTF-8" omit-xml-declaration="yes"/>

    <xsl:template match="/*">
        <xsl:copy>
            <xsl:apply-templates select="@*"/>
            <xsl:apply-templates/>
        </xsl:copy>
    </xsl:template>

    <xsl:template match="text()">
        <xsl:call-template name="tokenize">
            <xsl:with-param name="param1" select="normalize-space(translate(., ',.', '  '))" />
        </xsl:call-template>
    </xsl:template>

    <xsl:template name="tokenize">
        <xsl:param name="param1" />
        <xsl:variable name="Pretoken" select="normalize-space(
            substring-before(concat( $param1, ','), ','))" />
        <xsl:variable name="Pretoken2" select="normalize-space(
            substring-before(concat( $Pretoken, '.'), '.'))" />
        <xsl:variable name="token" select="normalize-space(
            substring-before(concat($Pretoken2, ' '), ' ')
            )" />

        <xsl:if test="$token">
        <xsl:choose>
            <xsl:when test="not(contains($token,  '’'))">
                <w>
                    <xsl:value-of select="$token" />
                </w>

            </xsl:when>
            <xsl:when test="contains($token,  '’')">
                <w>
                    <xsl:value-of select="substring-before($token,'’')"/>
                </w>
                <w>
                    <xsl:value-of select="substring-after($token,'’')"/>
                </w>
            </xsl:when>
        </xsl:choose>
            <xsl:call-template name="tokenize">
                <xsl:with-param name="param1" select="substring-after($param1, ' ')" />
            </xsl:call-template>
        </xsl:if>

    </xsl:template>

</xsl:stylesheet>

""")
    transformTok = etree.XSLT(tokenization)
    tokenised = transformTok(xmlInput)
    # Read the tokens straight from the <w> elements instead of serialising
    # them into a JSON string and re-parsing it: a word containing a double
    # quote or a backslash can then no longer break or corrupt the result.
    # An empty <w/> (e.g. nothing after an apostrophe) has no text node, hence
    # the fallback to the empty string.
    witness['tokens'] = [
        {'t': w.text or ''} for w in tokenised.getroot().iter('w')
    ]
    return witness

In [4]:
# Top-level CollateX input: a dictionary whose 'witnesses' entry holds one
# converted witness per parsed document, in the order A, B, C
json_input = {
    'witnesses': [
        XMLtoJson(sigle, arbre)
        for sigle, arbre in (('A', A1), ('B', B1), ('C', C1))
    ]
}
print(json_input)
# The JSON input has been built

{'witnesses': [{'id': 'A', 'tokens': [{'t': 'Mais'}, {'t': 'au'}, {'t': 'povre'}, {'t': 'home'}, {'t': 'doit'}, {'t': 'l'}, {'t': 'en'}, {'t': 'doner'}, {'t': 'tex'}, {'t': 'choses'}, {'t': 'qui'}, {'t': 'soient'}, {'t': 'plus'}, {'t': 'boenes'}, {'t': 'que'}, {'t': 'beles'}, {'t': 'et'}, {'t': 'plus'}, {'t': 'porfitables'}, {'t': 'que'}, {'t': 'plaisanz'}, {'t': 'car'}, {'t': 'povretez'}, {'t': 'n'}, {'t': 'a'}, {'t': 'mestier'}, {'t': 'que'}, {'t': 'd'}, {'t': 'amendement'}, {'t': 'et'}, {'t': 'richece'}, {'t': 'n'}, {'t': 'a'}, {'t': 'mestier'}, {'t': 'que'}, {'t': 'de'}, {'t': 'delit'}, {'t': 'Ne'}, {'t': 'tex'}, {'t': 'choses'}, {'t': 'nefont'}, {'t': 'mie'}, {'t': 'a'}, {'t': 'doner'}, {'t': 'a'}, {'t': 'toz'}, {'t': 'car'}, {'t': 'en'}, {'t': 'ne'}, {'t': 'doit'}, {'t': 'doner'}, {'t': 'a'}, {'t': 'home'}, {'t': 'chose'}, {'t': 'dom'}, {'t': 'il'}, {'t': 'ait'}, {'t': 'assez'}, {'t': 'Ensint'}, {'t': 'e'}, {'t': 'covanra'}, {'t': 'doner'}, {'t': 'se'}, {'t': 'tu'}, {'t': 'viaus'

In [5]:
# Collate the witnesses into an alignment table, with segmentation disabled
coll = collate(
    json_input,
    output="table",
    segmentation=False,
)
coll

<collatex.core_classes.AlignmentTable at 0x7fddf7782e80>

In [6]:
# Custom export of an alignment table as a sequence of <app> elements
def notre_export_xml(table):
    """Serialise an alignment table as concatenated ``<app>`` elements.

    Each column of ``table.columns`` becomes one ``<app>``; inside it, every
    witness present in ``column.tokens_per_witness`` gets a ``<rdg>`` whose
    ``wit`` attribute is ``#`` followed by the witness key and whose text is
    the concatenation of the ``"t"`` values of its tokens. Witnesses are
    emitted in sorted key order.

    Returns the serialised elements joined into a single string (no wrapping
    root element).
    """
    def serialise_column(column):
        # One <app> per column, one <rdg> child per witness
        app = et.Element('app')
        readings = column.tokens_per_witness
        for siglum in sorted(readings):
            rdg = et.SubElement(app, 'rdg', {'wit': "#" + siglum})
            rdg.text = "".join(
                str(token.token_data["t"]) for token in readings[siglum]
            )
        return et.tostring(app, encoding="unicode")

    return "".join(map(serialise_column, table.columns))

In [7]:
# Serialise the alignment table into a string of <app> elements
collResult = notre_export_xml(coll) 

In [8]:
# Wrap the serialised <app> elements in a single <text> root element.
# collResult is already one string, so it is concatenated directly
# (joining it again would only iterate over its characters).
docCollResult = "<text>" + collResult + "</text>"

In [9]:
# Display the assembled XML string to check the result
docCollResult

'<text><app><rdg wit="#A">Mais</rdg><rdg wit="#B">Mais</rdg><rdg wit="#C">Mais</rdg></app><app><rdg wit="#A">au</rdg><rdg wit="#B">aux</rdg><rdg wit="#C">au</rdg></app><app><rdg wit="#A">povre</rdg><rdg wit="#B">povres</rdg><rdg wit="#C">povre</rdg></app><app><rdg wit="#A">home</rdg><rdg wit="#B">hommes</rdg><rdg wit="#C">home</rdg></app><app><rdg wit="#A">doit</rdg><rdg wit="#B">doit</rdg><rdg wit="#C">doit</rdg></app><app><rdg wit="#A">l</rdg><rdg wit="#B">on</rdg><rdg wit="#C">l</rdg></app><app><rdg wit="#A">en</rdg><rdg wit="#B">telles</rdg><rdg wit="#C">en</rdg></app><app><rdg wit="#A">doner</rdg><rdg wit="#C">douner</rdg></app><app><rdg wit="#A">tex</rdg><rdg wit="#C">tex</rdg></app><app><rdg wit="#A">choses</rdg><rdg wit="#B">choses</rdg><rdg wit="#C">choses</rdg></app><app><rdg wit="#B">donner</rdg></app><app><rdg wit="#A">qui</rdg><rdg wit="#B">qui</rdg><rdg wit="#C">qui</rdg></app><app><rdg wit="#A">soient</rdg><rdg wit="#B">soient</rdg><rdg wit="#C">soient</rdg></app><app><r

In [10]:
# Write the result (a plain character string) to the output XML file.
# The encoding is given explicitly: the string carries no XML declaration, so
# XML parsers will read the file as UTF-8, whereas open() would otherwise use
# the platform's locale encoding.
with open("resultat_collation_depuisXML3SansL.xml", "w", encoding="utf-8") as text_file:
    text_file.write(docCollResult)