# Multepal: Colop Poetic K'iche' Edition Line Encoding
* **Created by:** Aldo Barriente
* **Date:** 06/25/2021

## Set Up

In [1]:
import re
import xml.etree.ElementTree as et
from xml.sax.saxutils import escape

In [6]:
# Base name (without extension) of the plain-text file that gets encoded.
text_prefix = 'colop_pk_text'

# Input / output directory settings.
# NOTE(review): neither directory is referenced by the cells visible here,
# which read and write in the working directory -- confirm whether they should be.
data_in, data_out = './source/', './encoding_output/'

## Importing text

In [41]:
# Read the plain-text edition into a list of lines; readlines() keeps each
# line's trailing newline, which the encoding step relies on to spot blank lines.
# The encoding is pinned so the result does not depend on the platform default
# (assumes the transcription is saved as UTF-8; plain ASCII is a subset).
with open(f'{text_prefix}.txt', encoding='utf-8') as colop_plaintext:
    text = colop_plaintext.readlines()

In [42]:
# Display the raw lines exactly as they were read from the file.
text

["NAB'E\n",
 '\n',
 "ARE' UXE' OJER TZIJ WARAL K'ICHE' UB'I'\n",
 '\n',
 "Waral xchiqatz'ib'aj wi,\n",
 "      xchiqatikib'a' wi ojer tzij\n",
 "utikarib'al,\n",
 "uxe'nab'al puch ronojel xb'an pa tinamit K'iche'\n",
 "                                 ramaq' K'iche' winaq.\n",
 "Are k'ut xchiqak'am wi:\n",
 "uk'utunisaxik,\n",
 "uq'alajob'isaxik,\n",
 'utzijoxik puch\n',
 "ewaxib'al,\n",
 "saqirib'al rumal Tz'aqol,\n",
 "                 B'itol,\n",
 '                 Alom,\n',
 "                 K'ajolom kib'i'.\n",
 "                 Junajpu Wuch',\n",
 '                 Junajpu Utiw,\n',
 '                 Saqinim Aq\n',
 '                         Sis\n',
 '                 Tepew,\n',
 "                 Q'ukumatz,\n",
 "                 Uk'ux Cho,\n",
 "                 Uk'ux Palo,\n",
 '                 Ajraxa Laq,\n',
 "                 Ajraxa Tzel. Chucha'xik,\n",
 "                              rachb'ixik,\n",
 '                              rachtzijoxik ri Iyom,\n',
 '         

## Encoding lines

In [43]:
def encode_line(line):
    """Encode one plain-text line of the edition as a markup string.

    Parameters
    ----------
    line : str
        A raw line as produced by ``readlines()``; it may start with
        alignment whitespace and end with a newline.

    Returns
    -------
    str
        ``'<lb /> <!-- potential div marker -->'`` for a line with no visible
        content; ``'<l>...</l>'`` for a flush-left line; and
        ``'<space quantity="N" unit="chars" /><l>...</l>'`` for an indented
        line, where N is the number of leading whitespace characters.
    """
    content = line.strip()
    if not content:
        # Blank lines, including ones holding only spaces/tabs, are break
        # markers. Without this check a whitespace-only line would be encoded
        # as an empty <l></l> whose quantity also counts the newline.
        return '<lb /> <!-- potential div marker -->'

    # Escape &, < and > so the generated markup stays well-formed XML.
    encoded = '<l>' + escape(content) + '</l>'

    # Leading alignment whitespace is recorded as a <space /> element.
    offset = re.match(r'\s+', line)
    if offset is not None:
        quantity = offset.end(0)  # number of leading whitespace characters
        encoded = '<space quantity="' + str(quantity) + '" unit="chars" />' + encoded
    return encoded


# Rebind `text` to the encoded lines (later cells read `text`).
# This overwrites the raw input: re-run the import cell before re-running this
# one, otherwise the already-encoded lines would be wrapped a second time.
text = [encode_line(line) for line in text]

In [44]:
# Display the lines after encoding to check the generated markup.
text

["<l>NAB'E</l>",
 '<lb /> <!-- potential div marker -->',
 "<l>ARE' UXE' OJER TZIJ WARAL K'ICHE' UB'I'</l>",
 '<lb /> <!-- potential div marker -->',
 "<l>Waral xchiqatz'ib'aj wi,</l>",
 '<space quantity="6" unit="chars" /><l>xchiqatikib\'a\' wi ojer tzij</l>',
 "<l>utikarib'al,</l>",
 "<l>uxe'nab'al puch ronojel xb'an pa tinamit K'iche'</l>",
 '<space quantity="33" unit="chars" /><l>ramaq\' K\'iche\' winaq.</l>',
 "<l>Are k'ut xchiqak'am wi:</l>",
 "<l>uk'utunisaxik,</l>",
 "<l>uq'alajob'isaxik,</l>",
 '<l>utzijoxik puch</l>',
 "<l>ewaxib'al,</l>",
 "<l>saqirib'al rumal Tz'aqol,</l>",
 '<space quantity="17" unit="chars" /><l>B\'itol,</l>',
 '<space quantity="17" unit="chars" /><l>Alom,</l>',
 '<space quantity="17" unit="chars" /><l>K\'ajolom kib\'i\'.</l>',
 '<space quantity="17" unit="chars" /><l>Junajpu Wuch\',</l>',
 '<space quantity="17" unit="chars" /><l>Junajpu Utiw,</l>',
 '<space quantity="17" unit="chars" /><l>Saqinim Aq</l>',
 '<space quantity="25" unit="chars" /><l>Sis</l>'

## Exporting encoding

In [51]:
# Write the encoded lines to the output file, one per line.
# The file is opened in append mode ('a'), so existing content is kept and
# re-running this cell adds the same lines again.
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform default.
with open('cpk_test.xml', 'a', encoding='utf-8') as cpk_xml:
    cpk_xml.writelines(line + '\n' for line in text)

## Line group (`<lg>`) and line (`<l>`) numbers

In [31]:
# TEI namespace URI wrapped in braces, used as a tag prefix in the
# ElementTree path expressions below.
namespace = '{http://www.tei-c.org/ns/1.0}'

# Parse the XML edition from the working directory.
colop_tree = et.parse('colop-pk.xml')

In [32]:
# Top-level element of the parsed document; the path queries start from here.
root = colop_tree.getroot()

In [45]:
# Print the attribute dictionary of every <lg> element that is a child of a
# direct child of <body>. The `*` step matches any direct child of <body>;
# findall() yields the matches in document order.
for line_group in root.findall(f'./{namespace}text/{namespace}body/*/{namespace}lg'):
    print(line_group.attrib)

{'n': '1', '{http://www.w3.org/XML/1998/namespace}id': 'lg01', 'type': 'seccion'}
