Skip to content

Commit

Permalink
mvp-HXLTM (#16), EticaAI/HXL-Data-Science-file-formats#19: hxltm2xlif…
Browse files Browse the repository at this point in the history
…f v0.7, --archivum-extensionem=.csv
  • Loading branch information
fititnt committed Jun 27, 2021
1 parent 10abda5 commit 25fe905
Showing 1 changed file with 60 additions and 15 deletions.
75 changes: 60 additions & 15 deletions _systema/programma/hxltm2xliff.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,19 @@
# COMPANY: EticaAI
# LICENSE: Public Domain dedication
# SPDX-License-Identifier: Unlicense
# VERSION: v0.6
# VERSION: v0.7
# CREATED: 2021-06-27 19:50 UTC v0.5, de github.com/EticaAI
# /HXL-Data-Science-file-formats/blob/main/bin/hxl2example
# REVISION: 2021-06-27 19:50 UTC v0.6 de hxl2tab
# REVISION: 2021-06-27 21:16 UTC v0.6 de hxl2tab
# REVISION: 2021-06-27 23:53 UTC v0.7 --archivum-extensionem=.csv
# ==============================================================================

# Tests
# ./_systema/programma/hxltm2xliff.py --help
# ./_systema/programma/hxltm2xliff.py _hxltm/schemam-un-htcds.tm.hxl.csv
# ./_systema/programma/hxltm2xliff.py _hxltm/schemam-un-htcds-5items.tm.hxl.csv

__VERSION__ = "v0.6"
__VERSION__ = "v0.7"

import sys
import os
Expand Down Expand Up @@ -226,7 +227,8 @@ def hxltm2csv(self, hxlated_input, tab_output, is_stdout, args):
)

if is_stdout:
txt_writer = csv.writer(sys.stdout, delimiter='\t')
# txt_writer = csv.writer(sys.stdout, delimiter='\t')
txt_writer = csv.writer(sys.stdout)
txt_writer.writerow(header_new)
for line in csv_reader:
txt_writer.writerow(line)
Expand All @@ -237,7 +239,8 @@ def hxltm2csv(self, hxlated_input, tab_output, is_stdout, args):
tab_output_cleanup.close()

with open(tab_output, 'a') as new_txt:
txt_writer = csv.writer(new_txt, delimiter='\t')
# txt_writer = csv.writer(new_txt, delimiter='\t')
txt_writer = csv.writer(new_txt)
txt_writer.writerow(header_new)
for line in csv_reader:
txt_writer.writerow(line)
Expand Down Expand Up @@ -284,8 +287,16 @@ def hxltm2csv_header(self, hxlated_header, fontem_linguam, objectivum_linguam):
XLIFF translation pair
[eng-Latn]_
#item+id -> #x_xliff+unit+id
#meta+archivum -> #x_xliff+file
#item+id -> #x_xliff+unit+id
#meta+archivum -> #x_xliff+file
[contextum: XLIFF srcLang]
#item(*)+i_ZZZ+is_ZZZZ -> #x_xliff+source+i_ZZZ+is_ZZZZ
#status(*)+i_ZZZ+is_ZZZZ+xliff -> #meta+x_xliff+segment_source+state+i_ZZZ+is_ZZZZ (XLIFF doesn't support this)
[contextum: XLIFF trgLang]
#item(*)+i_ZZZ+is_ZZZZ -> #x_xliff+target+i_ZZZ+is_ZZZZ
#status(*)+i_ZZZ+is_ZZZZ+xliff -> #x_xliff+segment+state+i_ZZZ+is_ZZZZ
"""

# TODO: improve this block. I'm very sure there is some cleaner way to
Expand All @@ -301,18 +312,52 @@ def hxltm2csv_header(self, hxlated_header, fontem_linguam, objectivum_linguam):
fon_ling = HXLTM2XLIFF.linguam_2_hxlattrs(fontem_linguam)
obj_ling = HXLTM2XLIFF.linguam_2_hxlattrs(objectivum_linguam)

# print('fon_ling', fon_ling)
# print('obj_ling', obj_ling)

for idx, _ in enumerate(hxlated_header):

# feature types
if hxlated_header[idx] == '#item+id':

# hxlated_header[idx] = '#item+id+xliff_segment_id'
hxlated_header[idx] = '#x_xliff+unit+id'
# hxlated_header[idx] = 'D' + hxlated_header[idx]
continue

elif hxlated_header[idx] == '#meta+archivum':
hxlated_header[idx] = '#x_xliff+file'
elif True:
break
continue

elif hxlated_header[idx].startswith('#item'):

if hxlated_header[idx].find(fon_ling) > -1 and \
not hxlated_header[idx].find('+list') > -1:
hxlated_header[idx] = '#x_xliff+source' + fon_ling
elif hxlated_header[idx].find(obj_ling) > -1 and \
not hxlated_header[idx].find('+list') > -1:
hxlated_header[idx] = '#x_xliff+target' + obj_ling

continue

elif hxlated_header[idx].startswith('#status'):
if hxlated_header[idx].find(fon_ling) > -1 and \
not hxlated_header[idx].find('+list') > -1:
# TODO: maybe just ignore the source state? XLIFF does not
# support translations from source languages that
# are not ideally ready yet
if hxlated_header[idx].find('+xliff') > -1:
hxlated_header[idx] = '#x_xliff+segment+state' + fon_ling
elif hxlated_header[idx].find(obj_ling) > -1 and \
not hxlated_header[idx].find('+list') > -1:
if hxlated_header[idx].find('+xliff') > -1:
hxlated_header[idx] = '#x_xliff+segment+state' + obj_ling
if hxlated_header[idx] != '#status':
print('#status ERROR?, FIX ME', hxlated_header[idx])
continue

elif hxlated_header[idx].startswith('#meta'):
continue
# print('TODO')
# elif True:
# break
# elif hxlated_header[idx].find('+vt_orange_type_discrete') > -1 \
# or hxlated_header[idx].find('+vt_categorical') > -1:

Expand Down Expand Up @@ -413,9 +458,9 @@ def linguam_2_hxlattrs(linguam):
Example:
>>> HXLTM2XLIFF.linguam_2_hxlattrs('por-Latn')
i_por+is_latn
+i_por+is_latn
>>> HXLTM2XLIFF.linguam_2_hxlattrs('arb-Arab')
i_arb+is_Arab
+i_arb+is_arab
Args:
linguam ([String]): A linguam code
Expand All @@ -424,7 +469,7 @@ def linguam_2_hxlattrs(linguam):
[String]: HXL Attributes
"""
iso6393, iso115924 = list(linguam.lower().split('-'))
return 'i_' + iso6393 + '+is_' + iso115924
return '+i_' + iso6393 + '+is_' + iso115924


class HXLUtils:
Expand Down

0 comments on commit 25fe905

Please sign in to comment.