Skip to content

Commit

Permalink
mvp-HXLTM (#16), EticaAI/HXL-Data-Science-file-formats#19: hxltm2xlif…
Browse files Browse the repository at this point in the history
…f, formato XML do XLIFF, quase MVP
  • Loading branch information
fititnt committed Jun 28, 2021
1 parent d164727 commit c54728f
Show file tree
Hide file tree
Showing 3 changed files with 146 additions and 28 deletions.
1 change: 1 addition & 0 deletions _hxltm/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
*#
out/*
!out/.gitkeep
*.zip
33 changes: 33 additions & 0 deletions _hxltm/exemplum/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -285,3 +285,36 @@ sh /opt/okapi/tikal.sh -x _hxltm/out/schemam-un-htcds_eng-Latn--por-Latn.tsv -gs
## >>> https://bitbucket.org/okapiframework/okapi/issues/1053/an-error-occurred-when-extracting-from-the
```



<!--
- /workspace/git/HXL-CPLP/Auxilium-Humanitarium-API/_hxltm/schemam-un-htcds-5items.tm.hxl.zip
./_systema/programma/hxltm2xliff.py _hxltm/schemam-un-htcds.tm.hxl.csv
'utf-8' codec can't decode byte 0xd0 in position 4094: unexpected end of data
'utf-8' codec can't decode byte 0xd0 in position 4094: unexpected end of data
'utf-8' codec can't decode byte 0xd0 in position 4094: unexpected end of data
'utf-8' codec can't decode byte 0xd0 in position 4094: unexpected end of data
'utf-8' codec can't decode byte 0xd0 in position 4094: unexpected end of data
'utf-8' codec can't decode byte 0xd0 in position 4094: unexpected end of data
'utf-8' codec can't decode byte 0xd0 in position 4094: unexpected end of data
Traceback (most recent call last):
File "./_systema/programma/hxltm2xliff.py", line 766, in <module>
hxltm2xliff.execute_cli(args)
File "./_systema/programma/hxltm2xliff.py", line 185, in execute_cli
with self.hxlhelper.make_source(args, stdin) as source, \
File "./_systema/programma/hxltm2xliff.py", line 670, in make_source
input = self.make_input(args, stdin)
File "./_systema/programma/hxltm2xliff.py", line 689, in make_input
return hxl.io.make_input(
File "/home/fititnt/.local/lib/python3.8/site-packages/hxl/io.py", line 442, in make_input
return CSVInput(input, encoding=encoding)
File "/home/fititnt/.local/lib/python3.8/site-packages/hxl/io.py", line 763, in __init__
delimiter = CSVInput._detect_delimiter(input, encoding)
File "/home/fititnt/.local/lib/python3.8/site-packages/hxl/io.py", line 797, in _detect_delimiter
lines = re.split(r'\r?\n', sample)
UnboundLocalError: local variable 'sample' referenced before assignment
-->
140 changes: 112 additions & 28 deletions _systema/programma/hxltm2xliff.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,12 @@
# ==============================================================================

# Tests
# Exemplos: https://github.com/oasis-tcs/xliff-xliff-22/blob/master/xliff-21/test-suite/core/valid/allExtensions.xlf
# ./_systema/programma/hxltm2xliff.py --help
# ./_systema/programma/hxltm2xliff.py _hxltm/schemam-un-htcds.tm.hxl.csv
# ./_systema/programma/hxltm2xliff.py _hxltm/schemam-un-htcds-5items.tm.hxl.csv
# ./_systema/programma/hxltm2xliff.py _hxltm/schemam-un-htcds.tm.hxl.csv --fontem-linguam=eng-Latn


__VERSION__ = "v0.7"

Expand Down Expand Up @@ -302,36 +305,91 @@ def hxltm2xliff(self, hxlated_input, tab_output, is_stdout, args):
# maybe csv.DictReader?
# @see https://www.geeksforgeeks.org/convert-csv-to-json-using-python/
# @see https://docs.python.org/3/library/csv.html#csv.DictReader
datum = []

with open(hxlated_input, 'r') as csv_file:
csv_reader = csv.reader(csv_file)

# # Hotfix: skip first non-HXL header. Ideally I think the already
# # exported HXlated file should already save without headers.
# next(csv_reader)
header_original = next(csv_reader)
header_new = self.hxltm2csv_header(
header_original,
fontem_linguam=args.fontem_linguam,
objectivum_linguam=args.objectivum_linguam,
)

if is_stdout:
txt_writer = csv.writer(sys.stdout, delimiter='\t')
txt_writer.writerow(header_new)
for line in csv_reader:
txt_writer.writerow(line)
else:

tab_output_cleanup = open(tab_output, 'w')
tab_output_cleanup.truncate()
tab_output_cleanup.close()

with open(tab_output, 'a') as new_txt:
txt_writer = csv.writer(new_txt, delimiter='\t')
txt_writer.writerow(header_new)
for line in csv_reader:
txt_writer.writerow(line)
csvReader = csv.DictReader(csv_file)

# Convert each row into a dictionary
# and add it to data
for item in csvReader:

datum.append(HXLTM2XLIFF.hxltm_item_relevant_options(item))

resultatum = []
resultatum.append('<?xml version="1.0"?>')
resultatum.append(
'<xliff xmlns="urn:oasis:names:tc:xliff:document:2.0" version="2.0" srcLang="en" trgLang="fr">')
resultatum.append(' <file id="f1">')

num = 0

for rem in datum:
num += 1
# unit_id = rem['#x_xliff+unit+id'] if rem.has_key('#x_xliff+unit+id') else num
unit_id = rem['#x_xliff+unit+id'] if rem['#x_xliff+unit+id'] else num
resultatum.append(' <unit id="' + unit_id + '">')

resultatum.append(' <segment>')

xsource = HXLTM2XLIFF.hxltm_item_xliff_source_key(rem)
if xsource:
if not rem[xsource]:
resultatum.append(' <!-- ERROR source ' + unit_id + ', ' + xsource + '-->')
print('ERROR:', unit_id, xsource)
# continue
else:
resultatum.append(' <source>' + rem[xsource] + '</source>')

xtarget = HXLTM2XLIFF.hxltm_item_xliff_target_key(rem)
if xtarget and rem[xtarget]:
resultatum.append(' <target>' + rem[xtarget] + '</target>')

resultatum.append(' </segment>')

resultatum.append(' </unit>')

resultatum.append(' </file>')
resultatum.append('</xliff>')

# print('datum', datum)
# print('')
# print('')
# print('resultatum')
# print('resultatum', resultatum)
for ln in resultatum:
print (ln)


# with open(hxlated_input, 'r') as csv_file:
# csv_reader = csv.reader(csv_file)

# # # Hotfix: skip first non-HXL header. Ideally I think the already
# # # exported HXlated file should already save without headers.
# # next(csv_reader)
# header_original = next(csv_reader)
# header_new = self.hxltm2csv_header(
# header_original,
# fontem_linguam=args.fontem_linguam,
# objectivum_linguam=args.objectivum_linguam,
# )

# if is_stdout:
# txt_writer = csv.writer(sys.stdout, delimiter='\t')
# txt_writer.writerow(header_new)
# for line in csv_reader:
# txt_writer.writerow(line)
# else:

# tab_output_cleanup = open(tab_output, 'w')
# tab_output_cleanup.truncate()
# tab_output_cleanup.close()

# with open(tab_output, 'a') as new_txt:
# txt_writer = csv.writer(new_txt, delimiter='\t')
# txt_writer.writerow(header_new)
# for line in csv_reader:
# txt_writer.writerow(line)

# def hxl2tab_header(self, hxlated_header):
def hxltm2csv_header(self, hxlated_header, fontem_linguam, objectivum_linguam):
Expand Down Expand Up @@ -460,6 +518,32 @@ def hxltm2csv_header(self, hxlated_header, fontem_linguam, objectivum_linguam):

return hxlated_header

def hxltm_item_relevant_options(item):
    """Return only the XLIFF-related columns of one HXLTM row.

    Args:
        item: Mapping of column header to cell value for a single row
            (for example a row produced by ``csv.DictReader``).

    Returns:
        A new dict holding every entry whose key starts with ``#x_xliff``.
        Cells containing the placeholder ``∅`` are stored as ``None``;
        every other value is copied unchanged.
    """
    item_neo = {}

    for clavis, valorem in item.items():
        # csv.DictReader stores the surplus cells of an over-long row under
        # the key None (its default restkey). A non-string key has no
        # startswith() and can never be an #x_xliff column, so skip it
        # instead of crashing on a ragged row.
        if not isinstance(clavis, str):
            continue
        if clavis.startswith('#x_xliff'):
            item_neo[clavis] = None if valorem == '∅' else valorem

    return item_neo

def hxltm_item_xliff_source_key(item):
    """Find the column name that carries the XLIFF source text.

    Args:
        item: Mapping whose keys are column names; keys are examined in
            the mapping's iteration order.

    Returns:
        The first key that starts with ``#x_xliff+source``, or ``None``
        when no key matches.
    """
    candidates = (
        clavis for clavis in item
        if clavis.startswith('#x_xliff+source')
    )
    return next(candidates, None)

def hxltm_item_xliff_target_key(item):
    """Find the column name that carries the XLIFF target text.

    Args:
        item: Mapping whose keys are column names; keys are examined in
            the mapping's iteration order.

    Returns:
        The first key that starts with ``#x_xliff+target``, or ``None``
        when no key matches.
    """
    praefixum = '#x_xliff+target'
    resultatum = None

    for clavis in item:
        if not clavis.startswith(praefixum):
            continue
        # Only the first matching column is reported.
        resultatum = clavis
        break

    return resultatum

def linguam_2_hxlattrs(linguam):
"""linguam_2_hxlattrs
Expand Down

0 comments on commit c54728f

Please sign in to comment.