mvp-HXLTM (#16), EticaAI/HXL-Data-Science-file-formats#19: hxltm2xliff, formato XML do XLIFF, quase MVP
HXL-CPLP · Jun 28, 2021 · c54728f · c54728f
1 parent d164727
commit c54728f
Show file tree

Hide file tree

Showing 3 changed files with 146 additions and 28 deletions.
diff --git a/_hxltm/.gitignore b/_hxltm/.gitignore
@@ -6,3 +6,4 @@
 *#
 out/*
 !out/.gitkeep
+*.zip
diff --git a/_hxltm/exemplum/README.md b/_hxltm/exemplum/README.md
@@ -285,3 +285,36 @@ sh /opt/okapi/tikal.sh -x _hxltm/out/schemam-un-htcds_eng-Latn--por-Latn.tsv -gs
 ##  >>> https://bitbucket.org/okapiframework/okapi/issues/1053/an-error-occurred-when-extracting-from-the
 ```
 
+
+
+<!--
+
+- /workspace/git/HXL-CPLP/Auxilium-Humanitarium-API/_hxltm/schemam-un-htcds-5items.tm.hxl.zip
+
+ ./_systema/programma/hxltm2xliff.py _hxltm/schemam-un-htcds.tm.hxl.csv
+'utf-8' codec can't decode byte 0xd0 in position 4094: unexpected end of data
+'utf-8' codec can't decode byte 0xd0 in position 4094: unexpected end of data
+'utf-8' codec can't decode byte 0xd0 in position 4094: unexpected end of data
+'utf-8' codec can't decode byte 0xd0 in position 4094: unexpected end of data
+'utf-8' codec can't decode byte 0xd0 in position 4094: unexpected end of data
+'utf-8' codec can't decode byte 0xd0 in position 4094: unexpected end of data
+'utf-8' codec can't decode byte 0xd0 in position 4094: unexpected end of data
+Traceback (most recent call last):
+  File "./_systema/programma/hxltm2xliff.py", line 766, in <module>
+    hxltm2xliff.execute_cli(args)
+  File "./_systema/programma/hxltm2xliff.py", line 185, in execute_cli
+    with self.hxlhelper.make_source(args, stdin) as source, \
+  File "./_systema/programma/hxltm2xliff.py", line 670, in make_source
+    input = self.make_input(args, stdin)
+  File "./_systema/programma/hxltm2xliff.py", line 689, in make_input
+    return hxl.io.make_input(
+  File "/home/fititnt/.local/lib/python3.8/site-packages/hxl/io.py", line 442, in make_input
+    return CSVInput(input, encoding=encoding)
+  File "/home/fititnt/.local/lib/python3.8/site-packages/hxl/io.py", line 763, in __init__
+    delimiter = CSVInput._detect_delimiter(input, encoding)
+  File "/home/fititnt/.local/lib/python3.8/site-packages/hxl/io.py", line 797, in _detect_delimiter
+    lines = re.split(r'\r?\n', sample)
+UnboundLocalError: local variable 'sample' referenced before assignment
+
+
+-->
diff --git a/_systema/programma/hxltm2xliff.py b/_systema/programma/hxltm2xliff.py
@@ -35,9 +35,12 @@
 # ==============================================================================
 
 # Tests
+# Exemplos: https://github.com/oasis-tcs/xliff-xliff-22/blob/master/xliff-21/test-suite/core/valid/allExtensions.xlf
 # ./_systema/programma/hxltm2xliff.py --help
 # ./_systema/programma/hxltm2xliff.py _hxltm/schemam-un-htcds.tm.hxl.csv
 # ./_systema/programma/hxltm2xliff.py _hxltm/schemam-un-htcds-5items.tm.hxl.csv
+# ./_systema/programma/hxltm2xliff.py _hxltm/schemam-un-htcds.tm.hxl.csv --fontem-linguam=eng-Latn
+
 
 __VERSION__ = "v0.7"
 
@@ -302,36 +305,91 @@ def hxltm2xliff(self, hxlated_input, tab_output, is_stdout, args):
         #       maybe csv.DictReader?
         #       @see https://www.geeksforgeeks.org/convert-csv-to-json-using-python/
         #       @see https://docs.python.org/3/library/csv.html#csv.DictReader
+        datum = []
 
         with open(hxlated_input, 'r') as csv_file:
-            csv_reader = csv.reader(csv_file)
-
-            # # Hotfix: skip first non-HXL header. Ideally I think the already
-            # # exported HXlated file should already save without headers.
-            # next(csv_reader)
-            header_original = next(csv_reader)
-            header_new = self.hxltm2csv_header(
-                header_original,
-                fontem_linguam=args.fontem_linguam,
-                objectivum_linguam=args.objectivum_linguam,
-            )
-
-            if is_stdout:
-                txt_writer = csv.writer(sys.stdout, delimiter='\t')
-                txt_writer.writerow(header_new)
-                for line in csv_reader:
-                    txt_writer.writerow(line)
-            else:
-
-                tab_output_cleanup = open(tab_output, 'w')
-                tab_output_cleanup.truncate()
-                tab_output_cleanup.close()
-
-                with open(tab_output, 'a') as new_txt:
-                    txt_writer = csv.writer(new_txt, delimiter='\t')
-                    txt_writer.writerow(header_new)
-                    for line in csv_reader:
-                        txt_writer.writerow(line)
+            csvReader = csv.DictReader(csv_file)
+
+            # Reduce each row (a dict) to its '#x_xliff' columns, mapping
+            # the '∅' placeholder to None, and append the result to datum
+            for item in csvReader:
+
+                datum.append(HXLTM2XLIFF.hxltm_item_relevant_options(item))
+
+        resultatum = []
+        resultatum.append('<?xml version="1.0"?>')
+        resultatum.append(
+            '<xliff xmlns="urn:oasis:names:tc:xliff:document:2.0" version="2.0" srcLang="en" trgLang="fr">')
+        resultatum.append('  <file id="f1">')
+
+        num = 0
+
+        for rem in datum:
+            num += 1
+            # unit_id = rem['#x_xliff+unit+id'] if rem.has_key('#x_xliff+unit+id') else num
+            unit_id = rem['#x_xliff+unit+id'] if rem['#x_xliff+unit+id'] else num
+            resultatum.append('      <unit id="' + unit_id + '">')
+
+            resultatum.append('        <segment>')
+
+            xsource = HXLTM2XLIFF.hxltm_item_xliff_source_key(rem)
+            if xsource:
+                if not rem[xsource]:
+                    resultatum.append('          <!-- ERROR source ' + unit_id + ', ' + xsource + '-->')
+                    print('ERROR:', unit_id, xsource)
+                    # continue
+                else:
+                    resultatum.append('          <source>' + rem[xsource] + '</source>')
+
+            xtarget = HXLTM2XLIFF.hxltm_item_xliff_target_key(rem)
+            if xtarget and rem[xtarget]:
+                resultatum.append('          <target>' + rem[xtarget] + '</target>')
+
+            resultatum.append('        </segment>')
+
+            resultatum.append('      </unit>')
+
+        resultatum.append('  </file>')
+        resultatum.append('</xliff>')
+
+        # print('datum', datum)
+        # print('')
+        # print('')
+        # print('resultatum')
+        # print('resultatum', resultatum)
+        for ln in resultatum:
+            print (ln)
+
+
+        # with open(hxlated_input, 'r') as csv_file:
+        #     csv_reader = csv.reader(csv_file)
+
+        #     # # Hotfix: skip first non-HXL header. Ideally I think the already
+        #     # # exported HXlated file should already save without headers.
+        #     # next(csv_reader)
+        #     header_original = next(csv_reader)
+        #     header_new = self.hxltm2csv_header(
+        #         header_original,
+        #         fontem_linguam=args.fontem_linguam,
+        #         objectivum_linguam=args.objectivum_linguam,
+        #     )
+
+        #     if is_stdout:
+        #         txt_writer = csv.writer(sys.stdout, delimiter='\t')
+        #         txt_writer.writerow(header_new)
+        #         for line in csv_reader:
+        #             txt_writer.writerow(line)
+        #     else:
+
+        #         tab_output_cleanup = open(tab_output, 'w')
+        #         tab_output_cleanup.truncate()
+        #         tab_output_cleanup.close()
+
+        #         with open(tab_output, 'a') as new_txt:
+        #             txt_writer = csv.writer(new_txt, delimiter='\t')
+        #             txt_writer.writerow(header_new)
+        #             for line in csv_reader:
+        #                 txt_writer.writerow(line)
 
     # def hxl2tab_header(self, hxlated_header):
     def hxltm2csv_header(self, hxlated_header, fontem_linguam, objectivum_linguam):
@@ -460,6 +518,32 @@ def hxltm2csv_header(self, hxlated_header, fontem_linguam, objectivum_linguam):
 
         return hxlated_header
 
+    def hxltm_item_relevant_options(item):
+        item_neo = {}
+
+        for k in item:
+            if k.startswith('#x_xliff'):
+                if item[k] == '∅':
+                    item_neo[k] = None
+                else:
+                    item_neo[k] = item[k]
+
+        return item_neo
+
+    def hxltm_item_xliff_source_key(item):
+        for k in item:
+            if k.startswith('#x_xliff+source'):
+                return k
+
+        return None
+
+    def hxltm_item_xliff_target_key(item):
+        for k in item:
+            if k.startswith('#x_xliff+target'):
+                return k
+
+        return None
+
     def linguam_2_hxlattrs(linguam):
         """linguam_2_hxlattrs