From c5405c075f73368ac2f72e3c10093881ecc63b2b Mon Sep 17 00:00:00 2001 From: Nicholas Othieno Date: Fri, 16 Jun 2023 17:44:36 -0400 Subject: [PATCH] A procedure and script to speed up translation of MariaDB error messages to a new language Instructions and a script to help speed up language translations for MariaDB are provided. The README.md provides instructions of a recommended workflow that involves: - Extracting the English language entries in the file errmsg-utf8.txt - Using online tools to do automatic translation. - Proof-reading the automatic translations. - Running the script insert_translations_into_errmsg.py that will take the original errmsg-utf8.txt file, the extracted english translations file and the new language translations file and generate a new file errmsg-utf8-with-new-language.txt that is a copy of errmsg-utf8.txt but with the new language entries inserted into it at the correct positions. All new code of the whole pull request, including one or several files that are either new files or modified ones, are contributed under the BSD-new license. I am contributing on behalf of my employer Amazon Web Services, Inc. 
--- sql/share/README.md | 51 ++++ sql/share/insert_translations_into_errmsg.py | 279 +++++++++++++++++++ 2 files changed, 330 insertions(+) create mode 100644 sql/share/README.md create mode 100755 sql/share/insert_translations_into_errmsg.py diff --git a/sql/share/README.md b/sql/share/README.md new file mode 100644 index 0000000000000..2ef8c90748f37 --- /dev/null +++ b/sql/share/README.md @@ -0,0 +1,51 @@ +## A quicker way for adding new language translations to the errmsg-utf8.txt file + +### Summary + +To generate a new language translation of MariaDB use the following pull request (PR) as a template for your work: +- https://github.com/MariaDB/server/pull/2676 + +You will notice as part of your translation work, you will have to add your language translations to the file `sql/share/errmsg-utf8.txt` which is found in the current directory. This file is long with many sections which can make the translation work tedious. In this README, we explain a procedure and provide a script `insert_translations_into_errmsg.py` that cuts down the amount of tedium in accomplishing the task. + +### Procedure +1. Start by grepping out all the english translations from errmsg-utf8.txt using the following grep command, and redirecting the output to a file: + + grep -P "^\s*eng\s" errmsg-utf8.txt > all_english_text_in_errmsg-utf8.txt + +2. Next use Google translate to obtain a translation of this file. Google translate provides the ability to upload whole files for translation. 
For example, this technique was used to obtain Swahili translations which yielded a file with output similar to the below (output is truncated for clarity): + + sw "hashchk" + sw "isamchk" + sw "LA" + sw "NDIYO" + sw "Haiwezi kuunda faili '% -.200s' (kosa: %M)" + sw "Haiwezi kuunda jedwali %`s.%`s (kosa: %M)" + sw "Haiwezi kuunda hifadhidata '% -.192s' (kosa: %M)" + sw "Haiwezi kuunda hifadhidata '% -.192s'; hifadhidata ipo" + +Note that Google translate removes the leading whitespace in the translation file it generates. DO NOT add that leading whitespace back! + +3. Give the translated file an appropriate name (e.g. `all_swahili_text_in_errmsg-utf8.txt`) and store it in the same directory with `errmsg-utf8.txt` and `all_english_text_in_errmsg-utf8.txt`. These 3 files will be used by the script insert_translations_into_errmsg.py. + +4. Proofread the auto-translations in the file you downloaded from Google translate. Note that Google might omit formatting information +that will cause the compilation of MariaDB to fail, so pay attention to these. + +5. Reintegrate these translations into errmsg-utf8.txt by running the insert_translations_into_errmsg.py script as follows: + + chmod ugo+x insert_translations_into_errmsg.py # Make the script executable if it is not. + + ./insert_translations_into_errmsg.py + + For example, for the Swahili translation, we ran the following: + + ./insert_translations_into_errmsg.py errmsg-utf8.txt all_english_text_in_errmsg-utf8.txt all_swahili_text_in_errmsg-utf8.txt + + The script uses the `errmsg-utf8.txt` file and the grepped English file to keep track of each new translation. It then creates a file in the same directory as `errmsg-utf8.txt` with the name `errmsg-utf8-with-new-language.txt`. + +6.
Check that the reintegration of the new translations into `errmsg-utf8-with-new-language.txt` went OK, and if it did, rename `errmsg-utf8-with-new-language.txt` to `errmsg-utf8.txt`: + + mv errmsg-utf8-with-new-language.txt errmsg-utf8.txt + +7. In the header of errmsg-utf8.txt make sure to add your language long form to short form mapping. E.g. for Swahili, add: + + swahili=sw diff --git a/sql/share/insert_translations_into_errmsg.py b/sql/share/insert_translations_into_errmsg.py new file mode 100755 index 0000000000000..6c5677362c218 --- /dev/null +++ b/sql/share/insert_translations_into_errmsg.py @@ -0,0 +1,279 @@ +#!/usr/bin/python3 +import pdb +import re +from dataclasses import dataclass +import bisect +import argparse +################################################################################ +# How this script works +# The script is mainly driven by a state machine that consumes input +# and produces output "record-by-record"in an iterator-like fashion. Coroutines, +# are used to consume each of the inputs only when they are needed for each +# state, assuring proper rate-matching as 3 input sources are utilized to +# determine the insertion point of the new language, and not all +# 3 inputs are consumed at the same rate. +# The following steps are performed by the script to insert translations +# of the new language into a copy of the errmsg-utf8.txt file: +# 1. Load the source file and map out the lines in a data structure +# 2. Start reading the source file line by line. +# 2.1 For each line you can be in +# 2.1.1 SEARCHING_FOR_NEXT_HEADER state +# - In this state, we continually search the incoming +# lines from the source file for a string starting +# with a series of capital letters (^[A-Z]+). +# - Write each line to the output file, which is a copy +# of 'errmsg-utf8.txt'. 
#              - Change the state to CALCULATE_INSERT_POINT if a string matching
#                the previous criteria is found
#              - Take the string starting with capitals and save it in
#                the current_header variable
#        2.1.2 CALCULATE_INSERT_POINT state
#              - Go to the data structure for the source file and,
#                using the current_header as a key, read out the
#                value part of the structure. The value part should be
#                a list.
#              - Find the insert point for the new language
#                error message based on the list from the previous step.
#              - Change state to PERFORM_INSERT
#        2.1.3 PERFORM_INSERT state
#              - Read the source file and copy out each line to the output
#                file (the copy of 'errmsg-utf8.txt').
#              - Continue reading the source file and checking if the
#                insert point has been reached. Once it has been reached
#                insert the new language in the output file.
#              - Change state to SEARCHING_FOR_NEXT_HEADER
################################################################################

from typing import Any  # used for the opaque fields of StateControlData


class SectionList(list):
    """A list of [language, translation] pairs for one error-message section.

    In addition to the list contents, it remembers at which positions comment
    lines appeared in the original section (``comment_locations``) so the
    insertion point can later be adjusted to account for them.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Positions of comment lines within the section, in section order.
        self.comment_locations = []


def read_file(filename):
    '''Read the whole file and return its contents as a single string.'''
    with open(filename, 'r') as f:
        data = f.read()
    return data


def obtain_key_value_from_translation_line(translation_match, line):
    '''Return a (language, translation) tuple for a matched translation line.

    For any other line type (comments, blank lines, ...) ``translation_match``
    is None and a ('#', line) tuple is returned so the caller can recognise
    the line as a non-translation and still keep its text.
    '''
    if translation_match:
        return translation_match.groups()
    # Some other line type that is not a language and its translation:
    # return it as-is with the hash character as the key.
    return '#', line


def map_out_source_data(data):
    '''
    Load the source error message file into a navigable data structure.
    A dict of SectionList objects (keyed by section title) is used so the
    source order of translations within each section is not disrupted.
    '''
    # Use a regex to split the data into sections; each section starts with
    # an error-symbol header at the beginning of a line (e.g. ER_...).
    sections = re.split(r'\n(?=[A-Z])', data)
    # Create a dictionary to store the processed data.
    data_dict = {}
    for section in sections:
        # Skip preamble chunks that do not start with a section header.
        if not re.match(r'^[A-Z]+', section):
            continue
        lines = section.split('\n')
        # The title of the section is the first line.
        title = lines[0].strip()
        # Key-value pairs and comment positions found in this section.
        section_list = []
        comment_list = []
        current_line_loc = 0
        # Process each line (except the title).
        for line in lines[1:]:
            # Split the line into a language key and the translation value.
            translation_match = re.match(r'\s*([a-z\-]+) \"(.*)\"', line)
            key, value = obtain_key_value_from_translation_line(translation_match, line)
            if key != '#':
                section_list.append([key, value])
            elif '#' in value:
                # Current line in the file is a comment; keep track of its
                # location within the original section.
                comment_list.append(current_line_loc)
            current_line_loc += 1
        section_list_with_attributes = SectionList(section_list)
        section_list_with_attributes.comment_locations = comment_list.copy()
        # Add the section list to the main mapping.
        data_dict[title] = section_list_with_attributes
    return data_dict


def single_file_reader(input_file_name):
    '''Generator that yields the given file line by line.'''
    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            yield line


def single_file_writer(output_file_name):
    '''Coroutine that writes every line sent to it into the given file.

    The caller must prime it with next() before the first send().
    '''
    with open(output_file_name, 'w') as output_file:
        while True:
            line = yield
            output_file.write(line)


def double_file_reader(file1, file2):
    '''Generator yielding matching line pairs from two files in lockstep.'''
    with open(file1, 'r') as f1, open(file2, 'r') as f2:
        for line1, line2 in zip(f1, f2):
            yield (line1, line2)


def detect_language(file_name):
    '''Return the language code: the first token of the file's first line.'''
    with open(file_name, 'r') as f:
        first_line = f.readline()
    lang = first_line.split()[0]
    return lang


def detect_leading_whitespace_from_source_lang_file(file_name):
    '''Return the leading whitespace of the first line of the given file.'''
    with open(file_name, 'r') as f:
        first_line = f.readline()
    whitespace = first_line[:len(first_line) - len(first_line.lstrip())]
    return whitespace


@dataclass
class StateControlData:
    """Class for keeping track of state machine information."""
    current_state: str = ''
    current_header: str = ''
    detected_dest_lang: str = ''
    whitespace: str = ''
    insert_point_index: int = 0
    stop_state_machine: bool = False
    mapped_input_data: Any = None
    input_reader: Any = None
    output_writer: Any = None
    eng_to_new_lang_translation_mapper: Any = None


def searching_for_next_header_action(state_machine_data):
    '''SEARCHING_FOR_NEXT_HEADER state: copy lines through to the output
    until a section header (a line starting with capital letters) is found,
    then switch to CALCULATE_INSERT_POINT.  When the input is exhausted the
    state machine is stopped.'''
    for input_line in state_machine_data.input_reader:
        if re.match(r'^[A-Z]+', input_line):
            state_machine_data.current_header = input_line.strip()
            state_machine_data.current_state = "CALCULATE_INSERT_POINT"
            state_machine_data.output_writer.send(input_line)
            break
        state_machine_data.output_writer.send(input_line)
    else:
        # The source file has been fully consumed.
        state_machine_data.stop_state_machine = True

    return state_machine_data


def calculate_insert_point_action(state_machine_data):
    '''CALCULATE_INSERT_POINT state: determine where the new language sorts
    into the current section's alphabetically ordered translation list, then
    switch to PERFORM_INSERT.'''
    detected_dest_lang = state_machine_data.detected_dest_lang
    current_header = state_machine_data.current_header

    old_lang_list = state_machine_data.mapped_input_data[current_header]

    # Determine the spot where the new translation should fit in
    # the list of translations.
    index = bisect.bisect([lang for lang, _ in old_lang_list], detected_dest_lang)

    state_machine_data.insert_point_index = index
    state_machine_data.current_state = "PERFORM_INSERT"

    return state_machine_data


def finding_insert_point_action(state_machine_data):
    '''PERFORM_INSERT state: copy the current section to the output file,
    inserting the new language translation at the calculated position, then
    switch back to SEARCHING_FOR_NEXT_HEADER.'''
    def adjust_for_comments_occurring_before_insert_point(insert_point_index, comment_locations):
        # Every comment line that appears at or before the insertion point
        # shifts the insertion point down by one line.
        for comment_loc in comment_locations:
            if comment_loc <= insert_point_index:
                insert_point_index += 1
        return insert_point_index

    eng_to_new_lang_tuple = next(state_machine_data.eng_to_new_lang_translation_mapper)
    current_header = state_machine_data.current_header
    old_lang_list = state_machine_data.mapped_input_data[current_header]
    index = adjust_for_comments_occurring_before_insert_point(
        state_machine_data.insert_point_index, old_lang_list.comment_locations)
    detected_whitespace = state_machine_data.whitespace

    for i, elem in enumerate(old_lang_list):
        if index == i:
            state_machine_data.output_writer.send(detected_whitespace + eng_to_new_lang_tuple[1])

        input_line = next(state_machine_data.input_reader, None)
        if input_line is None:
            # The source file ended in the middle of a section: the three
            # input files are out of sync and we cannot continue safely.
            raise RuntimeError(
                'Unexpected end of errmsg file inside section ' + current_header)
        state_machine_data.output_writer.send(input_line)

    # New language sorts after every existing one: place it last.
    if index >= len(old_lang_list):
        state_machine_data.output_writer.send(detected_whitespace + eng_to_new_lang_tuple[1])

    state_machine_data.current_state = "SEARCHING_FOR_NEXT_HEADER"
    return state_machine_data


def language_inserter(data_dict, english_lang_translations_file, new_lang_translations_file, errmsg_file='errmsg-utf8.txt'):
    '''
    Inserts the new language into a copy of errmsg-utf8.txt, using a state machine to
    keep track of what step it is to take.  Coroutines are used to keep control flow
    tractable when dealing with 4 separate files.

    ``errmsg_file`` is the path of the source error-message file; it defaults
    to 'errmsg-utf8.txt' in the current directory for backward compatibility.
    '''
    state_machine = {
        "SEARCHING_FOR_NEXT_HEADER": searching_for_next_header_action,
        "CALCULATE_INSERT_POINT": calculate_insert_point_action,
        "PERFORM_INSERT": finding_insert_point_action
    }

    state_machine_data = StateControlData()

    state_machine_data.output_writer = single_file_writer('errmsg-utf8-with-new-language.txt')
    # Prime the writer coroutine so it is ready to accept lines via send().
    next(state_machine_data.output_writer)
    # Read from the errmsg file that was actually supplied instead of always
    # using a hard-coded path.
    state_machine_data.input_reader = single_file_reader(errmsg_file)
    state_machine_data.eng_to_new_lang_translation_mapper = double_file_reader(english_lang_translations_file, new_lang_translations_file)

    state_machine_data.detected_dest_lang = detect_language(new_lang_translations_file)
    state_machine_data.whitespace = detect_leading_whitespace_from_source_lang_file(english_lang_translations_file)
    state_machine_data.current_header = ''
    state_machine_data.current_state = "SEARCHING_FOR_NEXT_HEADER"
    state_machine_data.mapped_input_data = data_dict

    while not state_machine_data.stop_state_machine:
        current_state = state_machine_data.current_state
        state_machine_data = state_machine[current_state](state_machine_data)


def main():
    '''Parse the command-line arguments and run the insertion process.'''
    parser = argparse.ArgumentParser(description='''Given errmsg-utf8.txt,
    an english language file extracted from errmsg-utf8.txt and another
    file with translations into a new language from the english language
    file, reinsert the new language translations into their correct
    positions in a copy of errmsg-utf8.txt.''')
    parser.add_argument('errmsg_file', type=str, help='Path to errmsg-utf8.txt')
    parser.add_argument('english_lang_translations_file', type=str, help='Path to English lang translations file')
    parser.add_argument('new_lang_translations_file', type=str, help='Path to new lang translations file')

    args = parser.parse_args()
    errmsg_file = args.errmsg_file
    english_lang_translations_file = args.english_lang_translations_file
    new_lang_translations_file = args.new_lang_translations_file

    data = read_file(errmsg_file)
    data_dict = map_out_source_data(data)
    print('Original file errmsg-utf8.txt has been successfully mapped into memory.')
    print('''Now starting insertion process into errmsg-utf8-with-new-language.txt which is
    a copy of errmsg-utf8.txt''')

    language_inserter(data_dict, english_lang_translations_file, new_lang_translations_file, errmsg_file)
    print("Insertion of new language translations into errmsg-utf8-with-new-language.txt is done")


# call the main function
if __name__ == "__main__":
    main()