In [None]:
import os
import re
from typing import Optional

# NOISE INJECTION for translation

In [None]:
def strings_to_be_replaced(input_string: str, regex: str = r"\[\w+\]"):
    """
        Return the distinct matches of ``regex`` found in ``input_string``.

        Args:
            input_string: str: The text to scan
            regex: str: The pattern to search for; the default matches
                bracketed word tokens such as ``[KEY]``

        Returns:
            A list of unique matches, ordered by first occurrence in the text.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is identical on every run (a set's iteration order for strings is not).
    return list(dict.fromkeys(re.findall(regex, input_string)))

In [None]:
def read_file(file_path: str, encoding: str = "utf-8") -> str:
    """
        Read a whole text file and return its content.

        Args:
            file_path: str: Path of the file to read
            encoding: str: Text encoding used to decode the file. Defaults to
                UTF-8 instead of the platform default so non-ASCII content is
                decoded the same way on every machine.

        Returns:
            The file content as a single string.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()

In [None]:
# Load the template as raw text (not parsed yet); the relative path is resolved
# against the notebook's current working directory.
json_str = read_file("../refined_template.json")

# Prints the entire file content for inspection.
print(json_str)

In [None]:
# Distinct tokens matching the default pattern of strings_to_be_replaced
# (bracketed word tokens) found in the raw template text.
strs_to_be_replaced = strings_to_be_replaced(json_str)

print(strs_to_be_replaced)

In [None]:
# Genre label -> Vietnamese display name.
# NOTE(review): in the cells reviewed here only the keys are consumed (they are
# appended to `strs_to_be_replaced`); the Vietnamese values are not referenced.
genres = {
    'New Age': "new age",
    'Electronic': "điện tử",
    'Rap': 'rap',
    'Religious': 'tôn giáo',
    'International': 'quốc tế',
    'Easy_Listening': 'dễ nghe',
    'Avant_Garde': 'avant-garde',
    'RnB': 'RnB',
    'Latin': 'Latin',
    'Children': 'trẻ em',
    'Jazz': 'jazz',
    'Classical': 'cổ điển',
    'Comedy_Spoken': 'hài kịch',
    'Pop_Rock': 'pop',
    'Reggae': 'reggae',
    'Stage': 'stage',
    'Folk': 'dân ca',
    'Blues': 'blues',
    'Vocal': 'vocal',
    'Holiday': 'holiday',
    'Country': 'đồng quê',
    "Symphony": 'giao hưởng'
}

# Add the genre labels to the strings that get encoded. Labels are appended in
# the dict's insertion order and skipped when already present, so re-running
# this cell does not pile up duplicates and the order is the same on every run.
strs_to_be_replaced.extend(
    [genre for genre in genres if genre not in strs_to_be_replaced]
)

print(strs_to_be_replaced)

In [None]:
import itertools

# Encoded form of every string: each character is followed by the last digit of
# its index in the original string. Bracket characters are dropped (their
# indices still count) and the result is wrapped in one pair of brackets.
replacements = {
    original: "[" + "".join(
        f"{char}{position % 10}"
        for position, char in enumerate(original)
        if char not in ("[", "]")
    ) + "]"
    for original in strs_to_be_replaced
}

# The dict collapses repeated keys, so differing counts reveal duplicates.
print(len(strs_to_be_replaced))
print(len(replacements))
print(replacements)

In [None]:
# Inverse lookup: encoded token -> original string.
reversed_replacements = {
    encoded: original for original, encoded in replacements.items()
}
print(reversed_replacements)

In [None]:
def replace_by_mapper(
    text: str,
    replacement_list: list[tuple],
    pre_replacement_list: Optional[list[tuple]] = None,
    verbose: bool = True,
) -> str:
    """
        Apply literal ``str.replace`` substitutions to ``text``, one after another.

        Pairs are applied in list order and each one sees the output of the
        previous one, so the order of the pairs matters.

        Args:
            text: str: The text to be replaced
            replacement_list: list[tuple]: ``(old, new)`` pairs of the main pass
            pre_replacement_list: list[tuple] | None: ``(old, new)`` pairs applied
                before the main pass; ``None`` (the default) means no pre-pass
            verbose: bool: Print one line per applied pair (the original
                behaviour). Pass ``False`` to silence the output, e.g. when the
                function is mapped over many rows.

        Returns:
            The text after all substitutions.
    """
    # `None` replaces the former mutable `[]` default; `or ()` keeps the loop a no-op.
    for k, v in pre_replacement_list or ():
        if verbose:
            print(f"replace_by_mapper: pre-replacements: Replacing {k} with {v}")
        text = text.replace(k, v)

    for k, v in replacement_list:
        if verbose:
            print(f"replace_by_mapper: Replacing {k} with {v}")
        text = text.replace(k, v)

    return text

In [None]:
import csv
import json

def json_str_to_csv_file(json_str: str, csv_file_path: str, func_to_apply = None, func_kwargs=None):
    '''
        Convert a JSON string to a CSV file

        Args:
            json_str: JSON text that parses to a sequence of objects (dicts)
            csv_file_path: Destination path of the CSV file (overwritten)
            func_to_apply: Optional callable applied to the raw JSON text
                before it is parsed; it must return the text to parse
            func_kwargs: Optional keyword arguments forwarded to
                ``func_to_apply``; ``None`` means no extra arguments

        The header row is taken from the keys of the first record; each record
        then contributes one row made of its values in its own key order.
    '''
    if func_to_apply is not None:
        # `or {}` lets callers pass a function without kwargs (unpacking None raises TypeError)
        json_str = func_to_apply(json_str, **(func_kwargs or {}))

    records = json.loads(json_str)

    # newline="" is what the csv module asks for on writer files (otherwise extra
    # blank lines can appear on Windows); UTF-8 keeps non-ASCII text portable.
    with open(csv_file_path, "w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.writer(csv_file)

        for index, item in enumerate(records):
            if index == 0:
                csv_writer.writerow(item.keys())

            csv_writer.writerow(item.values())

In [None]:
import pandas as pd

def xlsx_file_to_json_file(
    xlsx_file_path: str,
    json_file_path: str,
    func_to_apply = None,
    func_kwargs=None
):
    '''
        Convert a XLSX file to a JSON file

        Reads the ``attributes`` and ``translation`` columns of the sheet,
        renames ``translation`` to ``response``, turns every ``attributes`` cell
        into a list of strings and writes the rows as a JSON array of records.

        Args:
            xlsx_file_path: Path of the spreadsheet to read
            json_file_path: Destination path of the JSON file (overwritten, UTF-8)
            func_to_apply: Optional callable applied to every ``response`` value
            func_kwargs: Optional keyword arguments forwarded to
                ``func_to_apply``; ``None`` means no extra arguments
    '''
    df = pd.read_excel(xlsx_file_path)

    # rename() returns a new frame, so the column assignments below never write
    # into a slice of `df`.
    records = df[['attributes', 'translation']].rename(columns={'translation': 'response'})

    # Make the attributes column a list: drop quotes and brackets, then split on
    # commas and/or whitespace. Commas are turned into spaces first so that a
    # cell like "['A', 'B']" yields ['A', 'B'] rather than ['A,', 'B'].
    records['attributes'] = records['attributes'].apply(
        lambda raw: raw.replace("'", "").replace("[", "").replace("]", "").replace(",", " ").split()
    )

    if func_to_apply is not None:
        # `or {}` lets callers pass a function without kwargs (unpacking None raises TypeError)
        records['response'] = records['response'].apply(func_to_apply, **(func_kwargs or {}))

    with open(json_file_path, "w", encoding="utf-8") as json_file:
        records.to_json(json_file, orient='records', force_ascii=False)

In [None]:
# Phrase table for music-theory terms.
# Each entry describes an English phrase as prefix + attribute + postfix (joined
# with spaces by academic_translation_encoder) together with the Vietnamese words
# that academic_translation_decoder places around the attribute.
# The list order is the order in which the phrases are later substituted with
# str.replace, so a phrase that contains another one (e.g. "octaves" / "octave",
# "... time signature" / "time signature") is listed before the shorter phrase.
attributes_with_academic_term = [
    {
        "attribute": "[KEY]",
        "prefix": "",
        "postfix": "key",
        "vietnamese_prefix": "",
        "vietnamese_postfix": "giọng"
    },
    {
        "attribute": "[TIME_SIGNATURE]",
        "prefix": "",
        "postfix": "time signature",
        "vietnamese_prefix": "",
        "vietnamese_postfix": "nhịp"
    },
    {
        "attribute": "[NUM_BARS]",
        "prefix": "",
        "postfix": "bars",
        "vietnamese_prefix": "ô nhịp",
        "vietnamese_postfix": ""
    },
    {
        "attribute": "[TIME_SIGNATURE]",
        "prefix": "time signature of",
        "postfix": "",
        "vietnamese_prefix": "",
        "vietnamese_postfix": "nhịp"
    },
    {
        "attribute": "",
        "prefix": "time signature",
        "postfix": "",
        "vietnamese_prefix": "nhịp",
        "vietnamese_postfix": ""
    },
    {
        "attribute": "",
        "prefix": "octaves",
        "postfix": "",
        "vietnamese_prefix": "quãng tám",
        "vietnamese_postfix": ""
    },
    {
        "attribute": "",
        "prefix": "octave",
        "postfix": "",
        "vietnamese_prefix": "quãng tám",
        "vietnamese_postfix": ""
    },
    {
        "attribute": "",
        "prefix": "key",
        "postfix": "",
        "vietnamese_prefix": "giọng",
        "vietnamese_postfix": ""
    },
    {
        "attribute": "",
        "prefix": "minor",
        "postfix": "",
        "vietnamese_prefix": "thứ",
        "vietnamese_postfix": ""
    },
    {
        "attribute": "",
        "prefix": "major",
        "postfix": "",
        "vietnamese_prefix": "trưởng",
        "vietnamese_postfix": ""
    },
    {
        "attribute": "",
        "prefix": "tempo",
        "postfix": "",
        "vietnamese_prefix": "tốc độ",
        "vietnamese_postfix": ""
    }
]

In [None]:
def academic_translation_encoder(attribute_item):
    """
        Build the English phrase of a phrase-table entry and its encoded token.

        Args:
            attribute_item: dict with the keys 'prefix', 'attribute' and 'postfix'

        Returns:
            A ``(phrase, token)`` tuple. ``phrase`` is prefix, attribute and
            postfix joined by single spaces and stripped; the attribute is
            swapped for its entry in the module-level ``replacements`` dict when
            it has one. ``token`` is the phrase cut into two-character pieces,
            each followed by the last digit of its piece number, wrapped in
            brackets.
    """
    attribute = attribute_item['attribute']
    pieces = [
        attribute_item['prefix'],
        replacements.get(attribute, attribute),
        attribute_item['postfix'],
    ]
    phrase = ' '.join(pieces).strip()

    chunk_width = 2
    chunks = [
        phrase[offset:offset + chunk_width]
        for offset in range(0, len(phrase), chunk_width)
    ]
    encoded = ''.join(
        f'{chunk}{number % 10}' for number, chunk in enumerate(chunks)
    )
    return phrase, f"[{encoded}]"

In [None]:
def academic_translation_decoder(academic_replacements):
    """
        Map every encoded phrase token to its Vietnamese phrase.

        Args:
            academic_replacements: dict whose values hold 'encoded_data' (the
                token) and 'attr_pair' (the phrase-table entry it came from)

        Returns:
            A dict ``{token: phrase}``. The phrase is the entry's
            'vietnamese_postfix', its attribute and its 'vietnamese_prefix', in
            that order, joined by single spaces and stripped. The attribute is
            swapped for its entry in the module-level ``reversed_replacements``
            dict when it has one, otherwise it is used unchanged.
    """
    decoded = {}
    for entry in academic_replacements.values():
        attr_item = entry["attr_pair"]
        attribute = attr_item["attribute"]
        words = [
            attr_item["vietnamese_postfix"],
            reversed_replacements.get(attribute, attribute),
            attr_item["vietnamese_prefix"],
        ]
        decoded[entry["encoded_data"]] = ' '.join(words).strip()
    return decoded

In [None]:
# Keyed by the plain English phrase; every value keeps the encoded token next to
# the phrase-table entry it was built from, which the decoder reads later.
academic_replacements = {
    phrase: {"encoded_data": token, "attr_pair": attr_pair}
    for attr_pair in attributes_with_academic_term
    for phrase, token in [academic_translation_encoder(attr_pair)]
}

print(academic_replacements)

# (phrase, token) pairs in the dict's insertion order.
adcademic_replacement_list = [
    (phrase, entry["encoded_data"])
    for phrase, entry in academic_replacements.items()
]

print(adcademic_replacement_list)

In [None]:
# Encoded phrase token -> Vietnamese phrase, as built by academic_translation_decoder.
reversed_academic_replacements = academic_translation_decoder(academic_replacements)
print(reversed_academic_replacements)

In [None]:
json_str = read_file("../refined_template.json")
csv_file_path = "../refined_template.csv"

# Plain string encodings come first and the phrase encodings second: the phrase
# keys are built from already-encoded attributes, so they only match after the
# first group has been applied.
replacement_list = [*replacements.items(), *adcademic_replacement_list]

print(replacement_list)

In [None]:
# Decoding pairs: encoded token -> original string ...
reversed_replacement_list = list(reversed_replacements.items())

# ... followed by encoded phrase token -> Vietnamese phrase.
reversed_adcademic_replacement_list = list(
    academic_translation_decoder(academic_replacements).items()
)

reversed_replacement_list += reversed_adcademic_replacement_list

print(reversed_replacement_list)

In [None]:
print("CONVERTING", "../refined_template.json", "TO", csv_file_path)

# replace_by_mapper runs on the raw JSON text with `replacement_list`; the
# resulting text is then parsed and written out as CSV.
json_str_to_csv_file(
    json_str, 
    csv_file_path, 
    replace_by_mapper, 
    func_kwargs={"replacement_list" : replacement_list}
)

In [None]:
# Show the absolute location of the CSV written above.
print(f"CSV file path: {os.path.abspath(csv_file_path)}")

# DENOISING

In [None]:
print(reversed_replacement_list)

# Repair pairs applied before decoding.
# - First pair: "[ke0y1]" is the token academic_translation_encoder produces for
#   the phrase "key"; "[key0y1]" is mapped back to it.
#   NOTE(review): presumably a spelling the translation step introduces - confirm.
# - Remaining pairs: every encoded token whose closing bracket has been replaced
#   by a space is mapped back to the intact token.
pre_replacement_list = [
    ("[key0y1]", "[ke0y1]"),
    *[
        (f"{encoded[:-1]} ", encoded)
        for encoded in reversed_replacements
    ],
]

print(pre_replacement_list)

In [None]:
!pwd

In [None]:
xlsx_file_path = "../refined_template_vn.xlsx"

print("CONVERTING", xlsx_file_path, "TO", "../refined_template_vn.json")

# replace_by_mapper is applied to every value of the `translation` column:
# `pre_replacement_list` runs first, then `reversed_replacement_list`.
xlsx_file_to_json_file(
    xlsx_file_path=xlsx_file_path, 
    json_file_path="../refined_template_vn.json",
    func_to_apply=replace_by_mapper,
    func_kwargs={
        "replacement_list" : reversed_replacement_list,
        "pre_replacement_list" : pre_replacement_list
    }
)

In [None]:
# Show the absolute location of the JSON file written above.
print(f"DENOISED file path: {os.path.abspath('../refined_template_vn.json')}")