In [6]:
import sys
import os
from json import dump, load

sys.path.append(os.path.abspath(os.path.join('..')))
from scripts.logger_creator import CreateLogger


In [7]:
# Module-level logger shared by every AlphabetsBuilder method below.
# The factory object is only needed long enough to hand back its default
# logger, so the two steps are chained instead of rebinding `logger` twice.
logger = CreateLogger('AlphabetsBuilder', handlers=1).get_default_logger()


In [8]:
class AlphabetsBuilder():
    """Build character <-> index lookup tables and save them as JSON.

    The alphabet can come from one of two places, selected by
    ``alphabets_type``:

    * ``1`` -- a hard-coded list of Amharic characters.
    * anything else -- every character that occurs in the train and test
      transcription label files.

    In both cases the token ``'<space>'`` is placed at index 0.

    Attributes:
        file_name: Path the generated dictionary is written to.
        alphabets_type: Alphabet source selector (see above).
        train_labels: Path to the JSON file holding the train labels.
        test_labels: Path to the JSON file holding the test labels.
        supported_alphabets: Ordered list of alphabet tokens; empty until
            ``get_supported_alphabets`` succeeds.
        alphabets_data: Dict receiving ``char_to_num``, ``num_to_char`` and
            ``alphabet_size``.
    """

    def __init__(self,file_name: str, alphabets_type: int = 2, train_labels: str = '../data/train_labels.json', test_labels: str = '../data/test_labels.json') -> None:
        """Store the configuration; no file is read or written here.

        Args:
            file_name: Destination path for the generated JSON dictionary.
            alphabets_type: ``1`` for the hard-coded Amharic list, any other
                value to derive the alphabet from the label files.
            train_labels: Path to the train labels JSON file.
            test_labels: Path to the test labels JSON file.
        """
        self.file_name = file_name
        self.alphabets_type = alphabets_type
        self.train_labels = train_labels
        self.test_labels = test_labels
        # Defined up front so later steps can detect "nothing retrieved yet"
        # instead of failing with an AttributeError.
        self.supported_alphabets = []
        self.alphabets_data = {}
        logger.info('Successfully Created Alphabets Builder Class Object')

    def get_supported_alphabets(self):
        """Populate ``self.supported_alphabets``.

        On failure the exception is logged (not re-raised) and
        ``self.supported_alphabets`` is left as an empty list.
        """
        # Reset first so a failed run never leaves a stale list behind.
        self.supported_alphabets = []
        try:
            # Method 1
            # Consider the entire Amharic alphabet
            if self.alphabets_type == 1:
                # Defining Entire Amharic Alphabets
                listed_alphabets = """
                    ሀ ሁ ሂ ሃ ሄ ህ ሆ ለ ሉ ሊ ላ ሌ ል ሎ ሏ ሐ ሑ ሒ ሓ ሔ ሕ ሖ ሗ መ ሙ ሚ ማ ሜ ም ሞ ሟ ሠ ሡ ሢ ሣ ሤ ሥ ሦ ሧ
                    ረ ሩ ሪ ራ ሬ ር ሮ ሯ ሰ ሱ ሲ ሳ ሴ ስ ሶ ሷ ሸ ሹ ሺ ሻ ሼ ሽ ሾ ሿ ቀ ቁ ቂ ቃ ቄ ቅ ቆ ቇ ቋ ቐ ቐ ቑ ቒ ቓ ቔ ቕ ቖ
                    በ ቡ ቢ ባ ቤ ብ ቦ ቧ ቨ ቩ ቪ ቫ ቬ ቭ ቮ ቯ ተ ቱ ቲ ታ ቴ ት ቶ ቷ ቸ ቹ ቺ ቻ ቼ ች ቾ ቿ ኀ ኁ ኂ ኃ ኄ ኅ ኆ ኇ ኋ
                    ነ ኑ ኒ ና ኔ ን ጓ ኖ ኗ ኘ ኙ ኚ ኛ ኜ ኝ ኞ ኟ አ ኡ ኢ ኣ ኤ እ ኦ ኧ ከ ኩ ኪ ካ ኬ ክ ኮ ኯ ኰ ኳ ኲ
                    ኸ ኹ ኺ ኻ ኼ ኽ ኾ ወ ዉ ዊ ዋ ዌ ው ዎ ዐ ዑ ዒ ዓ ዔ ዕ ዖ ዘ ዙ ዚ ዛ ዜ ዝ ዞ ዟ ዠ ዡ ዢ ዣ ዤ ዥ ዦ ዧ
                    የ ዩ ዪ ያ ዬ ይ ዮ ዯ ደ ዱ ዲ ዳ ዴ ድ ዶ ዷ ጀ ጁ ጂ ጃ ጄ ጅ ጆ ጇ ገ ጉ ጊ ጋ ጌ ግ ጐ ጎ ጏ ጔ ጠ ጡ ጢ ጣ ጤ ጥ ጦ ጧ ጨ ጩ ጪ ጫ ጬ ጭ ጮ ጯ
                    ጰ ጱ ጲ ጳ ጴ ጵ ጶ ጷ ጸ ጹ ጺ ጻ ጼ ጽ ጾ ጿ ፀ ፁ ፂ ፃ ፄ ፅ ፆ ፇ ፈ ፉ ፊ ፋ ፌ ፍ ፎ ፏ ፐ ፑ ፒ ፓ ፔ ፕ ፖ ፗ
                """.split()
                # The literal above repeats a character (e.g. 'ቐ'); a repeated
                # entry would leave a gap in the index mapping, so drop
                # repeats while keeping first-seen order.
                alphabets = list(dict.fromkeys(listed_alphabets))
                # Adding space
                alphabets.insert(0, '<space>')
                self.supported_alphabets = alphabets

                logger.info('Successfully retrieved alphabets from the entire Amharic Language')

            else:
                # Method 2
                # Consider characters only from the train and test transcriptions
                # Reading Train Labels
                with open(self.train_labels, 'r', encoding='UTF-8') as label_file:
                    train_labels = load(label_file)
                # Reading Test Labels
                with open(self.test_labels, 'r', encoding='UTF-8') as label_file:
                    test_labels = load(label_file)

                # Collect every distinct character used by any label
                char_set = set()
                for labels in (train_labels, test_labels):
                    for label in labels.values():
                        char_set.update(label)

                # The plain space is represented by the '<space>' token.
                # discard() is used because the labels may contain no space.
                char_set.discard(' ')
                # Sort so the character -> index mapping is identical on every
                # run; iterating a set of strings has no stable order.
                self.supported_alphabets = ['<space>'] + sorted(char_set)

                logger.info('Successfully retrieved alphabets from train and test transcriptions')

        except Exception:
            self.supported_alphabets = []
            logger.exception('Failed To retrieve supported alphabets')

    def construct_conversion_dicts(self):
        """Fill ``self.alphabets_data`` from ``self.supported_alphabets``.

        Writes three keys: ``char_to_num`` (token -> index), ``num_to_char``
        (index -> token) and ``alphabet_size``. Failures are logged, not
        re-raised.
        """
        try:
            # Defensive de-duplication: guarantees contiguous indices and an
            # alphabet_size that matches both dictionaries.
            unique_alphabets = list(dict.fromkeys(self.supported_alphabets))

            # Alphabet to num conversion dict
            alphabet_to_num = {alphabet: index for index, alphabet in enumerate(unique_alphabets)}
            # Reverse (num to alphabet) conversion dict
            num_to_alphabet = {index: alphabet for alphabet, index in alphabet_to_num.items()}

            self.alphabets_data['char_to_num'] = alphabet_to_num
            self.alphabets_data['num_to_char'] = num_to_alphabet
            self.alphabets_data['alphabet_size'] = len(unique_alphabets)

            logger.info('Successfully constructed conversion dictionaries')

        except Exception:
            logger.exception('Failed to construct conversion dictionaries')

    def save_alphabets_dict(self):
        """Write ``self.alphabets_data`` to ``self.file_name`` as UTF-8 JSON.

        Failures are logged, not re-raised.
        """
        try:
            with open(self.file_name, "w", encoding='UTF-8') as export_file:
                # ensure_ascii=False keeps the characters readable in the file
                dump(self.alphabets_data, export_file, indent=4, sort_keys=True, ensure_ascii=False)

            logger.info(f'Successfuly Saved Generated Alphabets Dictionary in: {self.file_name}')

        except Exception:
            logger.exception('Failed to Save Generated Alphabets Dictionary')

    def generate_and_save_alphabets(self):
        """Run retrieval, dictionary construction and saving in order.

        The pipeline stops at the first step that produced nothing, so a
        failed retrieval can no longer overwrite the output file with an
        empty dictionary.
        """
        self.get_supported_alphabets()
        if not self.supported_alphabets:
            logger.error('Skipping conversion and save: no supported alphabets were retrieved')
            return

        self.construct_conversion_dicts()
        if not self.alphabets_data.get('char_to_num'):
            logger.error('Skipping save: conversion dictionaries were not constructed')
            return

        self.save_alphabets_dict()


In [10]:
# Only the output path is given; alphabets_type and the label-file paths
# fall back to the defaults declared in AlphabetsBuilder.__init__.
alphabet_builder = AlphabetsBuilder(file_name='../data/alphabets_data.json')


AlphabetsBuilder:INFO->Successfully Created Alphabets Builder Class Object
AlphabetsBuilder:INFO->Successfully Created Alphabets Builder Class Object


In [11]:
# Runs the three builder steps in order: get_supported_alphabets,
# construct_conversion_dicts, then save_alphabets_dict.
alphabet_builder.generate_and_save_alphabets()


AlphabetsBuilder:INFO->Successfully retrieved alphabets from train and test transcriptions
AlphabetsBuilder:INFO->Successfully retrieved alphabets from train and test transcriptions
AlphabetsBuilder:INFO->Successfully constructed conversion dictionaries
AlphabetsBuilder:INFO->Successfully constructed conversion dictionaries
AlphabetsBuilder:INFO->Successfuly Saved Generated Alphabets Dictionary in: ../data/alphabets_data.json
AlphabetsBuilder:INFO->Successfuly Saved Generated Alphabets Dictionary in: ../data/alphabets_data.json
