# Fix Labels

## Some labels have letters or word fragments written as separate words

In [1]:
import sys
import os
from json import load, dump

sys.path.append(os.path.abspath(os.path.join('../scripts')))
from logger_creator import CreateLogger

In [2]:
# Build the 'LabelsFixer' logger once; the cells below log through it.
logger = CreateLogger('LabelsFixer', handlers=1).get_default_logger()


In [3]:
class LabelCleaner():
    """Re-attach stray word fragments in the train/test label files.

    Each label file is a JSON object mapping a key to a label string. In
    some labels a fragment of a word appears as a separate,
    whitespace-delimited token. Tokens listed in ``_PREFIX_WORDS`` are
    glued to the front of the next ordinary word; tokens listed in
    ``_SUFFIX_WORDS`` are glued to the end of the previous one.

    All I/O and cleaning steps log failures instead of raising, so the
    notebook keeps running; ``save_labels`` refuses to write when cleaning
    did not produce a result, so a failed run cannot clobber the label files.
    """

    # Fragments that belong to the FRONT of the following word.
    # Kept as a frozenset: built once, O(1) membership tests.
    _PREFIX_WORDS = frozenset([
        'እ', 'የ', "አይ", "ሲ", "አላ", 'እንዲ', 'ኰ', 'በ', 'ስለ', 'የሚ', 'ያ', 'አ', 'ለ', 'ከ', 'ተጉ',
        'ሳ', 'ጐረ', 'አል', 'እጀ', 'ባ', 'እንዳስ', 'በተ', 'ተና', 'እንደ', 'ሳይ', 'ንግስተ', 'ሊ',
        'የተ', 'ጠቁ', 'ተ', 'እያ', 'እን', 'ተሽ', 'አሳ', 'አከራ', 'አስራ', 'ለባለ', 'አለ', 'ከሚያ',
        'ካይ', 'እንዳል', 'ካ', 'ሊያ', 'ያመኑ', 'አሰባ', 'እንደሚ', 'እየ',
    ])

    # Fragments that belong to the END of the preceding word.
    _SUFFIX_WORDS = frozenset([
        'ን', "ም", "ና", "ያት", 'ው', 'ነዋል', 'ተው', 'መ', 'መና', 'ች', 'ማት', 'ተር', 'ኝ', 'ቱ',
        'ሎ', 'ት', 'ሁ', 'ጤ', 'ብ', 'ፋው', 'ዬ', 'ጉር', 'ጉ', 'ሯቸው', 'ወድ', 'ስ', 'ዬን', 'ጓጉ',
        'ቻት', 'ጔ', 'ወ', 'ሚ', 'ልሽ', 'ንም', 'ሺ', 'ኲ', 'ቷል', 'ዋል', 'ቸውን', 'ተኛ', 'ስት', 'ዎች',
        'ታል', 'ል', 'ዋጣ', 'ያችን', 'ችን', 'ውን', 'ስቶች', 'በታል', 'ነውን', 'ችል', 'ቸው', 'ባቸዋል', 'ሉት',
        'ላቸው', 'ተውናል', 'ችሏል', 'ዶች',
    ])

    def __init__(self, train_labels: str = '../data/train_labels.json', test_labels: str = '../data/test_labels.json') -> None:
        """Remember the label file paths; nothing is read until ``load_labels``.

        Args:
            train_labels: Path of the JSON file holding the training labels.
            test_labels: Path of the JSON file holding the test labels.
        """
        self.train_labels_path = train_labels
        self.test_labels_path = test_labels

        # Start every stage as "not available" so that a failed earlier
        # stage is detectable later instead of raising AttributeError.
        self.train_labels = None
        self.test_labels = None
        self.train_cleaned_labels = None
        self.test_cleaned_labels = None

        logger.info('Successfully Created Label Cleaner Class Object')

    def load_labels(self):
        """Read both label files into ``train_labels`` / ``test_labels``.

        Failures (missing file, invalid JSON, ...) are logged, not raised;
        the corresponding attribute then keeps its previous value.
        """
        try:
            with open(self.train_labels_path, 'r', encoding='UTF-8') as label_file:
                self.train_labels = load(label_file)

            with open(self.test_labels_path, 'r', encoding='UTF-8') as label_file:
                self.test_labels = load(label_file)

            logger.info('Successfully Loaded Train and Test Label Files')

        except Exception:
            logger.exception('Failed to Load Labels')

    def clean_suffixes(self):
        """Clean the loaded train and test labels and keep the results.

        Either result is ``None`` when cleaning that set failed (for
        example because it was never loaded).
        """
        self.train_cleaned_labels = self.clean_labels_suffixes(self.train_labels)
        self.test_cleaned_labels = self.clean_labels_suffixes(self.test_labels)

    def save_labels(self, train_file_name: str = '../data/train_labels.json', test_file_name: str = '../data/test_labels.json') -> None:
        """Write the cleaned labels as pretty-printed, key-sorted UTF-8 JSON.

        The default targets are the same files the labels are loaded from,
        i.e. the originals are overwritten in place. To protect them,
        nothing is written unless BOTH cleaned label sets are available.

        Args:
            train_file_name: Destination for the cleaned training labels.
            test_file_name: Destination for the cleaned test labels.
        """
        if self.train_cleaned_labels is None or self.test_cleaned_labels is None:
            # Dumping None would replace the label file with the text "null".
            logger.error('No cleaned labels available; refusing to overwrite label files')
            return

        try:
            with open(train_file_name, "w", encoding='UTF-8') as export_file:
                dump(self.train_cleaned_labels, export_file, indent=4, sort_keys=True, ensure_ascii=False)

            with open(test_file_name, "w", encoding='UTF-8') as export_file:
                dump(self.test_cleaned_labels, export_file, indent=4, sort_keys=True, ensure_ascii=False)

            logger.info(f'Successfully Saved Cleaned Lables in: {train_file_name} and {test_file_name}')

        except Exception:
            logger.exception('Failed to Save Cleaned lables')

    @classmethod
    def _merge_fragments(cls, words):
        """Join prefix/suffix fragments in ``words`` and return the label text."""
        merged = []
        pending = ''  # fragments collected so far, waiting for a host word

        for word in words:
            if word in cls._PREFIX_WORDS:
                pending += word
            elif word in cls._SUFFIX_WORDS:
                if pending:
                    # A suffix right after a prefix: both wait for the next word.
                    pending += word
                elif merged:
                    merged[-1] += word
                else:
                    # Nothing to attach to yet; carry it forward instead.
                    pending = word
            else:
                merged.append(pending + word)
                pending = ''

        if pending:
            # The label ended on fragments: attach them to the last word, or
            # keep them as a word of their own, rather than dropping text.
            if merged:
                merged[-1] += pending
            else:
                merged.append(pending)

        return ' '.join(merged)

    def clean_labels_suffixes(self, label_dict: dict):
        """Return a copy of ``label_dict`` with fragments merged into words.

        Args:
            label_dict: Mapping of key -> label string. Labels are split on
                whitespace before merging.

        Returns:
            A new dict with the same keys and cleaned label strings, or
            ``None`` if cleaning failed (the failure is logged).
        """
        try:
            cleaned_labels = {
                key: self._merge_fragments(label.split())
                for key, label in label_dict.items()
            }

            logger.info('Successfully Cleaned Label Suffixes')

            return cleaned_labels

        except Exception:
            logger.exception('Failed To Clean Labels')
            return None

    def clean_and_save(self):
        """Run the whole pipeline: load, clean, then save to the default paths."""
        self.load_labels()
        self.clean_suffixes()
        self.save_labels()

In [4]:
# Use the default train/test label paths under ../data/.
label_cleaner = LabelCleaner()

LabelsFixer:INFO->Successfully Created Label Cleaner Class Object


In [5]:
# Load both label files, merge the stray fragments, and write the result back.
# NOTE: the default save paths equal the load paths, so the files are overwritten in place.
label_cleaner.clean_and_save()

LabelsFixer:INFO->Successfully Loaded Train and Test Label Files
LabelsFixer:INFO->Successfully Cleaned Label Suffixes
LabelsFixer:INFO->Successfully Cleaned Label Suffixes
LabelsFixer:INFO->Successfully Saved Cleaned Lables in: ../data/train_labels.json and ../data/test_labels.json
