## Amharic Named Entity Recognition (NER) system

### Importing the necessary libraries

In [104]:
import pandas as pd
import demoji

from amseg.amharicNormalizer import AmharicNormalizer as normalizer
from amseg.amharicSegmenter import AmharicSegmenter



In [105]:
import warnings
# Install a blanket "ignore" filter: every warning category is silenced from
# this point on, including any pandas warnings raised by the cells below.
warnings.filterwarnings('ignore')

In [277]:
import regex as re
    
def remove_emoji(text):
    """Return *text* with emoji stripped, or *text* unchanged on failure.

    The stripping is delegated to ``demoji.replace`` with an empty
    replacement string. This is deliberately best-effort: if that call
    raises for any ordinary reason, the input is handed back untouched so
    one bad row does not abort a whole ``apply`` pass.

    Args:
        text: The message to clean.

    Returns:
        The result of ``demoji.replace(text, repl="")``, or ``text`` itself
        when that call raises.
    """
    try:
        return demoji.replace(text, repl="")
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate and a long-running
        # cell can be interrupted.
        return text

def normal_former(message):
    """Return *message* after passing it through ``normalizer.normalize``.

    ``normalizer`` is the ``AmharicNormalizer`` class imported from
    ``amseg`` at the top of the notebook; this is a thin wrapper around it.
    """
    return normalizer.normalize(message)

def tokenizer(message):
    """Split *message* into tokens and drop the ones not wanted in the corpus.

    Tokenisation is delegated to ``AmharicSegmenter.amharic_tokenizer``.
    A token is discarded when it is:

    * one of the listed punctuation marks,
    * a token that begins with an ASCII letter (``re.match`` anchors at the
      start of the string), or
    * one of a few hard-coded unwanted tokens.

    Args:
        message: The text to tokenise.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    skipped_punctuation = ('።', '፤', '፡', '!', '?', '፥', '፦', '፧', '(', ')', ',', '.', '-')
    skipped_tokens = ('@classy', 'ብርands', 'ብርandseller', '@sami_twa', '@kingsmarque')

    # The segmenter is built with two empty punctuation lists; punctuation is
    # filtered out below instead.
    amharic_segmenter = AmharicSegmenter([], [])

    kept_tokens = []
    for token in amharic_segmenter.amharic_tokenizer(message):
        if token in skipped_punctuation:
            continue
        if re.match(r'[a-zA-Z]', token):
            continue
        if token in skipped_tokens:
            continue
        kept_tokens.append(token)

    return kept_tokens

def label_conll_format(word_list):
    """Attach a rule-based NER tag to every token in *word_list*.

    Two trigger words open a fixed-length span:

    * ``"ዋጋ"`` is tagged ``B-Price`` and the next two tokens ``I-Price``.
    * ``"አድራሻ"`` is tagged ``O``; the next token is ``B-LOC`` and the three
      after that are ``I-LOC``.

    Tokens inside a span are consumed without being inspected, so a trigger
    word that falls inside another span does not open a span of its own.
    A span that runs past the end of the list is simply cut short. Every
    other token is tagged ``O``.

    Args:
        word_list: Sequence of tokens to label.

    Returns:
        A list of ``(token, label)`` tuples in input order.
    """
    # trigger token -> (label for the trigger, labels for the tokens after it)
    span_rules = {
        "ዋጋ": ("B-Price", ("I-Price", "I-Price")),
        "አድራሻ": ("O", ("B-LOC", "I-LOC", "I-LOC", "I-LOC")),
    }

    labeled_data = []
    total = len(word_list)
    position = 0
    while position < total:
        token = word_list[position]
        rule = span_rules.get(token)

        if rule is None:
            labeled_data.append((token, "O"))
            position += 1
            continue

        trigger_label, follower_labels = rule
        labeled_data.append((token, trigger_label))

        # Slicing clamps at the end of the list, so a truncated span only
        # labels the tokens that actually exist.
        first_follower = position + 1
        followers = word_list[first_follower:first_follower + len(follower_labels)]
        labeled_data.extend(zip(followers, follower_labels))

        position = first_follower + len(follower_labels)

    return labeled_data



#### Loading and cleaning data 

In [188]:
# Load the raw export; the cells below read its 'Message' column.
telegram_data = pd.read_csv(filepath_or_buffer='../data/telegram_data.csv')

In [190]:
# Keep only rows that have message text. The explicit copy makes the result
# an independent frame, so adding a column below writes to this frame rather
# than to a filtered view of `telegram_data`.
data_clean_message = telegram_data[telegram_data['Message'].notna()].copy()

# Strip emoji from every message (remove_emoji is best-effort per row).
data_clean_message['message_no_emoji'] = data_clean_message['Message'].apply(remove_emoji)

# Keep only messages that mention "Price". The match is literal (not a regex),
# and rows where the test cannot be evaluated count as non-matching.
# Copy again so later column assignments act on an independent frame.
data_clean_message = data_clean_message[
    data_clean_message['message_no_emoji'].str.contains("Price", regex=False, na=False)
].copy()

In [278]:
# Latin-script fragments found in the messages and the Amharic text that
# replaces them. Entries are applied in the order listed.
replacements = {
    'Price': 'ዋጋ',
    'Br' : ' ብር',
    'br ': ' ब'.replace('ब', 'ብር'),
    'Address': ' አድራሻ',
    'Addis Ababa': 'አዲስ አበባ',
    'HayaHulet': 'ሀያሁለት', 
}

# Series.replace(dict) only swaps cells whose *entire* value equals a key, so
# it never touches a keyword embedded in a longer message. Apply each entry
# as a literal substring replacement instead.
cleaned_messages = data_clean_message['message_no_emoji']
for latin_text, amharic_text in replacements.items():
    cleaned_messages = cleaned_messages.str.replace(latin_text, amharic_text, regex=False)

data_clean_message['message_no_emoji'] = cleaned_messages


In [279]:
# Persist just the cleaned message column (row index omitted) for the
# preprocessing cells that follow.
message_only = data_clean_message.loc[:, 'message_no_emoji']
message_only.to_csv(path_or_buf='../data/message.csv', index=False)

#### Preprocess text data

In [275]:
# Reload the cleaned messages that were written to disk above.
messages = pd.read_csv(filepath_or_buffer='../data/message.csv')

In [None]:
# message.csv is written from the 'message_no_emoji' column only, so a
# 'normalized' column is not present after reloading. Derive it with
# normal_former when it is missing; an existing column is used as-is.
if 'normalized' not in messages.columns:
    messages['normalized'] = messages['message_no_emoji'].apply(normal_former)

# Flatten every message into one token stream for labelling.
corpus = []
for normalized_text in messages['normalized']:
    corpus.extend(tokenizer(normalized_text))

In [None]:
# Tag the flat token stream with the rule-based labeller.
labeled_data = label_conll_format(corpus)

# One "token label" pair per line, with a trailing newline after the last one.
co_nll_output = [f"{word} {label}" for word, label in labeled_data]
co_nll_formatted_output = "\n".join(co_nll_output) + "\n"

# UTF-8 is given explicitly so the text is written the same way regardless of
# the platform's default encoding.
with open('../data/conll_output.txt', mode='w', encoding='utf-8') as f:
    f.write(co_nll_formatted_output)
