## Labelling a Subset of Dataset in CoNLL Format (Task 2)
-- an alternate and efficient way --

In [1]:
# import needed libraries for labelling
import pandas as pd
import re

In [2]:
# Read the CSV file
df = pd.read_csv(r'E:\KAIM 2\KAIM 2 Week 5\Week5\data\cleaned_data.csv')

In [3]:
# Function to tokenize Amharic text
def tokenize(text):
    # Split on whitespace and punctuation
    return re.findall(r'\w+|[^\w\s]', text)

In [4]:
# Function to label entities
def label_entities(tokens):
    labels = ['O'] * len(tokens)
    i = 0
    while i < len(tokens):
        # Product labeling
        if tokens[i].lower() in ['baby', 'የልጆች', 'የሲልከን']:
            labels[i] = 'B-Product'
            j = i + 1
            while j < len(tokens) and tokens[j].lower() not in ['0909003864', '0905707448', 'birr', 'ብር']:
                labels[j] = 'I-Product'
                j += 1
            i = j
        # Price labeling
        elif tokens[i].isdigit() and i + 1 < len(tokens) and tokens[i+1].lower() in ['birr', 'ብር']:
            labels[i] = 'B-PRICE'
            labels[i+1] = 'I-PRICE'
            i += 2
        # Location labeling
        elif tokens[i] in ['ገርጂ', '4ኪሎ', 'ብስራተ']:
            labels[i] = 'B-LOC'
            j = i + 1
            while j < len(tokens) and tokens[j] not in ['0909003864', '0905707448', 'ብር']:
                labels[j] = 'I-LOC'
                j += 1
            i = j
        else:
            i += 1
    return labels

In [5]:
# Process messages and create CoNLL format output
output = []
for index, row in df.iterrows():
    if index >= 50:  # Process only 50 messages
        break
    
    message = row['Message']
    tokens = tokenize(message)
    labels = label_entities(tokens)
    
    for token, label in zip(tokens, labels):
        output.append(f"{token} {label}")
    output.append("")  # Empty line between messages

In [7]:
# Save to a text file
with open('E:\KAIM 2\KAIM 2 Week 5\Week5\data\labeled_data_conll.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(output))

print("Labeling complete. Results saved in 'labeled_data_conll.txt'")

Labeling complete. Results saved in 'labeled_data_conll.txt'


  with open('E:\KAIM 2\KAIM 2 Week 5\Week5\data\labeled_data_conll.txt', 'w', encoding='utf-8') as f:


In [8]:
# Attempting to read the file with utf-8 encoding
try:
    with open(r'E:\KAIM 2\KAIM 2 Week 5\Week5\data\labeled_data_conll.txt', 'r', encoding='utf-8') as file:
        lines = file.readlines()
    for line in lines:
        print(line.strip())

except UnicodeDecodeError as e:
    print(f"UnicodeDecodeError: {e}")
    # You can try another encoding if utf-8 fails
    with open(r'E:\KAIM 2\KAIM 2 Week 5\Week5\data\labeled_data_conll.txt', 'r', encoding='latin-1') as file:
        lines = file.readlines()
    for line in lines:
        print(line.strip())

ለኮንዶሚኒየም O
ለጠባብ O
ቤቶች O
ገላግሌ O
የሆነ O
ከንፁህ O
የሲልከን B-Product
ጥሬ I-Product
እቃ I-Product
የተሰራ I-Product
የልጆች I-Product
ማጠቢያ I-Product
ምስሉ I-Product
ላይ I-Product
እንደሚያዩት I-Product
መታጠፍ I-Product
መዘርጋት I-Product
የሚችል I-Product
3350ብር I-Product
ይደውሉልን I-Product
እርሶ I-Product
መምጣት I-Product
ባይመቾ I-Product
እኛ I-Product
ያሉበት I-Product
ድረስ I-Product
እናደርስሎታለን I-Product
ስልክ I-Product
0905707448 O
0909003864 O
ሲና O
የተመረጡና O
ጥራታቸውን O
የጠበቁ O
የልጆች B-Product
እቃ I-Product
አስመጪ I-Product
0909003864 O
0905707448 O
እቃ O
ለማዘዝ O
ከስር O
ያለውን O
ሊንኮች O
በመጫን O
ማዘዝ O
ትችላላቹ O
@ O
sinasinaye O
@ O
sinayalj2 O
አድራሻ O
1 O
⃣ O
ቁጥር1 O
ገርጂ B-LOC
ኢምፔሪያል I-LOC
ከሳሚ I-LOC
ህንፃ I-LOC
ጎን I-LOC
አልፎዝ I-LOC
ፕላዛ I-LOC
ግራውንድ I-LOC
ላይ I-LOC
እንደገቡ I-LOC
ያገኙናል I-LOC
2 I-LOC
⃣ I-LOC
ቁጥር2 I-LOC
4ኪሎ I-LOC
ቅድስት I-LOC
ስላሴ I-LOC
ህንፃ I-LOC
ማለትም I-LOC
ከብልፅግና I-LOC
ዋናፅፈት I-LOC
ቤት I-LOC
ህንፃ I-LOC
በስተ I-LOC
ቀኝ I-LOC
ባለው I-LOC
አስፓልት I-LOC
20ሜትር I-LOC
ዝቅ I-LOC
እንዳሉ I-LOC
ሀበሻ I-LOC
ኮፊ I-LOC
የሚገኝበት I-LOC
ቀይ I-LOC
ሸክላ I-LOC
ህንፃ I-LOC
2ተኛ I-LOC
ፎቅ I-L