# Load Raw Data

In [6]:
# Read the labelled token file into memory, one list entry per physical line.
# Line terminators are kept on each entry.
file_path = '../data/labels/labeled_telegram_product_price_location.txt'
with open(file_path, encoding='utf-8') as f:
    raw_lines = list(f)

# Echo the head of the file so the "token label" layout can be eyeballed.
print("First 20 lines:\n")
print(*raw_lines[:20], sep="")

First 20 lines:

3pcs B-PRODUCT
silicon I-PRODUCT
brush I-PRODUCT
spatulas I-PRODUCT
እስከ O
260°c O
ሙቀት O
መቆቆም O
የሚችል O
ዋጋ-550ብር I-PRICE
አድራሻ O
ቁ.1 O
ስሪ O
ኤም O
ሲቲ O
ሞል O
ሁለተኛ O
ፎቅ O
ቢሮ O
ቁ. O



# Clean and Structure to CoNLL

In [7]:
skip_tokens = ["", "\n"]  # defined here but not consulted by the conversion below


def _as_conll_row(raw_line):
    """Convert one raw line into a tab-separated ``token<TAB>label`` row.

    The line is stripped and split on the first run of whitespace:

    * nothing left after stripping -> ``""`` (kept as a blank row, which the
      pipeline treats as a message separator);
    * two fields -> the first is the token, the remainder is the label;
    * one field -> the token is given the default label ``O``.

    NOTE(review): labels are passed through unchanged; BIO consistency
    (e.g. an ``I-`` tag with no preceding ``B-``) is not checked here.
    """
    fields = raw_line.strip().split(maxsplit=1)
    if not fields:
        return ""
    word = fields[0]
    tag = fields[1] if len(fields) == 2 else "O"
    return f"{word}\t{tag}"


# One output row per input line, in the original order.
conll_lines = [_as_conll_row(raw_line) for raw_line in raw_lines]


# Save Clean CoNLL File

In [8]:
output_path = "../data/labels/conll_labeled_subset.txt"

# Rows are separated by "\n"; nothing is written after the final row, so the
# file ends without a trailing newline unless the last row is itself blank.
with open(output_path, "w", encoding="utf-8") as f:
    payload = "\n".join(conll_lines)
    f.write(payload)

print(f"Saved cleaned CoNLL format to {output_path}")


Saved cleaned CoNLL format to ../data/labels/conll_labeled_subset.txt


# Preview Final Output

In [9]:
# Print the first 25 lines of the saved file with surrounding whitespace
# stripped. readline() returns "" once the file is exhausted, so a shorter
# file simply yields blank lines for the remainder.
with open(output_path, encoding='utf-8') as f:
    remaining = 25
    while remaining > 0:
        print(f.readline().strip())
        remaining -= 1


3pcs	B-PRODUCT
silicon	I-PRODUCT
brush	I-PRODUCT
spatulas	I-PRODUCT
እስከ	O
260°c	O
ሙቀት	O
መቆቆም	O
የሚችል	O
ዋጋ-550ብር	I-PRICE
አድራሻ	O
ቁ.1	O
ስሪ	O
ኤም	O
ሲቲ	O
ሞል	O
ሁለተኛ	O
ፎቅ	O
ቢሮ	O
ቁ.	O
SL-05A(ከ	O
ሊፍቱ	O
ፊት	O
ለ	O
ፊት)	O
