# Data Labeling

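Labeling is the process of attaching meaningful tags (labels) to raw data. In this notebook, each token of the tokenized Telegram messages is tagged with an entity label (e.g. `B-PRODUCT`, `I-PRICE`, `O`), and the result is saved in CoNLL format.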

In [1]:
# import basic libraries
import ast
import warnings

import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

In [2]:
# Make the parent of the current working directory importable so that the
# project-local `scripts` package (imported in the next cell) can be found.
import os
import sys

project_root = os.path.abspath('..')
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
from scripts.token_labeling import label_tokens

## Load preprocessed and tokenized data

In [4]:
# Load the tokenized messages. The first CSV column is an unnamed row index
# (it would otherwise load as a spurious "Unnamed: 0" column), so use it as
# the DataFrame index instead of keeping it as data.
tokenized_data = pd.read_csv('../data/tokenized_telegram_data.csv', index_col=0)

In [5]:
# Preview the first five rows of the loaded frame
tokenized_data.head(n=5)

Unnamed: 0,Message Text
0,"['nipple', 'shield', 'የእናት', 'ጡት', 'ጫፍ', 'ማራዘሚ..."
1,"['Marc', 'Jacob', '3', 'in', '1', 'glasses']"
2,"['Marc', 'Jacob', '3', 'in', '1', 'sunglass', ..."
3,"['Door', 'Bottom', 'Sealer']"
4,"['Door', 'Bottom', 'Sealer', 'አየር', 'ከውጭ', 'ወደ..."


In [6]:
# Pull out the column that holds the tokenized messages as a Series
text_column = "Message Text"
messages = tokenized_data[text_column]


In [7]:
# Display the extracted Series to sanity-check its contents and length
messages

0       ['nipple', 'shield', 'የእናት', 'ጡት', 'ጫፍ', 'ማራዘሚ...
1            ['Marc', 'Jacob', '3', 'in', '1', 'glasses']
2       ['Marc', 'Jacob', '3', 'in', '1', 'sunglass', ...
3                            ['Door', 'Bottom', 'Sealer']
4       ['Door', 'Bottom', 'Sealer', 'አየር', 'ከውጭ', 'ወደ...
                              ...                        
5686    ['Ladies', 'Nike', 'Made', 'in', 'Vietnam', 'S...
5687    ['፧', 'ተሽጦ', 'አልቋል', 'Nike', 'kobe', '11', 'Ma...
5688    ['Nike', 'airmax', 'flyknite', 'Made', 'in', '...
5689    ['Nike', 'airmax', 'flyknite', 'Made', 'in', '...
5690    ['Nike', 'airmax', 'tailwind', 'Made', 'in', '...
Name: Message Text, Length: 5691, dtype: object

## Token labeling

In [8]:
# Parse each message into tokens and label them.
# Every entry in `messages` is the string form of a Python list (for example
# "['Door', 'Bottom', 'Sealer']"). ast.literal_eval rebuilds the list while
# accepting only Python literals; unlike eval(), it cannot execute arbitrary
# code that might be embedded in the data file.
labeled_data = [
    label_tokens(ast.literal_eval(message))
    for message in messages
]

In [9]:
# Preview the labeled output for the first 10 messages.
# NOTE(review): in the saved preview, phone numbers and a shop number are
# tagged I-PRICE, and a location token is tagged I-LOC with no preceding
# B-LOC -- verify the rules in label_tokens.
labeled_data[:10]

[[('nipple', 'B-PRODUCT'),
  ('shield', 'I-PRODUCT'),
  ('የእናት', 'O'),
  ('ጡት', 'O'),
  ('ጫፍ', 'O'),
  ('ማራዘሚያ', 'O'),
  ('ዋጋ', 'B-PRICE'),
  ('450', 'I-PRICE'),
  ('ብር', 'I-PRICE'),
  ('0911762201', 'I-PRICE'),
  ('0972824252', 'I-PRICE'),
  ('0988404491', 'I-PRICE'),
  ('0922282582', 'I-PRICE'),
  ('በቴሌግራም', 'O'),
  ('ለማዘዝ', 'O'),
  ('@GebeyaAdama21', 'O'),
  ('አድራሻችን', 'O'),
  ('አዳማ', 'I-LOC'),
  ('ፖስታ', 'O'),
  ('ቤት', 'O'),
  ('ሶሬቲ', 'O'),
  ('ሞል', 'O'),
  ('ምድር', 'O'),
  ('ላይ', 'O'),
  ('ሱ.ቁ', 'O'),
  ('33', 'I-PRICE'),
  ('ይሄንን', 'O'),
  ('በመጫን', 'O'),
  ('የቤተሠባችን', 'O'),
  ('አባል', 'O'),
  ('ይሁኑ', 'O'),
  ('የመረጡትን', 'O'),
  ('እቃ', 'O'),
  ('ይዘዙ', 'O'),
  ('፤', 'O'),
  ('ያሉበት', 'O'),
  ('እናደርሳለን', 'O'),
  ('!!', 'O'),
  ('በኪስዎ', 'O'),
  ('ጥሬ', 'O'),
  ('ገንዘብ', 'O'),
  ('ካልያዙ', 'O'),
  ('በሞባይል', 'O'),
  ('ማስተላለፍ', 'O'),
  ('ይችላሉ', 'O'),
  ('።', 'O')],
 [('Marc', 'B-PRODUCT'),
  ('Jacob', 'O'),
  ('3', 'I-PRICE'),
  ('in', 'O'),
  ('1', 'I-PRICE'),
  ('glasses', 'I-PRODUCT')],
 [('M

## Generate CoNLL Format

In [10]:
# Save the labeled data in CoNLL format: one "token<TAB>label" line per token,
# with a blank line after each sentence (message).
output_file_path = '../data/labeled_data.conll'
with open(output_file_path, "w", encoding="utf-8") as f:
    for sentence in labeled_data:
        f.writelines(f"{token}\t{label}\n" for token, label in sentence)
        f.write("\n")  # Blank line marks the sentence boundary

print(f"Labeled data saved to {output_file_path}.")

Labeled data saved to ../data/labeled_data.conll.
