# Data Preprocessing

In [1]:
# Importing modules
import pandas as pd
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from scripts.data_preprocess import DataPreprocess
from scripts.label import Label

In [2]:
# Load csv file
df = pd.read_csv('../data/telegram_data.csv')
df

Unnamed: 0,Channel Title,Channel Username,ID,Sender,Message,Date,Views
0,AwasMart-አዋስማርት🎁,@AwasMart,11892,AwasMart,,2025-06-27 16:32:23+00:00,856.0
1,AwasMart-አዋስማርት🎁,@AwasMart,11891,AwasMart,,2025-06-27 16:32:23+00:00,862.0
2,AwasMart-አዋስማርት🎁,@AwasMart,11890,AwasMart,,2025-06-27 16:32:23+00:00,862.0
3,AwasMart-አዋስማርት🎁,@AwasMart,11889,AwasMart,,2025-06-27 16:32:23+00:00,862.0
4,AwasMart-አዋስማርት🎁,@AwasMart,11888,AwasMart,,2025-06-27 16:32:23+00:00,860.0
...,...,...,...,...,...,...,...
4995,Fashion tera,@Fashiontera,2651,Fashiontera,"Chekich \nMade in Turkey \n Size 40,41,42,43\...",2021-11-26 19:31:57+00:00,15858.0
4996,Fashion tera,@Fashiontera,2650,Fashiontera,,2021-11-26 19:30:14+00:00,15289.0
4997,Fashion tera,@Fashiontera,2649,Fashiontera,,2021-11-26 19:30:14+00:00,15156.0
4998,Fashion tera,@Fashiontera,2648,Fashiontera,DISELE \nPrice 1300 \n(Free Delivery)\nInbox @...,2021-11-25 16:52:23+00:00,23476.0


In [3]:
# Preprocess the data
meta, content, df_processed = DataPreprocess.preprocess_dataframe(df)
df_processed.head()

Unnamed: 0,Channel Title,Channel Username,ID,Sender,Message,Date,Views,Cleaned_Message,Tokens
0,AwasMart-አዋስማርት🎁,@AwasMart,11892,AwasMart,,2025-06-27 16:32:23+00:00,856.0,,[]
1,AwasMart-አዋስማርት🎁,@AwasMart,11891,AwasMart,,2025-06-27 16:32:23+00:00,862.0,,[]
2,AwasMart-አዋስማርት🎁,@AwasMart,11890,AwasMart,,2025-06-27 16:32:23+00:00,862.0,,[]
3,AwasMart-አዋስማርት🎁,@AwasMart,11889,AwasMart,,2025-06-27 16:32:23+00:00,862.0,,[]
4,AwasMart-አዋስማርት🎁,@AwasMart,11888,AwasMart,,2025-06-27 16:32:23+00:00,860.0,,[]


In [4]:
# Save the processed DataFrame to a new CSV file
output_path = '../data/telegram_data_processed.csv'
df_processed.to_csv(output_path, index=False)
print(f"Processed data saved to {output_path}")

Processed data saved to ../data/telegram_data_processed.csv


In [6]:
# Find unique tokens in the 'Tokens' column
unique_tokens = set()
for tokens in df_processed['Tokens']:
    unique_tokens.update(tokens)
unique_token_list = list(unique_tokens)
print(unique_token_list)


['መለጠፊያ', '4xl', 'አዋስማርት', 'watch', 'ሁሌም', 'accessible', 'ቤተ', 'Included', 'quick', 'damaging', 'ምንጣፍ', 'ይዘው', 'Butyl', 'መስርያ', 'Worldwide', 'ሲልከን', 'ታብሌት', 'Roof', '3XL', 'PC', 'ለኪችን', 'flyknit', 'ልዩ', '01', 'Juicing', 'ጥራት', 'granite', 'queen', 'RAF', 'Toy', 'Handy', 'Floor', 'tops', 'በሚኖርበት', 'Bathing', 'ቀረ', 'የታመሙ', 'ጡንቻዎትን', 'Attached', '28cm', 'AIRFORCE', 'እግሮችን', 'Fur', 'access', 'Foot', 'ሰትሪት', 'GOOD', 'በፈለግነው', '260ml', 'በመላጨት', 'ለሉጫ', 'አትክልቶችን', 'ሰዉ', 'Effective', 'ስርአት', 'Pain', 'instructions', 'ነፍሳቶችን', 'ለሻውር', 'ለምሳ', 'ማስፋት', 'ከፀሀይ', '1200mA', '400ML', 'BOSS', 'ከሰኞ', '5L', '400W', 'Controlled', 'ፈጥር', '250', 'an', 'camping', 'ጆግ', 'የልብስ', 'school', 'Coat', '172', 'Stand', 'weight', 'Complet', 'Outside', 'ጥንቃቄ', 'leakage', 'Scraping', 'ወፍራም', 'Hat', 'Flex', 'በእድሜ', 'ያደርገዋል', 'herbs', 'ለማዘገጀት', 'Derma', 'አንፀባራቂ', '2000mAh', 'it', 'አማራጭ', '19pcs', '3Pcs', 'Door', 'IN', 'Boost', 'ለማጠጣት', 'ትንሳኤ', 'ayyaana', 'Rolly', 'ቴላቴሊ', 'አትክልት', 'Wifi', 'ካፕ', 'ለመላው', 'ለመክፈት', 'ይጋግራል', 'nozzl

In [7]:
# Label a Subset of Dataset in CoNLL Format
Label.label_dataframe(df_processed)

CoNLL-formatted data saved to ../data/telegram_data_conll.txt
