In [1]:
# task1_data_preprocessing.ipynb
# Amharic Telegram Data Preprocessing for NER

import pandas as pd
import re

# Read the raw Telegram export from Excel (point this at your own file when running locally)
telegram_data_path = "telegram_data.xlsx"
df = pd.read_excel(telegram_data_path)

# Fail fast when the sheet lacks the column every later step relies on
if "Message" not in df.columns:
    raise ValueError("Expected 'Message' column not found in the dataset.")

# Keep only the rows that actually carry message content
df = df.dropna(subset=["Message"]).copy()

# Noise patterns stripped by clean_amharic_text, applied in this order.
# Order matters: URLs, mentions and hashtags must be removed as whole units
# before the generic Latin/digit pattern can chop them into fragments.
_NOISE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        # URLs; the dot in "t.me" is escaped so it only matches a literal dot
        # (unescaped, it also matched words such as "time" or "t me").
        r"http\S+|www\S+|t\.me\S+",
        # @mentions
        r"@[A-Za-z0-9_]+",
        # Hashtags (Ethiopic block U+1200-U+137F or word characters)
        r"#[\u1200-\u137F\w]+",
        # Latin letters and ASCII digits
        r"[A-Za-z0-9]+",
        # Emoji and pictographs: supplementary planes plus the BMP ranges that
        # hold common emoji (Misc Technical, Misc Symbols, Dingbats, Misc
        # Symbols and Arrows) and the zero-width joiner / keycap / variation
        # selector code points that accompany them.
        r"[\U00010000-\U0010ffff\u2300-\u23FF\u2600-\u27BF\u2B00-\u2BFF"
        r"\u200D\u20E3\uFE0F]",
    )
)


# Function to clean Amharic Telegram messages
def clean_amharic_text(text):
    """Strip non-Amharic noise from a Telegram message.

    Removes, in order: URLs (``http...``, ``www...``, ``t.me...``), @mentions,
    hashtags, runs of Latin letters / ASCII digits, and emoji. Whitespace is
    then collapsed to single spaces and trimmed.

    Each removed span is replaced by a space rather than by nothing, so two
    words separated only by noise (e.g. an emoji) stay separate tokens instead
    of being fused together.

    Args:
        text: The raw message. Non-string values are coerced with ``str()``.

    Returns:
        The cleaned message as a ``str``; an empty string when nothing
        remains after cleaning.
    """
    cleaned = str(text)
    for noise in _NOISE_PATTERNS:
        # Substitute a space (not "") so neighbouring words are not merged.
        cleaned = noise.sub(" ", cleaned)
    # Normalize whitespace
    return re.sub(r"\s+", " ", cleaned).strip()

# Apply cleaning
print("Cleaning messages...")
df["Cleaned_Message"] = df["Message"].apply(clean_amharic_text)

# Remove messages that became empty after cleaning.
# .copy() makes the filtered frame independent of the one it was sliced from,
# so adding the "Tokens" column below is an unambiguous write and does not
# raise SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[df["Cleaned_Message"] != ""].copy()

# Tokenize messages by whitespace (you can later replace this with more advanced tokenization)
df["Tokens"] = df["Cleaned_Message"].apply(str.split)

# Save preprocessed tokens for manual NER labeling: one row per token,
# every token pre-filled with the default "O" (outside) tag.
export_path = "amharic_ner_tokens_for_labeling.csv"
exploded = df[["Cleaned_Message", "Tokens"]].explode("Tokens").reset_index(drop=True)
exploded = exploded.rename(columns={"Tokens": "Token"})
exploded["Label"] = "O"  # Default label for manual tagging
# Only the token and its label are written; the source message column is left out of the file.
exploded[["Token", "Label"]].to_csv(export_path, index=False)

print(f"✅ Preprocessing complete. Saved labeled token file to: {export_path}")


Cleaning messages...
✅ Preprocessing complete. Saved labeled token file to: amharic_ner_tokens_for_labeling.csv
