In [1]:
import pandas as pd
import re

# Load the raw dataset from CSV; the cells below inspect its 'Message' column.
df = pd.read_csv('../data/modern_Data.csv')

In [None]:
# Bare expression: shows the loaded DataFrame as this notebook cell's output.
df

In [None]:
# Report how many rows have no value in the 'Message' column.
print("Checking for NaN values in the 'Message' column:")
nan_count = df['Message'].isna().sum()
print(f"Number of NaN values in 'Message' column: {nan_count}")

In [None]:
# Keep only the rows that actually carry a message.
df = df[df['Message'].notna()]

# Show the remaining (rows, columns) so the effect of the filter is visible.
print(f"Dataset shape after dropping NaN values in 'Message' column: {df.shape}")

In [None]:
# Preview the first rows of the filtered DataFrame as the cell output.
df.head()

In [None]:
# Pull the 'Message' column out on its own (a single column, despite the
# `_df` suffix) and show it as the cell output.
message_df=df['Message']
message_df

In [None]:
# Preview the first few messages as the cell output.
message_df.head()

Labeling a subset of the dataset in CoNLL format for Named Entity Recognition (NER) involves tagging each word in the text with its corresponding entity label. The goal is to annotate tokens as entities such as **Product**, **Price**, and **Location** in Amharic text. Here's how you can approach the task:

### 1. **Understand the CoNLL Format**
   - In the **CoNLL format**, each word (token) from the text is placed on a separate line followed by its **entity label**.
   - Blank lines separate different sentences or messages.
   - Common entity labels include:
     - **B-Product** (Beginning of a product)
     - **I-Product** (Inside a product)
     - **B-LOC** (Beginning of a location)
     - **I-LOC** (Inside a location)
     - **B-PRICE** (Beginning of a price)
     - **I-PRICE** (Inside a price)
     - **O** (Outside of any entity)

### 2. **Set Up the Data**
   - You will work on the **Message** column from the dataset. This column contains text describing various products, prices, and locations.

### 3. **Tokenize the Text**
   - **Tokenization** is the process of splitting the text into individual words or tokens.
   - You can use libraries like **spaCy**, **NLTK**, or manual splitting methods to break the Amharic text into individual words.

### 4. **Annotate the Entities**
   - Manually go through the tokens and annotate them using the following rules:
     - The **first word** of a product name, location, or price gets a **B-** prefix (e.g., **B-Product**, **B-LOC**, **B-PRICE**).
     - Any subsequent word within the same entity is tagged with the **I-** prefix (e.g., **I-Product**, **I-LOC**, **I-PRICE**).
     - Words that are not part of any entity are labeled **O**.
   
#### Example:
If you have the following message:  
"አዲሱ የህጻን ቦትል በ 200 ብር ተሽጠዋል"

1. **Tokenized text:**
   - አዲሱ, የህጻን, ቦትል, በ, 200, ብር, ተሽጠዋል

2. **Labeled in CoNLL format:**
   ```plaintext
   አዲሱ O
   የህጻን B-Product
   ቦትል I-Product
   በ B-PRICE
   200 I-PRICE
   ብር I-PRICE
   ተሽጠዋል O
   ```

### 5. **Repeat for 30-50 Messages**
   - Label **30-50 messages** in this format, ensuring that each entity in the text is annotated according to the rules.

### 6. **Save the Output**
   - Once all the annotations are complete, save the labeled dataset in a **plain text file** (e.g., **labeled_data.txt**).
   - Ensure the format follows the CoNLL standard: **one token per line**, with its label separated by a space, and blank lines between each message.


# Preprocessing: Removing Emojis, blank lines, and unwanted texts

In [None]:
import pandas as pd

# Read the raw CSV export into memory
def load_dataset(file_path):
    """Return the contents of the CSV file at ``file_path`` as a DataFrame."""
    return pd.read_csv(file_path)

# Code points treated as emoji/pictographs by remove_emojis(), compiled once.
# Each range is deliberately narrow: a single catch-all range such as
# U+24C2..U+1F251 would also delete ordinary text (Ethiopic Extended, CJK,
# Hangul, ...), which must survive cleaning.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics, flags
    "\U000024C2"             # circled M
    "\U000025A0-\U000025FF"  # Geometric Shapes
    "\U00002600-\U000027BF"  # Miscellaneous Symbols and Dingbats
    "\U00002900-\U0000297F"  # Supplemental Arrows-B
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # CJK-block emoji symbols
    "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


# Define a function to remove emojis
def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only characters inside the symbol ranges of ``_EMOJI_PATTERN`` are
    deleted; letters of any script, digits, whitespace and punctuation are
    left exactly as they were.

    Args:
        text: The string to clean. Must be a ``str``; missing values have to
            be filtered out by the caller first.

    Returns:
        The cleaned string (possibly empty).
    """
    return _EMOJI_PATTERN.sub("", text)

# Function to remove blank lines from the dataset
def remove_blank_lines(df, column_name):
    """Return a copy of ``df`` without rows whose ``column_name`` is blank.

    A row is dropped when its value is missing (NaN/None/NA) or is a string
    that is empty or consists only of whitespace. The frame passed in is
    left untouched; surviving rows keep their original index labels and
    values.

    Args:
        df: DataFrame to filter.
        column_name: Name of the column to inspect.

    Returns:
        A new DataFrame containing only the rows with non-blank values.
    """
    column = df[column_name]
    # Only strings can be "blank"; any other non-null value is always kept.
    is_blank = column.map(
        lambda value: isinstance(value, str) and not value.strip()
    ).astype(bool)
    return df[column.notna() & ~is_blank].copy()

# Function to remove unwanted texts from messages
def remove_unwanted_texts(df, column_name, unwanted_texts):
    """Strip boilerplate snippets and double quotes from a text column.

    For every string in ``df[column_name]`` each entry of ``unwanted_texts``
    is removed (in the given order, every occurrence), then all ``"``
    characters are removed and surrounding whitespace is trimmed. Values
    that are not strings (e.g. NaN) are left as they are.

    The column is rewritten in one pass instead of cell by cell, so rows
    that share an index label are each cleaned independently.

    Args:
        df: DataFrame holding the messages; its column is updated in place.
        column_name: Name of the column to clean.
        unwanted_texts: Substrings to delete from each message.

    Returns:
        The same ``df`` object, with the cleaned column.
    """
    def _clean(message):
        # Leave missing / non-text cells alone.
        if not isinstance(message, str):
            return message
        for unwanted_text in unwanted_texts:
            message = message.replace(unwanted_text, "")
        return message.replace('"', "").strip()

    df[column_name] = df[column_name].map(_clean)
    return df

# Persist the cleaned rows to disk
def save_cleaned_dataset(df, output_file):
    """Write ``df`` to ``output_file`` as CSV (index omitted) and print a confirmation."""
    df.to_csv(path_or_buf=output_file, index=False)
    print(f"Cleaned dataset saved to {output_file}")

# Main function
def main(input_file, output_file, column_name, unwanted_texts):
    """Run the cleaning pipeline on one CSV file and save the result.

    Steps, in order: load ``input_file``; drop rows with a missing
    ``column_name``; strip emojis from that column; drop rows left blank;
    remove the ``unwanted_texts`` snippets and double quotes; write the
    cleaned frame to ``output_file``.

    Args:
        input_file: Path of the CSV file to clean.
        output_file: Path the cleaned CSV is written to.
        column_name: Name of the column that holds the message text.
        unwanted_texts: Substrings to delete from every message.
    """
    df = load_dataset(input_file)

    # Handle missing values. The explicit copy gives the column assignment
    # below an independent frame to write into, rather than a filtered view
    # of the loaded data.
    df = df.dropna(subset=[column_name]).copy()

    # Strip emojis from the message column chosen by the caller
    df[column_name] = df[column_name].apply(remove_emojis)

    # Remove blank lines (rows whose message is now empty)
    df = remove_blank_lines(df, column_name)

    # Clean the messages by removing unwanted texts and double quotes
    cleaned_df = remove_unwanted_texts(df, column_name, unwanted_texts)

    # Save the cleaned dataset to a new CSV file
    save_cleaned_dataset(cleaned_df, output_file)

if __name__ == "__main__":
    # Boilerplate snippets (channel links, slogans, phone numbers, promo
    # banners) to strip from every message. They are removed in this order.
    unwanted_texts = [
        "ቴሌግራምt.me/modernshoppingcenter",
        '"በአዲስ ነገረ ሁሌም ቀዳሚዏች ነን"',
        "t.me/modernshopping1",
        "t.me/modernshopping2",
        "በስራችን ላይ ቅሬታ ካለዎት ብቻ በዚህ ስልክ ደዉለዉ ያሳዉቁን።",
        "0956415152",
        "0924743736",
        "0974978584",
        '"በሞደርን እቃወዏች ሂወትዎን',
        ' ሞደርናይዝ ያድርጉ"',
        'የመረጡትን እቃ ለማዘዝ ከታች ባለዉ የቴሌግራም አድራሻ ይላኩልን',
        'ተጀመረ ተጀመረ ተጀመረ',
        'ልዩ እዉነተኛ የበዓል ቅናሽ',
        'ከነሐሴ 29 እስከ መስከረም 7 ድረስ የሚቆይ እዉነተኛ ቅናሽ አድርገናል።',
        'ለክፍለሀገር ደንበኞቻችን ባሉበት ሐገር በመናሐሪያ እንልካለን።',
    ]

    # Column of the CSV that holds the message text
    column_name = "Message"

    # Where the raw data is read from and where the cleaned copy goes
    input_file = "../data/modern_Data.csv"
    output_file = "../data/cleaned_dataset.csv"

    # Kick off the cleaning pipeline
    main(
        input_file=input_file,
        output_file=output_file,
        column_name=column_name,
        unwanted_texts=unwanted_texts,
    )




# Labelling the dataset: Tokenizing and Labelling

In [None]:
import pandas as pd

# Read the cleaned CSV that will be annotated
def load_dataset(file_path):
    """Parse the CSV file at ``file_path`` and return it as a DataFrame."""
    return pd.read_csv(file_path)

# Whitespace tokenizer for the Amharic messages
def tokenize_message(message):
    """Split ``message`` on whitespace and return the list of tokens.

    Anything that is not a ``str`` (for example a float NaN from an empty
    CSV cell) yields an empty list instead of raising.
    """
    if not isinstance(message, str):
        return []
    return message.split()

# Labels accepted by annotate_message(); exactly the tags offered in its prompt.
_VALID_LABELS = (
    "B-Product",
    "I-Product",
    "B-LOC",
    "I-LOC",
    "B-PRICE",
    "I-PRICE",
    "O",
)


# Manual annotation function
def annotate_message(tokens):
    """Interactively collect one entity label per token.

    Each token is printed and the user is prompted for its label.
    Surrounding whitespace is stripped from the entry, and the prompt is
    repeated until the entry is one of ``_VALID_LABELS``, so typos and stray
    spaces never reach the labeled output.

    Args:
        tokens: Iterable of token strings belonging to one message.

    Returns:
        A list of ``(token, label)`` tuples in the order of ``tokens``.
    """
    labeled_tokens = []
    print("\nStart labeling each token:")
    for token in tokens:
        print(f"Token: {token}")
        while True:
            label = input("Enter label (B-Product, I-Product, B-LOC, I-LOC, B-PRICE, I-PRICE, O): ").strip()
            if label in _VALID_LABELS:
                break
            # Ask again rather than silently recording an unknown tag.
            print(f"Invalid label {label!r}; please enter one of the listed labels.")
        labeled_tokens.append((token, label))
    return labeled_tokens

# Write the annotations out in CoNLL layout
def save_to_conll(labeled_data, output_file):
    """Write ``labeled_data`` to ``output_file`` in CoNLL format.

    ``labeled_data`` is an iterable of messages, each an iterable of
    ``(token, label)`` pairs. Every pair becomes one ``token label`` line and
    each message is followed by an empty line. The file is opened for
    writing as UTF-8.
    """
    with open(output_file, mode='w', encoding='utf-8') as conll_file:
        for labeled_message in labeled_data:
            conll_file.writelines(
                f"{token} {label}\n" for token, label in labeled_message
            )
            # Empty line marks the end of this message
            conll_file.write("\n")

# Entry point for one interactive annotation session
def main(file_path, output_file, start_row, num_rows):
    """Label a slice of the dataset by hand and save it in CoNLL format.

    Loads ``file_path``, takes ``num_rows`` rows starting at position
    ``start_row``, and for every row whose 'Message' value tokenizes to at
    least one token prompts for a label per token. All collected
    annotations are written to ``output_file`` after the last row.
    """
    dataset = load_dataset(file_path)
    stop_row = start_row + num_rows
    annotated_messages = []

    # Walk the requested positional slice; the 'Message' column is required.
    for row_label, record in dataset.iloc[start_row:stop_row].iterrows():
        text = record['Message']
        words = tokenize_message(text)

        # Nothing to label for empty or non-string messages.
        if not words:
            continue

        print(f"\nMessage {row_label + 1}: {text}")
        annotated_messages.append(annotate_message(words))

    # Persist everything that was labeled in this session.
    save_to_conll(annotated_messages, output_file)
    print(f"\nAnnotated data saved to {output_file}")

if __name__ == "__main__":
    # Input CSV to annotate and the text file that receives the labels
    dataset_path = "../data/cleaned_dataset.csv"
    output_file = "../data/labeled_data.txt"

    # Ask which rows to label in this session
    start_row = int(input("Enter the starting row: "))
    num_rows = int(input("Enter the number of rows to label: "))

    main(
        file_path=dataset_path,
        output_file=output_file,
        start_row=start_row,
        num_rows=num_rows,
    )
