In [1]:
import re

def preprocess_message(user, message):
    """
    Format a single chat message as ``"<user>: <message>"``.

    Parameters
    ----------
    user : object
        Speaker name. ``None``, float NaN and any falsy value (such as an
        empty string) are treated as "no speaker", in which case only the
        cleaned message is returned.
    message : object
        Message text. Anything that is not a ``str`` is treated as an
        empty message.

    Returns
    -------
    str
        The cleaned message, prefixed with ``"<user>: "`` when a speaker
        is present.

    Notes
    -----
    Every ``":"`` inside the message is replaced with ``" |"`` so that the
    first ``": "`` in the result always separates speaker from text.
    """
    if not isinstance(message, str):
        message = ""

    # A strict encode/decode round-trip is a no-op for valid text but raises
    # UnicodeEncodeError on lone surrogates. errors="replace" substitutes "?"
    # for anything that cannot be encoded, so the result is always valid UTF-8.
    message = message.encode("utf-8", errors="replace").decode("utf-8")

    # Replace existing colons to avoid ambiguity with the "user: " separator
    message = message.replace(":", " |")

    # pandas represents a missing name as float NaN, which is truthy; without
    # this guard the output would start with the literal text "nan: ".
    if user is None or (isinstance(user, float) and user != user):
        return message

    return f'{user}: {message}' if user else message

def preprocess_messages_with_usernames(df):
    """
    Drop noise rows from a chat log and add a 'Processed Content' column.

    The input frame is never modified: all work happens on a copy.

    A row is removed when ANY of the following holds:

    * the message is empty, or is the literal text "NaN" (any case);
    * the message is only a link, only emoji, or emoji followed by a link;
    * the speaker name contains "bot" (case-insensitive);
    * the message contains one of the blocked phrases;
    * it has two or more '#' characters and is longer than 80 characters;
    * it mentions both "instagram" and "facebook";
    * it is longer than 700 characters with no CJK ideograph, or longer
      than 1000 characters regardless;
    * it is 5 characters or shorter;
    * it consists only of symbols or only of digits;
    * it contains a link and none of the allowed domain fragments;
    * it is just one or two English words, optionally wrapped in symbols.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Who' and 'Chat Content'.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows (original index labels are
        kept) with two extra columns: 'CharCount' (message length) and
        'Processed Content' (output of ``preprocess_message``).

    Raises
    ------
    ValueError
        If either required column is missing.
    """

    # Ensure the required columns exist
    if "Who" not in df.columns or "Chat Content" not in df.columns:
        raise ValueError("DataFrame must contain 'Who' and 'Chat Content' columns.")

    def _roundtrip_utf8(text):
        # errors="replace" turns unencodable code points (lone surrogates)
        # into "?" instead of raising UnicodeEncodeError.
        return text.encode("utf-8", errors="replace").decode("utf-8")

    # Work on a private copy so the caller's DataFrame is left untouched.
    df = df.copy()

    # Normalize 'Who' and 'Chat Content'; non-string messages become "".
    df['Who'] = df['Who'].apply(lambda x: _roundtrip_utf8(x) if isinstance(x, str) else x)
    df['Chat Content'] = df['Chat Content'].apply(lambda x: _roundtrip_utf8(x) if isinstance(x, str) else "")

    # Define the array of blocked phrases (original filter 1)
    blocked_phrases = [
        '首充入', '秒到帳', '每筆送', '獎金高達', '報名參加', '報名:', '報名：', '，報名', ', 報名', '優惠', '日期：', '時間：', '地點：', 
        '加入TG', '全文：', '報導', '當年今日', '現正招募', '專訪','拉群', '点我', '有意請', '立即申請：', '關注我們', '尋失物', 'LIHKG', 
        'lih.kg', 'play.google.com', 'Find out more', '得獎內容', '問卷連結', '公告：', 'Happy birthday', '生日快樂'
    ]

    # Domain fragments that make a link acceptable (from filter 2).
    # These are matched as plain substrings anywhere in the message.
    allowed_domains = ['.uk', '.edu']

    content = df['Chat Content']
    stripped = content.str.strip()

    # Message length, kept as an output column.
    df['CharCount'] = content.str.len()
    char_count = df['CharCount']

    # Computed once; shared by the two "no Chinese" conditions below.
    has_chinese = content.str.contains(r'[\u4e00-\u9fff]', na=False)

    # --- Conditions from original filter 1 ---
    cond_empty = stripped == ''
    cond_nan = content.isna() | (stripped.str.upper() == 'NAN')
    cond_link_only = stripped.str.match(r'^(https?://\S+|www\.\S+)$', na=False)
    cond_emoji_only = content.str.match(r'^[\U0001F300-\U0001F6FF]+$', na=False)
    cond_emoji_with_link = content.str.match(r'^[\U0001F300-\U0001F6FF]+\s+https?://\S+$', na=False)
    cond_who_contains_bot = df['Who'].str.contains('bot', case=False, na=False)
    cond_blocked_phrases = content.str.contains('|'.join(map(re.escape, blocked_phrases)), case=False, na=False)

    cond_two_hash_and_word_count = (content.str.count('#') >= 2) & (char_count > 80)

    # The plain-word test already covers the ".com" link test; both are kept
    # so the rule set mirrors the original filter definitions.
    cond_instagram_and_facebook_words = (
        content.str.contains('instagram', case=False, na=False) &
        content.str.contains('facebook', case=False, na=False)
    )
    cond_instagram_and_facebook_links = (
        content.str.contains(r'instagram\.com', case=False, na=False) &
        content.str.contains(r'facebook\.com', case=False, na=False)
    )
    cond_instagram_and_facebook = cond_instagram_and_facebook_words | cond_instagram_and_facebook_links

    cond_long_no_chinese = (char_count > 700) & ~has_chinese
    cond_short_no_chinese = (char_count == 1) & ~has_chinese

    # --- Conditions from filter 2 ---
    cond_symbols_only = content.str.match(r'^[\W_]+$', na=False)
    cond_numeric_only = content.str.match(r'^\d+$', na=False)
    cond_too_long = char_count > 1000

    # Suspicious links: contains a link but no allowed domain fragment.
    cond_links_or_hashtags = (
        content.str.contains(r'(?:https?://|www\.)', na=False) &
        ~content.str.contains('|'.join(map(re.escape, allowed_domains)), na=False)
    )

    # Very short messages (5 characters or fewer).
    cond_char_count_short = char_count <= 5

    # One or two English words with optional surrounding symbols or emojis.
    cond_one_two_words_with_emojis_or_symbols = content.str.match(
        r'^\s*[\W_]*[a-zA-Z]+(?:\s+[a-zA-Z]+)?[\W_]*\s*$', na=False
    )

    # A row is dropped when any single condition fires.
    mask = (
        cond_empty |
        cond_nan |
        cond_link_only |
        cond_emoji_only |
        cond_emoji_with_link |
        cond_who_contains_bot |
        cond_blocked_phrases |
        cond_two_hash_and_word_count |
        cond_instagram_and_facebook |
        cond_long_no_chinese |
        cond_short_no_chinese |
        cond_symbols_only |
        cond_numeric_only |
        cond_too_long |
        cond_links_or_hashtags |
        cond_char_count_short |
        cond_one_two_words_with_emojis_or_symbols
    )

    kept = df[~mask].copy()

    # Build the output column by zipping the two input columns. This avoids
    # the per-row Series construction of DataFrame.apply(axis=1) and its
    # special handling of empty frames when every row has been filtered out.
    kept["Processed Content"] = [
        preprocess_message(user, message)
        for user, message in zip(kept["Who"], kept["Chat Content"])
    ]

    return kept

In [2]:
import pandas as pd
import random

# Function to generate a random "user" ID
def generate_user_id():
    """Return a placeholder speaker name: ``"user"`` followed by a random five-digit number."""
    # randint is inclusive on both ends, so the suffix is always five digits.
    suffix = random.randint(10000, 99999)
    return "user" + str(suffix)

# Function to assign random user IDs to empty rows in the 'Who' column
def assign_user_ids(df, column_name):
    """
    Fill blank speaker names with generated placeholder ids, in place.

    Each run of consecutive blank cells (NaN/None or the empty string) in
    ``column_name`` receives one shared id from ``generate_user_id()``. A
    non-blank cell ends the run, so the next blank run gets a new id.

    Generated ids are guaranteed to differ from each other and from every
    name already present in the column, so separate blank runs are never
    merged into one speaker by a random collision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. The column is replaced on this same object.
    column_name : str
        Name of the speaker column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Raises
    ------
    RuntimeError
        If no unused id can be drawn within a bounded number of attempts.
    """
    # Collect the values as Python objects and write the column back once.
    # Writing strings cell by cell into an all-NaN (float64) column is an
    # incompatible-dtype assignment that newer pandas warns about or rejects.
    names = df[column_name].astype(object).tolist()

    # Names that a freshly generated id must not duplicate.
    used_ids = {name for name in names if isinstance(name, str) and name != ""}
    max_attempts = 1000

    current_user_id = None
    for position, name in enumerate(names):
        if pd.isna(name) or name == "":  # Blank cell
            if current_user_id is None:
                # Start of a blank run: draw until the id is unused.
                for _ in range(max_attempts):
                    candidate = generate_user_id()
                    if candidate not in used_ids:
                        break
                else:
                    raise RuntimeError(
                        f"Could not generate an unused user id after {max_attempts} attempts."
                    )
                used_ids.add(candidate)
                current_user_id = candidate
            names[position] = current_user_id
        else:
            current_user_id = None  # Reset when encountering a non-empty row

    # Positional assignment, so it also works when index labels repeat.
    df[column_name] = names
    return df

# Main function to process the file
def process_who_column(input_file):
    """
    Read a chat-log CSV and fill the blank cells of its 'Who' column.

    ``input_file`` is passed straight to ``pandas.read_csv``. The loaded
    frame is handed to ``assign_user_ids`` and the resulting frame is
    returned.
    """
    loaded = pd.read_csv(input_file)
    return assign_user_ids(loaded, column_name='Who')


In [3]:
# Paths for the raw chat export and the filtered output.
input_file = './9000_messages.csv'
output_file = './9000_result.csv'

# Load the CSV and fill blank speakers, then filter noise rows and format messages.
df = preprocess_messages_with_usernames(process_who_column(input_file))

# Persist the result without the pandas index column.
df.to_csv(output_file, index=False)