# new approaches

In [18]:
import os
import json
import pandas as pd
import openai

In [37]:
# Initialize the OpenAI client


## 1. Initialize or Load Categories

In [20]:
def load_categories_from_file(file_path):
    """
    Return the category -> subcategories mapping stored in a JSON file.

    When the file is missing, or its content is not valid JSON, the file is
    (re)created through initialize_empty_file and that default mapping is
    returned instead.
    """
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as handle:
            try:
                loaded = json.load(handle)
            except json.JSONDecodeError:
                # Corrupt content: report it and fall back to the defaults.
                print(f"Invalid JSON detected in {file_path}. Reinitializing the file.")
                loaded = initialize_empty_file(file_path)
        return loaded

    # No file yet: create it with the default categories.
    return initialize_empty_file(file_path)


def initialize_empty_file(file_path):
    """
    Create the categories JSON file with the built-in default taxonomy.

    The defaults map each main category name to its list of subcategory
    names. They are written via save_categories_to_file and then returned.

    Parameters:
        file_path (str): Destination of the categories JSON file.

    Returns:
        dict: The default category -> subcategories mapping.
    """
    categories = {
        "Housing and Living Arrangements": [
            "Rising Rent and Housing Costs",
            "Finding Affordable and Safe Accommodation",
            "Shared Housing and Roommate Dynamics",
            "Adjusting to Different Housing Standards",
            "Living with Extended Family or Parents",
        ],
        "Employment and Economic Opportunities": [
            "Job Market Competition",
            "Skill and Credential Recognition",
            "Overqualification and Career Downgrades",
            "Workplace Culture and Integration",
            "Access to Professional Networking Opportunities",
            "Balancing Work with Family Responsibilities",
        ],
        "Healthcare and Well-Being": [
            "Understanding Local Healthcare Systems",
            "Access to Primary and Emergency Care",
            "Mental Health Resources Availability",
            "Financial Barriers to Healthcare",
            "Navigating Health Insurance Systems",
            "Communicating Effectively with Healthcare Providers",
        ],
        "Social and Cultural Adjustment": [
            "Building Community and Social Connections",
            "Adapting to New Social Norms and Etiquette",
            "Overcoming General Language Barriers in Social Contexts",
            "Facing and Addressing Discrimination",
            "Parenting Challenges in a New Cultural Environment",
            "Celebrating and Preserving Cultural Traditions",
        ],
        "Legal and Bureaucratic Challenges": [
            "Navigating Immigration and Residency Requirements",
            "Understanding Tax Obligations",
            "Securing Visas and Work Permits",
            "Accessing Legal Aid or Advocacy Services",
            "Filing Necessary Documentation for Families",
            "Understanding Local Laws and Regulations",
        ],
        "Education and Personal Development": [
            "Accessing Education for Children and Adults",
            "Recognition of Previous Educational Credentials",
            "Enrolling in Language and Integration Programs",
            "Financial Barriers to Education and Training",
            "Exploring New Career or Academic Opportunities",
        ],
        "Transportation and Mobility": [
            "Navigating Public Transportation Systems",
            "Obtaining Driver’s Licenses or Vehicle Registration",
            "Cost and Accessibility of Transportation",
            "Challenges in Rural or Suburban Mobility",
            "Adjusting to New Traffic Rules and Regulations",
            "Learning to Drive in a New Environment",
        ],
        "Financial and Budgeting Challenges": [
            "Setting Up Bank Accounts and Building Credit",
            "Understanding Local Taxes and Financial Systems",
            "Managing Cost of Living in High-Expense Areas",
            "Sending Money Abroad to Family",
            "Planning and Budgeting for Financial Security",
        ],
        "Family Dynamics and Support": [
            "Adjusting to Changing Family Roles",
            "Reuniting with Family Across Borders",
            "Supporting Children’s Educational and Social Needs",
            "Caring for Aging Parents Remotely",
            "Managing Relationships in Cross-Cultural Marriages",
            "Strengthening Family Bonds in a New Environment",
        ],
        "Identity and Emotional Well-Being": [
            "Coping with Culture Shock and Loneliness",
            "Balancing Old and New Cultural Identities",
            "Addressing Feelings of Isolation or Marginalization",
            "Finding Support Networks for Emotional Health",
            "Building a Sense of Belonging in the New Country",
            "Overcoming Trauma and Resilience Building",
        ],
        "Public Safety and Security Concerns": [
            "Perception of Neighbourhood Safety",
            "Navigating Local Law Enforcement and Emergency Services",
            "Personal Safety Strategies",
            "Anxiety About Personal Safety",
        ],
        "Political Environment and Governance": [
            "Understanding Local Political Parties and Policies",
            "Concerns about Authoritarian or Repressive Tendencies",
            "International Relations and Policy Impact on Immigrants",
        ],
        "Economic and Financial Stability": [
            "Adapting to Market Fluctuations and Economic Changes",
            "Building Resilience Against Economic Uncertainty",
        ],
    }

    # save_categories_to_file only writes when the content differs from what
    # is already on disk, so re-initializing an identical file is a no-op.
    save_categories_to_file(categories, file_path)
    print(f"Created a new categories file at: {file_path}")
    return categories


def save_categories_to_file(categories, file_path):
    """
    Persist the category mapping as JSON, skipping the write when nothing changed.

    The current file content (an empty dict when the file is absent or not
    valid JSON) is compared with `categories`; the file is rewritten only when
    the two differ.
    """
    on_disk = {}
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as reader:
            try:
                on_disk = json.load(reader)
            except json.JSONDecodeError:
                # Unreadable content counts as "nothing stored yet".
                on_disk = {}

    if categories == on_disk:
        print("No changes detected in categories. Skipping save.")
        return

    with open(file_path, "w", encoding="utf-8") as writer:
        json.dump(categories, writer, ensure_ascii=False, indent=4)
    print(f"Updated categories saved to {file_path}")

def parse_consolidation_response(response):
    """
    Turn a GPT consolidation reply into a category -> subcategories dict.

    Expected layout: a heading line ending in ":" (e.g. "Housing and Rent:")
    followed by bullet lines that start with "•". Blank lines and any other
    lines are ignored, as are bullets that appear before the first heading.

    Parameters:
        response (str): The GPT response text.

    Returns:
        dict: Heading text (without the trailing colon) mapped to the list of
        bullet texts found beneath it.
    """
    result = {}
    heading = None

    for entry in (raw.strip() for raw in response.split("\n")):
        if not entry:
            continue

        if entry.endswith(":"):
            # New heading: drop the colon and start an empty bucket for it.
            heading = entry[:-1].strip()
            result[heading] = []
            continue

        if heading and entry.startswith("•"):
            # Bullet under the current heading: strip the marker.
            result[heading].append(entry[1:].strip())

    return result

## 2. Build the Prompt

In [21]:
def build_prompt(categories, messages):
    """
    Assemble the categorization prompt sent to GPT for one batch.

    The prompt consists of, in order: a fixed introduction, the current
    categories with their subcategories as bullet lines, the batch messages
    numbered from 1, and the fixed instruction/format/example text.
    """
    pieces = [
        "You are categorizing chat messages into predefined categories and "
        "subcategories about specific, explicitly stated living problems or challenges "
        "faced by Hong Kong people living in England.\n\n"
        "Here are the current categories and subcategories:\n"
    ]

    # Current taxonomy: one line per category, one bullet per subcategory
    for name, children in categories.items():
        pieces.append(f"{name}\n")
        pieces.extend(f"    • {child}\n" for child in children)

    # Numbered messages of this batch
    pieces.append("\nCurrent Batch of Messages:\n")
    pieces.extend(f"{number}. {text}\n" for number, text in enumerate(messages, start=1))

    # Fixed instructions, output format and examples
    pieces.append("""
    Important Instructions:
    1. Only categorize a message if it explicitly states a difficulty, hardship, or challenge related to living in England as a Hong Kong person.
    
       - For example: "I cannot afford...", "I am struggling to...", "I have difficulty...", "I face a barrier...", or any clear complaint about a problem.
       - Just mentioning a topic (like a driving test, theory test, housing, or healthcare) is NOT enough. Must explicitly describe a problem.
    
    2. If the message:
       - is just sharing an event or promotional activity without mentioning any difficulty,
       - is just a link (like a YouTube link),
       - mentions political activity, theory tests, driving tests, education, or any other topic without explicitly stating a personal difficulty,
       - is general discussion or random content without stating a personal challenge,
    
       then it MUST be categorized as "Uncategorized."
    
    3. DO NOT GUESS a problem. If not clear, choose "Uncategorized."
    
    4. If the message describes a specific difficulty, hardship, or challenge for living in England that is not covered by any of the existing main categories or their subcategories, you must introduce a new category or subcategory:
    
       - If the difficulty logically fits within an existing main category but none of its current subcategories capture this new aspect, add a new subcategory under that existing main category. This new subcategory should clearly describe the specific difficulty mentioned in the message.
       
       - If the difficulty does not fit under any existing main category at all, create a completely new main category and a relevant first subcategory. Both the main category and subcategory names must clearly reflect the nature of the newly mentioned difficulty.
    
    Important:
    - Do not reuse, copy, or refer to any categories or subcategories given as examples in these instructions. They are placeholders only.  
    - Each time you create a new category or subcategory, invent a unique and contextually appropriate name that matches the difficulty described in the message.  
    - The new category and/or subcategory must be directly related to the difficulty stated. If the message talks about a type of difficulty not previously covered, think of a descriptive name that conveys that exact challenge.
    - After introducing a new category or subcategory, do not continue categorizing further messages in this batch. Stop immediately and return only the newly created category and subcategory.
    
    For clarity:
    - If a message states a difficulty and it clearly doesn't match any existing categories or subcategories, you must come up with a new main category name and a new subcategory name that accurately describe this difficulty.  
    - If a message states a difficulty that fits an existing category but needs a more specific angle not listed, add a new subcategory to that existing main category that directly addresses the difficulty mentioned.
    
    Do not guess or approximate. If the difficulty is new, create the category or subcategory right away. Do not return 'Uncategorized' when a difficulty is explicitly described and not covered by existing categories. Instead, produce a new category and/or subcategory as required.
    5. No commentary or extra text outside the specified format.
    
    **Format:**
    <message_number>. <Main Category>
        • <Subcategory>
    
    OR if unrelated/unclassifiable:
    <message_number>. Uncategorized
    
    **EXAMPLES:**
    
    - Unrelated event (no stated difficulty):
      Message: "CLS Hair Studio offering free haircuts on November 12."
      Response:
      1. Uncategorized
    
    - Just a link:
      Message: "https://youtu.be/n4Flcllu9WA"
      Response:
      1. Uncategorized
    
    - Political or asylum mention without stated difficulty:
      Message: "Hong Kong person got asylum in 2 years."
      Response:
      1. Uncategorized
    
    - Mentioning theory/driving test without difficulty:
      Message: "I took a theory test yesterday."
      Response:
      1. Uncategorized
    
    - Relevant (explicitly stating a difficulty):
      Message: "I am struggling to find affordable housing as a Hong Kong migrant in England."
      Response:
      1. Housing and Living Arrangements
          • Finding Affordable and Safe Accommodation
    
    If the message does not explicitly say it is having trouble, difficulty, or a challenge, always choose 'Uncategorized'.
    No guessing.
    If a new category is needed, introduce it and stop.
    """)

    return "".join(pieces)

## 3. Process a Batch

In [22]:
def process_batch(client, prompt):
    """
    Send one prompt to the "gpt-4" chat model and return the stripped reply text.

    Any exception raised while calling the client or reading the reply is
    printed and reported to the caller as the literal string "Error".
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        completion = client.chat.completions.create(model="gpt-4", messages=conversation)
        reply = completion.choices[0].message.content
        return reply.strip()
    except Exception as exc:
        # Callers compare the return value against "Error" to detect failure.
        print(f"Error processing batch: {exc}")
        return "Error"

## 4. Update Categories

In [23]:
def update_categories_from_response(response, categories):
    """
    Merge categories/subcategories named in a GPT reply into `categories`.

    `categories` is modified in place. A numbered line ("1. Main Category")
    selects the active main category ("Uncategorized" clears it); each
    following bullet line ("•" or "-") is added as a subcategory of the
    active category unless already present. A main category is only created
    once a bullet line follows it.

    Returns True if at least one new main category was created, else False
    (new subcategories under existing categories do not count).
    """
    introduced = False
    active = None

    for raw in response.split("\n"):
        text = raw.strip()
        if not text:
            continue

        if text[0].isdigit() and "." in text:
            # "N. Label" line: everything after the first dot is the label.
            label = text.split(".", 1)[1].strip()
            active = None if label.lower() == "uncategorized" else label
            continue

        if not active or not text.startswith(("•", "-")):
            continue

        sub = text.lstrip("•-").strip()
        if sub.lower() == "uncategorized":
            continue

        if active not in categories:
            categories[active] = []
            introduced = True

        bucket = categories[active]
        if sub not in bucket:
            bucket.append(sub)

    return introduced

## 5. Messages Batch Processing

In [30]:
# def process_messages_in_batches(client, df, categories_file, batch_size=50, overlap=20):
#     categories = load_categories_from_file(categories_file)

#     results = [("Uncategorized", "Uncategorized")] * len(df)

#     current_index = 0
#     while current_index < len(df):
#         end = min(current_index + batch_size, len(df))
#         batch_messages = df["Chat Content"].iloc[current_index:end].tolist()
#         batch_messages = [str(msg) if isinstance(msg, str) else "" for msg in batch_messages]

#         prompt = build_prompt(categories, batch_messages)
#         response = process_batch(client, prompt)
#         print("Raw GPT Response:\n", response)  # Print raw response for debugging

#         if response == "Error":
#             print(f"Error processing batch {current_index} to {end}")
#             current_index = end
#             continue

#         # Check if new category introduced
#         new_category_added = update_categories_from_response(response, categories)
#         if new_category_added:
#             save_categories_to_file(categories, categories_file)
#             print("New category found and added. Restarting from the same batch with updated categories.")

#             # Clear results for this batch since we are reprocessing
#             for i in range(current_index, end):
#                 results[i] = ("Uncategorized", "Uncategorized")

#             # Re-run same batch with updated categories
#             continue

#         # Parse the response lines into results
#         response_lines = response.split("\n")
#         batch_index = current_index
#         expecting_subcategory = False
#         current_category = None

#         for line in response_lines:
#             line_stripped = line.strip()
#             if not line_stripped:
#                 continue

#             if line_stripped[0].isdigit() and "." in line_stripped:
#                 parts = line_stripped.split(".", 1)
#                 cat_text = parts[1].strip()
#                 if cat_text.lower() == "uncategorized":
#                     if batch_index < len(results):
#                         results[batch_index] = ("Uncategorized", "Uncategorized")
#                         batch_index += 1
#                     expecting_subcategory = False
#                 else:
#                     current_category = cat_text
#                     expecting_subcategory = True
#                 continue

#             if expecting_subcategory and line_stripped.startswith(("•", "-")):
#                 clean_line = line_stripped.lstrip("•-").strip()
#                 subcategory = clean_line
#                 if batch_index < len(results):
#                     results[batch_index] = (current_category, subcategory)
#                     batch_index += 1
#                 expecting_subcategory = False
#                 continue

#             # If format not followed, uncategorized
#             if batch_index < len(results):
#                 results[batch_index] = ("Uncategorized", "Uncategorized")
#                 batch_index += 1
#             expecting_subcategory = False

#         current_index = end

#     df["Category"] = [r[0] for r in results]
#     df["Subcategory"] = [r[1] for r in results]

#     print("Final DataFrame with Categories and Subcategories:\n", df[["Category", "Subcategory"]].head(10))
#     # df.to_csv("./categorized_messages.csv", index=False)
#     # print("Updated CSV saved to ./categorized_messages.csv")

#     return df, categories

## 6. Messages Batch Processing with Max-Token Error Handling

In [89]:
def _parse_batch_response(response, message_count):
    """
    Map a numbered GPT reply onto per-message (category, subcategory) pairs.

    The number printed by the model ("4. Legal ...") selects the message, so
    replies that skip messages or list them out of order stay aligned with the
    batch. A message is categorized only when its numbered category line is
    directly followed by a bullet line ("•" or "-"); every other message is
    left as ("Uncategorized", "Uncategorized").
    """
    uncategorized = ("Uncategorized", "Uncategorized")
    parsed = [uncategorized] * message_count

    pending_slot = None       # position of the message waiting for its bullet line
    pending_category = None

    for raw_line in response.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        number_text, dot, label = line.partition(".")
        if dot and number_text.isdecimal():
            number = int(number_text)
            label = label.strip()
            pending_slot = None
            pending_category = None
            # Numbers outside 1..message_count cannot refer to this batch.
            if 1 <= number <= message_count:
                if label and label.lower() != "uncategorized":
                    pending_slot = number - 1
                    pending_category = label
                else:
                    parsed[number - 1] = uncategorized
            continue

        if pending_slot is not None and line.startswith(("•", "-")):
            parsed[pending_slot] = (pending_category, line.lstrip("•-").strip())

        # Only the line directly after a category line may be its subcategory.
        pending_slot = None
        pending_category = None

    return parsed


def process_messages_in_batches(client, df, categories_file, initial_batch_size=50, overlap=20, min_batch_size=5):
    """
    Categorize the "Chat Content" column of `df` with GPT, batch by batch.

    For each batch a prompt is built from the current categories, sent through
    process_batch, and the reply is parsed into (category, subcategory) pairs
    keyed by the message number the model printed.

    Failure handling: process_batch reports every failure (including prompts
    that exceed the model's token limit) as the string "Error", so a failed
    batch is retried with half the batch size until `min_batch_size` is
    reached. If it still fails, the messages of that batch stay
    "Uncategorized" and processing moves on. The batch size is reset to
    `initial_batch_size` after each batch is finished.

    When the reply introduces a new main category, the categories file is
    saved and the same batch is re-run with the updated categories.

    Parameters:
        client: Object passed through to process_batch.
        df (pandas.DataFrame): Must contain a "Chat Content" column. Non-string
            cells are sent as empty strings. "Category" and "Subcategory"
            columns are written onto this DataFrame.
        categories_file (str): Path of the categories JSON file.
        initial_batch_size (int): Number of messages per batch (>= 1).
        overlap (int): Currently unused; kept so existing callers keep working.
        min_batch_size (int): Smallest batch size tried after failures
            (values below 1 are treated as 1).

    Returns:
        tuple: (df, categories) - the DataFrame with the two result columns
        and the in-memory category mapping after all updates.

    Raises:
        ValueError: If `initial_batch_size` is smaller than 1 (the loop could
        never advance).
    """
    if initial_batch_size < 1:
        raise ValueError("initial_batch_size must be at least 1.")
    min_batch_size = max(1, min_batch_size)

    categories = load_categories_from_file(categories_file)

    uncategorized = ("Uncategorized", "Uncategorized")
    results = [uncategorized] * len(df)
    contents = df["Chat Content"]
    batch_size = initial_batch_size

    current_index = 0
    while current_index < len(df):
        batch_end = min(current_index + batch_size, len(df))
        batch_messages = [
            str(msg) if isinstance(msg, str) else ""
            for msg in contents.iloc[current_index:batch_end]
        ]

        prompt = build_prompt(categories, batch_messages)
        response = process_batch(client, prompt)
        print("Raw GPT Response:\n", response)  # Print raw response for debugging

        if response == "Error":
            if batch_size > min_batch_size:
                # Halve the batch (never below min_batch_size) and retry the same range.
                batch_size = max(batch_size // 2, min_batch_size)
                print(f"Batch {current_index} to {batch_end} failed. Retrying with batch size {batch_size}.")
                continue

            # Already at the minimum size: give up on these messages.
            print(f"Error processing batch {current_index} to {batch_end}")
            print(f"Cannot reduce batch size further for batch {current_index} to {batch_end}. Skipping problematic message(s).")
            results[current_index:batch_end] = [uncategorized] * (batch_end - current_index)
            current_index = batch_end
            batch_size = initial_batch_size
            continue

        # A new main category changes the prompt, so re-run the same batch with it.
        if update_categories_from_response(response, categories):
            save_categories_to_file(categories, categories_file)
            print("New category found and added. Restarting from the same batch with updated categories.")
            continue

        results[current_index:batch_end] = _parse_batch_response(response, len(batch_messages))

        # Move to the next batch segment
        current_index = batch_end
        batch_size = initial_batch_size  # Reset batch size after successful processing

    df["Category"] = [r[0] for r in results]
    df["Subcategory"] = [r[1] for r in results]

    print("Final DataFrame with Categories and Subcategories:\n", df[["Category", "Subcategory"]].head(10))
    # df.to_csv("./categorized_messages.csv", index=False)
    # print("Updated CSV saved to ./categorized_messages.csv")

    return df, categories

## 7. Filtering Data

In [26]:
import re

def preprocess_messages_with_usernames(df):
    """
    Drop unwanted chat rows and add a username-prefixed 'Processed Content' column.

    The input DataFrame must have 'Who' and 'Chat Content' columns. Both are
    passed through a UTF-8 encode/decode round trip on the input frame itself
    (non-string chat content becomes ""), and a 'CharCount' column is added to
    it. Rows matching any exclusion rule below are removed, and the remaining
    rows are returned as a new DataFrame with 'Processed Content' built by
    preprocess_message.

    Raises:
        ValueError: If either required column is missing.
    """
    missing = [name for name in ("Who", "Chat Content") if name not in df.columns]
    if missing:
        raise ValueError("DataFrame must contain 'Who' and 'Chat Content' columns.")

    def _as_utf8(value):
        return value.encode("utf-8").decode("utf-8")

    # UTF-8 round trip; non-string senders are kept as-is, non-string content becomes ""
    df['Who'] = df['Who'].apply(lambda who: _as_utf8(who) if isinstance(who, str) else who)
    df['Chat Content'] = df['Chat Content'].apply(lambda body: _as_utf8(body) if isinstance(body, str) else "")

    # Phrases whose presence excludes a message
    spam_markers = [
        '首充入', '秒到帳', '每筆送', '獎金高達', '報名參加', '報名:', '報名：', '，報名', ', 報名', '優惠', '日期：', '時間：', '地點：',
        '加入TG', '全文：', '報導', '當年今日', '現正招募', '專訪', '拉群', '点我', '有意請', '立即申請：', '關注我們', '尋失物', 'LIHKG',
        'lih.kg', 'play.google.com', 'Find out more', '得獎內容', '問卷連結', '公告：'
    ]
    spam_pattern = '|'.join(map(re.escape, spam_markers))

    content = df['Chat Content']
    trimmed = content.str.strip()

    # Character count is stored on the frame and reused by the length-based rules
    df['CharCount'] = content.str.len()
    char_count = df['CharCount']

    # Chinese characters range: \u4e00-\u9fff
    lacks_chinese = ~content.str.contains(r'[\u4e00-\u9fff]', na=False)

    mentions_both_names = (
        content.str.contains('instagram', case=False, na=False)
        & content.str.contains('facebook', case=False, na=False)
    )
    links_to_both_sites = (
        content.str.contains(r'instagram\.com', case=False, na=False)
        & content.str.contains(r'facebook\.com', case=False, na=False)
    )

    exclusion_rules = [
        # empty after trimming
        trimmed == '',
        # missing value or the literal text "NaN"
        content.isna() | (trimmed.str.upper() == 'NAN'),
        # nothing but a single link
        trimmed.str.match(r'^(https?://\S+|www\.\S+)$', na=False),
        # nothing but emoji
        content.str.match(r'^[\U0001F300-\U0001F6FF]+$', na=False),
        # emoji followed by a link (e.g., "🙂 https://example.com")
        content.str.match(r'^[\U0001F300-\U0001F6FF]+\s+https?://\S+$', na=False),
        # sender name contains "bot"
        df['Who'].str.contains('bot', case=False, na=False),
        # contains a blocked phrase
        content.str.contains(spam_pattern, case=False, na=False),
        # at least 2 hashtags and more than 80 characters
        (content.str.count('#') >= 2) & (char_count > 80),
        # mentions or links to both Instagram and Facebook
        mentions_both_names | links_to_both_sites,
        # longer than 700 characters with no Chinese characters
        (char_count > 700) & lacks_chinese,
        # a single character that is not Chinese
        (char_count == 1) & lacks_chinese,
    ]

    # A row is dropped as soon as any rule matches
    mask = exclusion_rules[0]
    for rule in exclusion_rules[1:]:
        mask = mask | rule

    kept = df[~mask].copy()

    kept["Processed Content"] = kept.apply(
        lambda row: preprocess_message(row["Who"], row["Chat Content"]),
        axis=1
    )

    return kept


def preprocess_message(user, message):
    """
    Format a single message as "<user>: <message>".

    Colons already present in the message are replaced with " |" so the colon
    after the username stays the only one. Non-string messages are treated as
    empty text. When the username is missing (None, empty, or a float NaN such
    as pandas uses for empty cells), only the message text is returned.

    Parameters:
        user: Username of the sender; may be missing.
        message: Message text.

    Returns:
        str: The formatted message.
    """
    if not isinstance(message, str):
        message = ""

    # UTF-8 round trip, as applied elsewhere in this file
    message = message.encode("utf-8").decode("utf-8")

    # Replace existing colons to avoid ambiguity
    message = message.replace(":", " |")

    # NaN is truthy, so it must be checked explicitly; NaN is the only float
    # that is not equal to itself.
    user_is_nan = isinstance(user, float) and user != user
    if user is None or user_is_nan or not user:
        return message

    return f'{user}: {message}'

## 8. Run the Script

In [10]:
def main():
    """
    Run the categorization pipeline end to end.

    Reads the messages CSV, filters and preprocesses it, categorizes the first
    300 remaining rows with GPT in batches, and writes the result to the
    output CSV.
    """
    # File paths
    categories_file = "./categories.json"
    messages_file = "./filtered_messages_nov.csv"
    output_file = "./categorized_messages.csv"

    # Make sure the categories file exists (it is created with defaults if missing)
    load_categories_from_file(categories_file)

    # Load messages
    df = pd.read_csv(messages_file)

    # Preprocess messages: Add usernames and normalize UTF-8
    df = preprocess_messages_with_usernames(df)

    # Filter rows for processing; copy so the columns added below go onto an
    # independent frame instead of a slice of the preprocessed one
    df = df.iloc[0:300].copy()

    ## TODO: Add filtering message here
    ## TODO: Add assign random name to empty user name, and one name if the empty username is continuous

    # Ensure DataFrame has Category and Subcategory columns
    if "Category" not in df.columns:
        df["Category"] = ""
    if "Subcategory" not in df.columns:
        df["Subcategory"] = ""

    client = openai  # Replace with your OpenAI client

    # Process messages in batches. The keyword must be `initial_batch_size`:
    # process_messages_in_batches has no `batch_size` parameter.
    df, categories = process_messages_in_batches(
        client=client,
        df=df,
        categories_file=categories_file,
        initial_batch_size=500, ## TODO: see if it's working for testing the auto handling max token bug
        overlap=5
    )

    # Save updated DataFrame to a CSV file
    df.to_csv(output_file, index=False)
    print(f"\nUpdated CSV saved to {output_file}")

    ## TODO: Add the other features into this main
    ## 1. categorized again 2. sub summary 3. merge summery and the final summary txt 4. simple summary txt 5. middle summary txt

In [11]:
# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Raw GPT Response:
 1. Uncategorized
2. Uncategorized
3. Uncategorized
4. Legal and Bureaucratic Challenges
    • Accessing Legal Aid or Advocacy Services
5. Economic and Financial Stability
    • Adapting to Market Fluctuations and Economic Changes
6. Uncategorized
7. Uncategorized
8. Uncategorized
9. Uncategorized
10. Employment and Economic Opportunities
    • Job Market Competition
11. Uncategorized
12. Uncategorized
13. Uncategorized
14. Uncategorized
15. Uncategorized
16. Uncategorized
17. Uncategorized
18. Uncategorized
19. Uncategorized
20. Uncategorized
21. Uncategorized
22. Uncategorized
23. Housing and Living Arrangements
    • Finding Affordable and Safe Accommodation
24. Uncategorized
25. Uncategorized
26. Uncategorized
27. Social and Cultural Adjustment
    • Celebrating and Preserving Cultural Traditions
28. Uncategorized
29. Uncategorized
30. Uncategorized
Raw GPT Response:
 4. Legal and Bureaucratic Challenges
    • Understanding Tax Obligations
14. Employment and Econo

## Categorized Message Filtering Again

In [35]:
import pandas as pd

# Input/output locations for the second (refinement) categorization pass.
csv_file_path = './categorized_messages.csv'
output_file = './refined_categorized_messages.csv'
categories_file = "./categories.json"

# Load the CSV into a pandas DataFrame
df_categorized = pd.read_csv(csv_file_path)

# Ensure the 'Processed Content' column exists before doing any further work.
# The message names the column exactly as it is looked up (case-sensitive).
if 'Processed Content' not in df_categorized.columns:
    raise ValueError("The CSV does not contain a 'Processed Content' column.")

# Build the filter once so the printed count and the rows actually kept agree:
# a row is dropped when its category is "uncategorized", ignoring case and
# surrounding whitespace.
is_categorized = df_categorized['Category'].str.strip().str.lower() != 'uncategorized'
print(int(is_categorized.sum()))

df_categorized = df_categorized[is_categorized]

# Collect every remaining 'Processed Content' value as a list of strings
# (one entry per message; nothing is concatenated).
combined_content = df_categorized['Processed Content'].astype(str).tolist()

82


In [38]:
# The imported `openai` module object itself is passed to the helpers as the client.
client = openai

# NOTE(review): this value is overwritten by the tuple unpacked below. The call is
# still meaningful because load_categories_from_file (re)creates the JSON file when
# it is missing or contains invalid JSON.
categories = load_categories_from_file(categories_file)

# Process messages in batches
# Re-run categorization on the already-filtered rows, 30 messages per batch.
# NOTE(review): `overlap=5` presumably repeats messages between adjacent batches;
# confirm against process_messages_in_batches.
df_categorized, categories = process_messages_in_batches(
    client=client,
    df=df_categorized,
    categories_file=categories_file,
    batch_size=30,
    overlap=5
)

# Save updated DataFrame to a CSV file
df_categorized.to_csv(output_file, index=False)

Loading categories...
Categories loaded: {'Housing and Living Arrangements': ['Rising Rent and Housing Costs', 'Finding Affordable and Safe Accommodation', 'Shared Housing and Roommate Dynamics', 'Adjusting to Different Housing Standards', 'Living with Extended Family or Parents'], 'Employment and Economic Opportunities': ['Job Market Competition', 'Skill and Credential Recognition', 'Overqualification and Career Downgrades', 'Workplace Culture and Integration', 'Access to Professional Networking Opportunities', 'Balancing Work with Family Responsibilities'], 'Healthcare and Well-Being': ['Understanding Local Healthcare Systems', 'Access to Primary and Emergency Care', 'Mental Health Resources Availability', 'Financial Barriers to Healthcare', 'Navigating Health Insurance Systems', 'Communicating Effectively with Healthcare Providers'], 'Social and Cultural Adjustment': ['Building Community and Social Connections', 'Adapting to New Social Norms and Etiquette', 'Overcoming General Langu

## Building Summary

In [70]:
def process_summary_in_batches(client, df, categories_file, batch_size=50):
    """
    Summarize the categorized messages in consecutive, non-overlapping batches.

    Rows whose 'Category' equals "uncategorized" (ignoring case and surrounding
    whitespace) are dropped. The remaining 'Processed Content' values are split
    into chunks of ``batch_size`` and each chunk is sent to ``process_batch``
    together with the list of known categories.

    Args:
        client: Client object forwarded unchanged to ``process_batch``.
        df: DataFrame with 'Category' and 'Processed Content' columns.
        categories_file: Path handed to ``load_categories_from_file``.
        batch_size: Number of messages per prompt; must be positive.

    Returns:
        A list with one dict per batch, holding 'batch_start_index',
        'batch_end_index' (inclusive) and 'summary'. The indices are positions
        in the filtered message list, not labels of ``df``.

    Raises:
        ValueError: If ``batch_size`` is not positive (the batching loop would
            otherwise never advance).
    """
    # Validate first: a zero or negative step can never reach the end of the list.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer.")

    # Load categories
    categories = load_categories_from_file(categories_file)

    # Filter out uncategorized rows and collect the message texts
    categorized_rows = df[df['Category'].str.strip().str.lower() != 'uncategorized'].copy()
    all_contents = categorized_rows['Processed Content'].astype(str).tolist()

    def build_summary_prompt(categories, messages):
        """Assemble the instruction text, the category tree and the numbered messages."""
        header = """
        
        The following is a collection of messages and their associated categories. Please analyze the content and provide a detailed generalization.
        
        Your task is to:
        1. Identify the main points and concerns expressed in the messages.
        2. Not all the messages are related to living problems; focus only on relevant ones.
        3. Provide a balanced overview of the categories and subcategories, ensuring no critical insights are missed, even if mentioned only once.
        4. Summarize the key topics or themes across the messages, focusing on their meaning and relevance rather than repetition.
        5. Group related issues into broader themes or perspectives to create a cohesive analysis.
        
        Here are the categories for living problems:
        
        """

        # Collect the pieces and join once instead of growing a string with +=.
        pieces = [header]

        # Add categories and subcategories
        for category, subs in categories.items():
            pieces.append(f"{category}\n")
            pieces.extend(f"    • {sub}\n" for sub in subs)

        pieces.append("\nMessages:\n")

        # Number the messages (1-based within the batch) for better separation
        pieces.extend(
            f"{idx}. {message.strip()}\n" for idx, message in enumerate(messages, 1)
        )

        return "".join(pieces)

    summaries = []
    total = len(all_contents)

    # Step through the list in fixed-size windows; the last window may be shorter.
    for start_index in range(0, total, batch_size):
        end_index = min(start_index + batch_size, total)
        current_batch = all_contents[start_index:end_index]

        prompt = build_summary_prompt(categories, current_batch)
        response = process_batch(client, prompt)

        summaries.append({
            "batch_start_index": start_index,
            "batch_end_index": end_index - 1,
            "summary": response
        })

    return summaries

# Example usage:
# summaries = process_summary_in_batches(client, df_categorized, "categories.json", batch_size=50)
# for summary in summaries:
#     print("From message index", summary["batch_start_index"], "to", summary["batch_end_index"])
#     print(summary["summary"])

In [72]:
## TODO: solve the same max token bug in process_summary_in_batches()
# Build one sub-summary per batch of 8 categorized messages; the resulting list
# of dicts is stored in `summaries`.
summaries = process_summary_in_batches(
    client=client,
    df=df_categorized,
    categories_file=categories_file,
    batch_size=8
)

In [74]:
def merge_summaries(client, sub_summaries, max_iterations=10):
    """
    Iteratively merge sub-summaries pairwise until a single summary remains.

    Each round pairs neighbours (0 with 1, 2 with 3, ...), asks the model to
    merge each pair, and carries an unpaired last element over unchanged, so
    the list roughly halves every round ("merge-sort"-like reduction).

    Arguments:
        client: The OpenAI client or similar object, forwarded to ``process_batch``.
        sub_summaries: A list of dicts, each containing 'batch_start_index',
            'batch_end_index' and 'summary'. The list itself is not modified.
        max_iterations: Safety limit on the number of merge rounds.

    Returns:
        The single merged summary string. A one-element input is returned
        as-is without calling the model.

    Raises:
        RuntimeError: If ``sub_summaries`` is empty, or if more than one
            summary is still left after ``max_iterations`` rounds.
    """

    def build_merge_prompt(summary_a, summary_b, index_range_a, index_range_b):
        """Return the merge instruction with both sub-summaries and their index ranges."""
        return f"""
    Your task is to merge and refine the two sub-summaries into one comprehensive summary. Use the information provided in each sub-summary, including any indications of how many messages mention each theme, to create a cohesive, combined overview.
    
    Please:
    1. Identify all key themes, issues, categories, and subcategories presented in both sub-summaries.
    2. For each theme, combine the frequency counts or message references from Sub-summary A and Sub-summary B. If Sub-summary A indicates a theme appears in X messages and Sub-summary B says Y, your merged summary should reflect the total combined frequency (X+Y, or an estimate if exact counts aren't clearly stated). Don't need to mention it's from A or B. just show the number.
    3. Ensure that no critical insights are lost, even if mentioned only once. DO NOT SKIP ANY ONE.
    4. Focus on the meaning and relevance of the themes, not just on categories or repetition.
    5. Present a balanced and cohesive final summary, clearly grouping related issues into broader thematic areas.
    6. For each sub-category, include the more detail of the concerns that exemplify them.”
    7. In the response, don't need to mention like "after merging two summaries". just show "Summary: " and the concerns in different categories.
    
    Sub-summary A (Messages {index_range_a[0]} to {index_range_a[1]}):
    {summary_a}
    
    Sub-summary B (Messages {index_range_b[0]} to {index_range_b[1]}):
    {summary_b}
    """

    # Work on a shallow copy so the caller's list is never rebound or altered.
    pending = list(sub_summaries)

    # Nothing to merge: report that directly instead of the misleading
    # "did not finish within max_iterations" failure.
    if not pending:
        raise RuntimeError("No sub-summaries were provided to merge.")

    rounds_done = 0
    while len(pending) > 1 and rounds_done < max_iterations:
        merged_round = []

        # Walk the list two entries at a time.
        for left_pos in range(0, len(pending), 2):
            pair = pending[left_pos:left_pos + 2]

            if len(pair) == 1:
                # Odd one out, carry it over to the next round untouched.
                merged_round.append(pair[0])
                continue

            left, right = pair
            prompt = build_merge_prompt(
                left['summary'],
                right['summary'],
                (left['batch_start_index'], left['batch_end_index']),
                (right['batch_start_index'], right['batch_end_index'])
            )
            merged_response = process_batch(client, prompt)

            # The merged entry spans from the left start to the right end.
            merged_round.append({
                "batch_start_index": left['batch_start_index'],
                "batch_end_index": right['batch_end_index'],
                "summary": merged_response
            })

        pending = merged_round
        rounds_done += 1

    # More than one entry left means the round limit was hit first.
    if len(pending) != 1:
        raise RuntimeError("Merging did not finish properly within max_iterations.")

    return pending[0]['summary']

# Example usage:
# final_summary = merge_summaries(client, summaries)
# print(final_summary)

In [75]:
## TODO: solve the same max token bug in merge_summaries()

# Reduce the per-batch sub-summaries to one summary string, allowing at most
# 10 pairwise merge rounds.
final_summary = merge_summaries(
    client=client,
    sub_summaries = summaries,
    max_iterations = 10
)

In [76]:
# Bare expression: the notebook displays the merged summary string as cell output.
final_summary

"Summary:\n\n1. Housing and Living Arrangements: A prevalent subject across multiple discussions revolving around the challenges of finding affordable housing, potential issues with ventilation in homes, property ownership, and rental income. Messages indicate an active exploration of housing investments, understanding local tax systems, and adjusting to different housing standards, including problems caused by internal air cycling systems in kitchens without windows.\n\n2. Employment and Economic Opportunities: Recurring themes include job competition, personal skill recognition, and career building, illustrating a keen observance of economic opportunities and shifting job market trends.\n\n3. Healthcare and Well-Being: No topics directly related to this category were found.\n\n4. Social and Cultural Adjustment: Identified issues emphasize the need for social integration, community building, diminishment of loneliness, and fostering social connections. An additional focus is placed on

In [114]:
def build_simple_summary_prompt(message):
    """
    Return the short-summary instruction followed directly by the report text.

    `message` must be a string; it is concatenated onto the instruction
    without any separator beyond the instruction's own trailing whitespace.
    """
    instruction = (
        "“Summarize the following detailed report into 2-3 sentences, "
        "focusing only on the most critical themes without going into "
        "specific examples or details"
    )
    # Same layout as the original literal: a blank indented line before and
    # after the instruction, each line indented by four spaces.
    header = "\n    \n    " + instruction + "\n    \n    "
    return header + message

In [86]:
def build_and_process_summary(client, message, summary_type="simple"):
    """
    Condense a detailed report with the model at one of two levels of detail.

    Args:
        client: Client handle passed through to process_batch.
        message: The detailed report; appended after the instruction text.
        summary_type: "simple" asks for a 2-sentence summary, "middle" for a
            5-7 sentence summary.

    Returns:
        Whatever process_batch returns for the assembled prompt.

    Raises:
        ValueError: If summary_type is neither "simple" nor "middle".
    """
    # Guard clause: reject unknown summary types before building anything.
    if summary_type not in ("simple", "middle"):
        raise ValueError("Invalid summary_type. Choose 'simple' or 'middle'.")

    simple_instruction = (
        "\n        Summarize the following detailed report into 2 sentences, "
        "focusing only on the most critical themes. Highlight the major "
        "challenges and topics without going into secondary details or specifics."
        "\n        "
    )
    middle_instruction = (
        "\n        Summarize the following detailed report into 5-7 sentences, "
        "capturing the main themes and challenges in a balanced way and mention "
        "how they say it in the original text if it's imprtant. Group related "
        "issues where possible, and provide a brief insight into the topics "
        "discussed without listing all details."
        "\n        "
    )

    instruction = simple_instruction if summary_type == "simple" else middle_instruction

    # Instruction first, then a blank line, then the report itself.
    full_prompt = f"{instruction}\n\n{message}"
    return process_batch(client, full_prompt)


In [87]:
# Produce and print the short summary of the merged report.
summary_type = "simple"  # Change to "middle" for a mid-level summary
summary_response = build_and_process_summary(client, final_summary, summary_type)
print(summary_response)

The main challenges discussed in the report focus on affordable housing, job competition, social and cultural integration, legal and bureaucratic complexities, high cost of education, comprehension of UK tax systems and financial management, family responsibilities, loneliness, economic stability, and personal grooming. There's a lack of discussions around healthcare and public safety, while huge concerns arise about tax implications related to familial responsibilities, the impact of change in interest rates on financial planning and the need to maintain cultural identity in the midst of adjustment.


In [88]:
# Produce and print the mid-level (5-7 sentence) summary of the merged report.
summary_type = "middle"  # Change to "simple" for the short two-sentence summary
summary_response = build_and_process_summary(client, final_summary, summary_type)
print(summary_response)

The report captures wide-ranging issues among the UK's Hong Kong community, highlighting challenges in affordable housing, property ownership, and ventilation problems in homes, while an active interest in housing investments and understanding local tax systems is observed. The job market competitiveness and skill recognition underline the community's focus on employment and economic opportunities. Cultural adjustment and community building emerge as key themes, with emphasis on cultural and language preservation, elder care, and social integration. The report also underscores legal and bureaucratic complexities faced by newcomers, such as immigration, residency applications, and BNO visa renewals. High tuition costs for international students and language preservation are identified as key concerns in education. Transportation focuses on compliance with traffic regulations and car registration, suggesting interest in diverse mobility options. The report points out financial difficulti