# new approaches

In [18]:
import os
import json
import pandas as pd
import openai

In [19]:
# Initialize the OpenAI client


## 1. Initialize or Load Categories

In [20]:
def load_categories_from_file(file_path):
    """
    Load categories and subcategories from a JSON file.
    If the file does not exist or contains invalid JSON, initialize with default categories.
    """
    if not os.path.exists(file_path):
        return initialize_empty_file(file_path)

    with open(file_path, "r", encoding="utf-8") as file:
        try:
            return json.load(file)
        except json.JSONDecodeError:
            print(f"Invalid JSON detected in {file_path}. Reinitializing the file.")
            return initialize_empty_file(file_path)


def initialize_empty_file(file_path):
    """
    Initialize a JSON file with a default dictionary and save it.
    """
    categories = categories = {
      "Housing and Living Arrangements": [
        "Rising Rent and Housing Costs",
        "Finding Affordable and Safe Accommodation",
        "Shared Housing and Roommate Dynamics",
        "Adjusting to Different Housing Standards",
        "Living with Extended Family or Parents"
      ],
      "Employment and Economic Opportunities": [
        "Job Market Competition",
        "Skill and Credential Recognition",
        "Overqualification and Career Downgrades",
        "Workplace Culture and Integration",
        "Access to Professional Networking Opportunities",
        "Balancing Work with Family Responsibilities"
      ],
      "Healthcare and Well-Being": [
        "Understanding Local Healthcare Systems",
        "Access to Primary and Emergency Care",
        "Mental Health Resources Availability",
        "Financial Barriers to Healthcare",
        "Navigating Health Insurance Systems",
        "Communicating Effectively with Healthcare Providers"
      ],
      "Social and Cultural Adjustment": [
        "Building Community and Social Connections",
        "Adapting to New Social Norms and Etiquette",
        "Overcoming General Language Barriers in Social Contexts",
        "Facing and Addressing Discrimination",
        "Parenting Challenges in a New Cultural Environment",
        "Celebrating and Preserving Cultural Traditions"
      ],
      "Legal and Bureaucratic Challenges": [
        "Navigating Immigration and Residency Requirements",
        "Understanding Tax Obligations",
        "Securing Visas and Work Permits",
        "Accessing Legal Aid or Advocacy Services",
        "Filing Necessary Documentation for Families",
        "Understanding Local Laws and Regulations"
      ],
      "Education and Personal Development": [
        "Accessing Education for Children and Adults",
        "Recognition of Previous Educational Credentials",
        "Enrolling in Language and Integration Programs",
        "Financial Barriers to Education and Training",
        "Exploring New Career or Academic Opportunities"
      ],
      "Transportation and Mobility": [
        "Navigating Public Transportation Systems",
        "Obtaining Driver’s Licenses or Vehicle Registration",
        "Cost and Accessibility of Transportation",
        "Challenges in Rural or Suburban Mobility",
        "Adjusting to New Traffic Rules and Regulations",
        "Learning to Drive in a New Environment"
      ],
      "Financial and Budgeting Challenges": [
        "Setting Up Bank Accounts and Building Credit",
        "Understanding Local Taxes and Financial Systems",
        "Managing Cost of Living in High-Expense Areas",
        "Sending Money Abroad to Family",
        "Planning and Budgeting for Financial Security"
      ],
      "Family Dynamics and Support": [
        "Adjusting to Changing Family Roles",
        "Reuniting with Family Across Borders",
        "Supporting Children’s Educational and Social Needs",
        "Caring for Aging Parents Remotely",
        "Managing Relationships in Cross-Cultural Marriages",
        "Strengthening Family Bonds in a New Environment"
      ],
      "Identity and Emotional Well-Being": [
        "Coping with Culture Shock and Loneliness",
        "Balancing Old and New Cultural Identities",
        "Addressing Feelings of Isolation or Marginalization",
        "Finding Support Networks for Emotional Health",
        "Building a Sense of Belonging in the New Country",
        "Overcoming Trauma and Resilience Building"
      ],
      "Public Safety and Security Concerns": [
        "Perception of Neighbourhood Safety",
        "Navigating Local Law Enforcement and Emergency Services",
        "Personal Safety Strategies",
        "Anxiety About Personal Safety"
      ],
      "Political Environment and Governance": [
        "Understanding Local Political Parties and Policies",
        "Concerns about Authoritarian or Repressive Tendencies",
        "International Relations and Policy Impact on Immigrants"
      ],
      "Economic and Financial Stability": [
        "Adapting to Market Fluctuations and Economic Changes",
        "Building Resilience Against Economic Uncertainty"
      ]
    }
    save_categories_to_file(categories, file_path)
    print(f"Created a new categories file at: {file_path}")
    return categories


def save_categories_to_file(categories, file_path):
    """
    Save the updated categories and subcategories to a JSON file only if changes are detected.
    """
    # Load existing categories from the file
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as file:
            try:
                existing_categories = json.load(file)
            except json.JSONDecodeError:
                existing_categories = {}
    else:
        existing_categories = {}

    # Check if the categories have changed
    if categories != existing_categories:
        with open(file_path, "w", encoding="utf-8") as file:
            json.dump(categories, file, ensure_ascii=False, indent=4)
        print(f"Updated categories saved to {file_path}")
    else:
        print("No changes detected in categories. Skipping save.")

def parse_consolidation_response(response):
    """
    Parse the GPT response to extract consolidated categories and subcategories.

    Parameters:
        response (str): The GPT response containing the updated categories and subcategories.

    Returns:
        dict: A dictionary of consolidated categories and subcategories.
    """
    consolidated_categories = {}

    # Split the response into lines
    lines = response.split("\n")
    
    current_category = None
    for line in lines:
        line = line.strip()  # Remove extra whitespace
        if not line:
            continue  # Skip empty lines

        # Check for a category line (e.g., "Housing and Rent:")
        if line.endswith(":"):
            current_category = line[:-1].strip()  # Remove the trailing ":"
            consolidated_categories[current_category] = []
        elif current_category:
            # Treat non-category lines as subcategories
            # Example format: "    • Subcategory Name"
            if line.startswith("•"):
                subcategory = line[1:].strip()  # Remove the bullet point
                consolidated_categories[current_category].append(subcategory)

    return consolidated_categories

## 2. Build the Prompt

In [21]:
def build_prompt(categories, messages):
    """
    Build a full prompt including current categories and subcategories
    and the batch of messages to process.
    """
    static_prompt = (
        "You are categorizing chat messages into predefined categories and "
        "subcategories about specific, explicitly stated living problems or challenges "
        "faced by Hong Kong people living in England.\n\n"
        "Here are the current categories and subcategories:\n"
    )

    # Add categories and subcategories
    for category, subs in categories.items():
        static_prompt += f"{category}\n"
        for sub in subs:
            static_prompt += f"    • {sub}\n"

    # Add the messages to categorize
    static_prompt += "\nCurrent Batch of Messages:\n"
    for i, message in enumerate(messages, start=1):
        static_prompt += f"{i}. {message}\n"

    # Revised instructions
    static_prompt += """
    Important Instructions:
    1. Only categorize a message if it explicitly states a difficulty, hardship, or challenge related to living in England as a Hong Kong person.
    
       - For example: "I cannot afford...", "I am struggling to...", "I have difficulty...", "I face a barrier...", or any clear complaint about a problem.
       - Just mentioning a topic (like a driving test, theory test, housing, or healthcare) is NOT enough. Must explicitly describe a problem.
    
    2. If the message:
       - is just sharing an event or promotional activity without mentioning any difficulty,
       - is just a link (like a YouTube link),
       - mentions political activity, theory tests, driving tests, education, or any other topic without explicitly stating a personal difficulty,
       - is general discussion or random content without stating a personal challenge,
    
       then it MUST be categorized as "Uncategorized."
    
    3. DO NOT GUESS a problem. If not clear, choose "Uncategorized."
    
    4. If the message describes a specific difficulty, hardship, or challenge for living in England that is not covered by any of the existing main categories or their subcategories, you must introduce a new category or subcategory:
    
       - If the difficulty logically fits within an existing main category but none of its current subcategories capture this new aspect, add a new subcategory under that existing main category. This new subcategory should clearly describe the specific difficulty mentioned in the message.
       
       - If the difficulty does not fit under any existing main category at all, create a completely new main category and a relevant first subcategory. Both the main category and subcategory names must clearly reflect the nature of the newly mentioned difficulty.
    
    Important:
    - Do not reuse, copy, or refer to any categories or subcategories given as examples in these instructions. They are placeholders only.  
    - Each time you create a new category or subcategory, invent a unique and contextually appropriate name that matches the difficulty described in the message.  
    - The new category and/or subcategory must be directly related to the difficulty stated. If the message talks about a type of difficulty not previously covered, think of a descriptive name that conveys that exact challenge.
    - After introducing a new category or subcategory, do not continue categorizing further messages in this batch. Stop immediately and return only the newly created category and subcategory.
    
    For clarity:
    - If a message states a difficulty and it clearly doesn't match any existing categories or subcategories, you must come up with a new main category name and a new subcategory name that accurately describe this difficulty.  
    - If a message states a difficulty that fits an existing category but needs a more specific angle not listed, add a new subcategory to that existing main category that directly addresses the difficulty mentioned.
    
    Do not guess or approximate. If the difficulty is new, create the category or subcategory right away. Do not return 'Uncategorized' when a difficulty is explicitly described and not covered by existing categories. Instead, produce a new category and/or subcategory as required.
    5. No commentary or extra text outside the specified format.
    
    **Format:**
    <message_number>. <Main Category>
        • <Subcategory>
    
    OR if unrelated/unclassifiable:
    <message_number>. Uncategorized
    
    **EXAMPLES:**
    
    - Unrelated event (no stated difficulty):
      Message: "CLS Hair Studio offering free haircuts on November 12."
      Response:
      1. Uncategorized
    
    - Just a link:
      Message: "https://youtu.be/n4Flcllu9WA"
      Response:
      1. Uncategorized
    
    - Political or asylum mention without stated difficulty:
      Message: "Hong Kong person got asylum in 2 years."
      Response:
      1. Uncategorized
    
    - Mentioning theory/driving test without difficulty:
      Message: "I took a theory test yesterday."
      Response:
      1. Uncategorized
    
    - Relevant (explicitly stating a difficulty):
      Message: "I am struggling to find affordable housing as a Hong Kong migrant in England."
      Response:
      1. Housing and Living Arrangements
          • Finding Affordable and Safe Accommodation
    
    If the message does not explicitly say it is having trouble, difficulty, or a challenge, always choose 'Uncategorized'.
    No guessing.
    If a new category is needed, introduce it and stop.
    """

    return static_prompt

## 3. Process a Batch

In [22]:
def process_batch(client, prompt):
    """
    Process a batch of messages using GPT.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ]
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error processing batch: {e}")
        return "Error"

## 4. Update Categories

In [23]:
def update_categories_from_response(response, categories):
    """
    Update categories and subcategories based on GPT's response.
    Return True if a new main category was introduced, False otherwise.
    """
    lines = response.split("\n")
    new_category_introduced = False
    current_main_category = None

    for line in lines:
        stripped_line = line.strip()
        if stripped_line and stripped_line[0].isdigit() and "." in stripped_line:
            # Category line format: "1. Main Category"
            parts = stripped_line.split(".", 1)
            cat_text = parts[1].strip()
            if cat_text.lower() != "uncategorized":
                current_main_category = cat_text
            else:
                current_main_category = None

        elif stripped_line.startswith(("•", "-")) and current_main_category:
            subcategory = stripped_line.lstrip("•-").strip()
            if subcategory.lower() == "uncategorized":
                continue

            # If current_main_category is new
            if current_main_category not in categories:
                categories[current_main_category] = []
                new_category_introduced = True

            if subcategory not in categories[current_main_category]:
                categories[current_main_category].append(subcategory)

    return new_category_introduced

## 5. Messages Batch Processing

In [30]:
# def process_messages_in_batches(client, df, categories_file, batch_size=50, overlap=20):
#     categories = load_categories_from_file(categories_file)

#     results = [("Uncategorized", "Uncategorized")] * len(df)

#     current_index = 0
#     while current_index < len(df):
#         end = min(current_index + batch_size, len(df))
#         batch_messages = df["Chat Content"].iloc[current_index:end].tolist()
#         batch_messages = [str(msg) if isinstance(msg, str) else "" for msg in batch_messages]

#         prompt = build_prompt(categories, batch_messages)
#         response = process_batch(client, prompt)
#         print("Raw GPT Response:\n", response)  # Print raw response for debugging

#         if response == "Error":
#             print(f"Error processing batch {current_index} to {end}")
#             current_index = end
#             continue

#         # Check if new category introduced
#         new_category_added = update_categories_from_response(response, categories)
#         if new_category_added:
#             save_categories_to_file(categories, categories_file)
#             print("New category found and added. Restarting from the same batch with updated categories.")

#             # Clear results for this batch since we are reprocessing
#             for i in range(current_index, end):
#                 results[i] = ("Uncategorized", "Uncategorized")

#             # Re-run same batch with updated categories
#             continue

#         # Parse the response lines into results
#         response_lines = response.split("\n")
#         batch_index = current_index
#         expecting_subcategory = False
#         current_category = None

#         for line in response_lines:
#             line_stripped = line.strip()
#             if not line_stripped:
#                 continue

#             if line_stripped[0].isdigit() and "." in line_stripped:
#                 parts = line_stripped.split(".", 1)
#                 cat_text = parts[1].strip()
#                 if cat_text.lower() == "uncategorized":
#                     if batch_index < len(results):
#                         results[batch_index] = ("Uncategorized", "Uncategorized")
#                         batch_index += 1
#                     expecting_subcategory = False
#                 else:
#                     current_category = cat_text
#                     expecting_subcategory = True
#                 continue

#             if expecting_subcategory and line_stripped.startswith(("•", "-")):
#                 clean_line = line_stripped.lstrip("•-").strip()
#                 subcategory = clean_line
#                 if batch_index < len(results):
#                     results[batch_index] = (current_category, subcategory)
#                     batch_index += 1
#                 expecting_subcategory = False
#                 continue

#             # If format not followed, uncategorized
#             if batch_index < len(results):
#                 results[batch_index] = ("Uncategorized", "Uncategorized")
#                 batch_index += 1
#             expecting_subcategory = False

#         current_index = end

#     df["Category"] = [r[0] for r in results]
#     df["Subcategory"] = [r[1] for r in results]

#     print("Final DataFrame with Categories and Subcategories:\n", df[["Category", "Subcategory"]].head(10))
#     # df.to_csv("./categorized_messages.csv", index=False)
#     # print("Updated CSV saved to ./categorized_messages.csv")

#     return df, categories

In [34]:
def process_messages_in_batches(client, df, categories_file, batch_size=50, overlap=20):
    print("Loading categories...")
    categories = load_categories_from_file(categories_file)
    print("Categories loaded:", categories)

    results = [("Uncategorized", "Uncategorized")] * len(df)

    current_index = 0
    while current_index < len(df):
        print(f"\nProcessing batch: {current_index} to {min(current_index + batch_size, len(df))}")
        end = min(current_index + batch_size, len(df))
        batch_messages = df["Chat Content"].iloc[current_index:end].tolist()
        batch_messages = [str(msg) if isinstance(msg, str) else "" for msg in batch_messages]
        print("Batch messages:", batch_messages[:3], "...")  # Preview a few messages for debugging

        prompt = build_prompt(categories, batch_messages)
        print("Generated prompt:\n", prompt)  # Print the first 500 characters of the prompt

        response = process_batch(client, prompt)
        print("Raw GPT Response:\n", response)  # Print raw response for debugging

        if response == "Error":
            print(f"Error processing batch {current_index} to {end}")
            current_index = end
            continue

        # Check if new category introduced
        print("Updating categories from response...")
        new_category_added = update_categories_from_response(response, categories)
        if new_category_added:
            print("New category found! Saving and restarting batch.")
            save_categories_to_file(categories, categories_file)

            # Clear results for this batch since we are reprocessing
            for i in range(current_index, end):
                results[i] = ("Uncategorized", "Uncategorized")

            # Re-run same batch with updated categories
            continue

        # Parse the response lines into results
        print("Parsing response lines into results...")
        response_lines = response.split("\n")
        batch_index = current_index
        expecting_subcategory = False
        current_category = None

        for line in response_lines:
            line_stripped = line.strip()
            if not line_stripped:
                continue

            if line_stripped[0].isdigit() and "." in line_stripped:
                parts = line_stripped.split(".", 1)
                cat_text = parts[1].strip()
                if cat_text.lower() == "uncategorized":
                    if batch_index < len(results):
                        results[batch_index] = ("Uncategorized", "Uncategorized")
                        batch_index += 1
                    expecting_subcategory = False
                else:
                    current_category = cat_text
                    expecting_subcategory = True
                continue

            if expecting_subcategory and line_stripped.startswith(("•", "-")):
                clean_line = line_stripped.lstrip("•-").strip()
                subcategory = clean_line
                if batch_index < len(results):
                    results[batch_index] = (current_category, subcategory)
                    batch_index += 1
                expecting_subcategory = False
                continue

            # If format not followed, uncategorized
            if batch_index < len(results):
                results[batch_index] = ("Uncategorized", "Uncategorized")
                batch_index += 1
            expecting_subcategory = False

        print(f"Finished processing batch {current_index} to {end}")
        current_index = end

    print("\nAll batches processed. Finalizing DataFrame...")
    df["Category"] = [r[0] for r in results]
    df["Subcategory"] = [r[1] for r in results]

    print("Final DataFrame with Categories and Subcategories:\n", df[["Category", "Subcategory"]].head(10))
    # df.to_csv("./categorized_messages.csv", index=False)
    # print("Updated CSV saved to ./categorized_messages.csv")

    return df, categories

## 6. Summarize Groups

In [25]:
# def summarize_group(client, messages, group_name):
#     """
#     Summarize relevant concerns for a category.
#     """
#     if not messages:
#         return f"No concerns found in {group_name}."

#     messages_text = "\n".join(messages)
#     prompt = f"""
#     Summarize the concerns related to {group_name} based on the following chat messages from individuals in the UK.

#     Include only the struggles and challenges people face in this category. Exclude any irrelevant or unrelated content.

#     Messages:
#     {messages_text}

#     Provide a concise summary of the struggles and challenges faced.
#     """
#     try:
#         response = client.chat.completions.create(
#             model="gpt-4",
#             messages=[
#                 {"role": "system", "content": "You are a helpful assistant."},
#                 {"role": "user", "content": prompt}
#             ]
#         )
#         return response.choices[0].message.content.strip()
#     except Exception as e:
#         print(f"Error summarizing group {group_name}: {e}")
#         return "Error in summarization."

## 7. Filtering Data

In [26]:
import re

def preprocess_messages_with_usernames(df):
    """
    Preprocess messages by adding usernames and normalizing UTF-8 encoding.
    Adds a 'Processed Content' column to the DataFrame after filtering out unwanted rows.
    """

    # Ensure the required columns exist
    if "Who" not in df.columns or "Chat Content" not in df.columns:
        raise ValueError("DataFrame must contain 'Who' and 'Chat Content' columns.")

    # Normalize 'Who' and 'Chat Content' to UTF-8
    df['Who'] = df['Who'].apply(lambda x: x.encode("utf-8").decode("utf-8") if isinstance(x, str) else x)
    df['Chat Content'] = df['Chat Content'].apply(lambda x: x.encode("utf-8").decode("utf-8") if isinstance(x, str) else "")

    # Define the array of blocked phrases
    blocked_phrases = [
        '首充入', '秒到帳', '每筆送', '獎金高達', '報名參加', '報名:', '報名：', '，報名', ', 報名', '優惠', '日期：', '時間：', '地點：', 
        '加入TG', '全文：', '報導', '當年今日', '現正招募', '專訪','拉群', '点我', '有意請', '立即申請：', '關注我們', '尋失物', 'LIHKG', 
        'lih.kg', 'play.google.com', 'Find out more', '得獎內容', '問卷連結', '公告：'
    ]

    # Define the filtering conditions
    cond_empty = df['Chat Content'].str.strip() == ''

    # Check for NaN (both as a missing value and literal string "NaN")
    cond_nan = df['Chat Content'].isna() | (df['Chat Content'].str.strip().str.upper() == 'NAN')

    cond_link_only = df['Chat Content'].str.strip().str.match(r'^(https?://\S+|www\.\S+)$', na=False)

    cond_emoji_only = df['Chat Content'].str.match(r'^[\U0001F300-\U0001F6FF]+$', na=False)

    # Emoji with link only (e.g., "🙂 https://example.com")
    cond_emoji_with_link = df['Chat Content'].str.match(
        r'^[\U0001F300-\U0001F6FF]+\s+https?://\S+$', na=False
    )

    cond_who_contains_bot = df['Who'].str.contains('bot', case=False, na=False)

    cond_blocked_phrases = df['Chat Content'].str.contains('|'.join(map(re.escape, blocked_phrases)), case=False, na=False)

    # Calculate char count
    df['CharCount'] = df['Chat Content'].str.len()

    # At least 2 hashtags and char count more than 80
    cond_two_hash_and_word_count = (
        (df['Chat Content'].str.count('#') >= 2) & (df['CharCount'] > 80)
    )

    # Check for both Instagram and Facebook words
    cond_instagram_and_facebook_words = (
        df['Chat Content'].str.contains('instagram', case=False, na=False) & 
        df['Chat Content'].str.contains('facebook', case=False, na=False)
    )

    # Check for both Instagram and Facebook links
    cond_instagram_and_facebook_links = (
        df['Chat Content'].str.contains(r'instagram\.com', case=False, na=False) & 
        df['Chat Content'].str.contains(r'facebook\.com', case=False, na=False)
    )

    # Combine the Instagram and Facebook conditions
    cond_instagram_and_facebook = cond_instagram_and_facebook_words | cond_instagram_and_facebook_links

    # If char count > 700 and no Chinese characters, filter out
    # Chinese characters range: \u4e00-\u9fff
    cond_long_no_chinese = (df['CharCount'] > 700) & (~df['Chat Content'].str.contains(r'[\u4e00-\u9fff]', na=False))

    # if the message too short
    cond_short_no_chinese = (df['CharCount'] == 1) & (~df['Chat Content'].str.contains(r'[\u4e00-\u9fff]', na=False))

    # Combine all conditions using logical OR
    mask = (
        cond_empty |
        cond_nan |
        cond_link_only |
        cond_emoji_only |
        cond_emoji_with_link |
        cond_who_contains_bot |
        cond_blocked_phrases |
        cond_two_hash_and_word_count |
        cond_instagram_and_facebook |
        cond_long_no_chinese |
        cond_short_no_chinese
    )

    # Filter the DataFrame
    df = df[~mask].copy()

    # Preprocess messages (still create 'Processed Content')
    df["Processed Content"] = df.apply(
        lambda row: preprocess_message(row["Who"], row["Chat Content"]),
        axis=1
    )

    return df


def preprocess_message(user, message):
    """
    Format a single message with the username and normalize UTF-8 encoding.
    Replaces existing colons in the message to avoid ambiguity.
    """
    if not isinstance(message, str):
        message = ""
    
    # Normalize message to UTF-8
    message = message.encode("utf-8").decode("utf-8")
    
    # Replace existing colons to avoid ambiguity
    message = message.replace(":", " |")
    
    # Format the message with the username
    return f'{user}: {message}' if user else message

## 8. Run the Script

In [10]:
def main():
    # File paths
    categories_file = "./categories.json"
    messages_file = "./filtered_messages_nov.csv"
    output_file = "./categorized_messages.csv"

    # Load categories file
    categories = load_categories_from_file(categories_file)

    # Load messages
    df = pd.read_csv(messages_file)

    # Preprocess messages: Add usernames and normalize UTF-8
    df = preprocess_messages_with_usernames(df)

    # Filter rows for processing
    df = df.iloc[0:300]

    # Ensure DataFrame has Category and Subcategory columns
    if "Category" not in df.columns:
        df["Category"] = ""
    if "Subcategory" not in df.columns:
        df["Subcategory"] = ""

    # Perform initial reanalysis of existing categories
    client = openai  # Replace with your OpenAI client
    categories = load_categories_from_file(categories_file)

    # Process messages in batches
    df, categories = process_messages_in_batches(
        client=client,
        df=df,
        categories_file=categories_file,
        batch_size=30,
        overlap=5
    )

    # Save updated DataFrame to a CSV file
    df.to_csv(output_file, index=False)
    print(f"\nUpdated CSV saved to {output_file}")

In [11]:
if __name__ == "__main__":
    main()

Raw GPT Response:
 1. Uncategorized
2. Uncategorized
3. Uncategorized
4. Legal and Bureaucratic Challenges
    • Accessing Legal Aid or Advocacy Services
5. Economic and Financial Stability
    • Adapting to Market Fluctuations and Economic Changes
6. Uncategorized
7. Uncategorized
8. Uncategorized
9. Uncategorized
10. Employment and Economic Opportunities
    • Job Market Competition
11. Uncategorized
12. Uncategorized
13. Uncategorized
14. Uncategorized
15. Uncategorized
16. Uncategorized
17. Uncategorized
18. Uncategorized
19. Uncategorized
20. Uncategorized
21. Uncategorized
22. Uncategorized
23. Housing and Living Arrangements
    • Finding Affordable and Safe Accommodation
24. Uncategorized
25. Uncategorized
26. Uncategorized
27. Social and Cultural Adjustment
    • Celebrating and Preserving Cultural Traditions
28. Uncategorized
29. Uncategorized
30. Uncategorized
Raw GPT Response:
 4. Legal and Bureaucratic Challenges
    • Understanding Tax Obligations
14. Employment and Econo

## Catogorized Message Filtering again

In [35]:
import pandas as pd

# Replace 'your_file.csv' with the path to your CSV file
csv_file_path = './categorized_messages.csv'
output_file = './refined_categorized_messages.csv'
categories_file = "./categories.json"

# Load the CSV into a pandas DataFrame
df_categorized = pd.read_csv(csv_file_path)

print(len(df_categorized[df_categorized['Category'] != 'Uncategorized']))

df_categorized = df_categorized[df_categorized['Category'].str.strip().str.lower() != 'uncategorized']

# Ensure the 'processed content' column exists
if 'Processed Content' not in df_categorized.columns:
    raise ValueError("The CSV does not contain a 'processed content' column.")

# Combine all rows in the 'processed content' column into a single string
combined_content = df_categorized['Processed Content'].astype(str).tolist()

82


In [36]:
client = openai

categories = load_categories_from_file(categories_file)

# Process messages in batches
df_categorized, categories = process_messages_in_batches(
    client=client,
    df=df_categorized,
    categories_file=categories_file,
    batch_size=30,
    overlap=5
)

# Save updated DataFrame to a CSV file
df_categorized.to_csv(output_file, index=False)

Loading categories...
Categories loaded: {'Housing and Living Arrangements': ['Rising Rent and Housing Costs', 'Finding Affordable and Safe Accommodation', 'Shared Housing and Roommate Dynamics', 'Adjusting to Different Housing Standards', 'Living with Extended Family or Parents'], 'Employment and Economic Opportunities': ['Job Market Competition', 'Skill and Credential Recognition', 'Overqualification and Career Downgrades', 'Workplace Culture and Integration', 'Access to Professional Networking Opportunities', 'Balancing Work with Family Responsibilities'], 'Healthcare and Well-Being': ['Understanding Local Healthcare Systems', 'Access to Primary and Emergency Care', 'Mental Health Resources Availability', 'Financial Barriers to Healthcare', 'Navigating Health Insurance Systems', 'Communicating Effectively with Healthcare Providers'], 'Social and Cultural Adjustment': ['Building Community and Social Connections', 'Adapting to New Social Norms and Etiquette', 'Overcoming General Langu

KeyboardInterrupt: 

In [104]:
# import tiktoken

# # Choose the appropriate encoding based on the model
# # For example, for GPT-3.5-Turbo or GPT-4, use 'gpt-4' or 'gpt-3.5-turbo'
# # You can refer to OpenAI's documentation for the correct encoding name

# # Example for GPT-4
# encoding = tiktoken.encoding_for_model("gpt-4")

# # Get the list of tokens
# tokens = encoding.encode(combined_content)

# # Count the number of tokens
# num_tokens = len(tokens)

# print(f"Total number of tokens: {num_tokens}")

In [106]:
def build_summary_prompt(categories, messages):
    prompt = """
    
    The following is a collection of messages and their associated categories. Please analyze the content and provide a detailed generalization.
    
    Your task is to:
    1. Identify the main points and concerns expressed in the messages.
    2. Not all the messages are related to living problems; focus only on relevant ones.
    3. Provide a balanced overview of the categories and subcategories, ensuring no critical insights are missed, even if mentioned only once.
    4. Summarize the key topics or themes across the messages, focusing on their meaning and relevance rather than repetition.
    5. Group related issues into broader themes or perspectives to create a cohesive analysis.
    
    Here are the categories for living problems:
    
    """
    
    # Add categories and subcategories
    for category, subs in categories.items():
        prompt += f"{category}\n"
        for sub in subs:
            prompt += f"    • {sub}\n"
            
    prompt += "\nMessages:\n"
    
    # Add numbers to messages for better separation
    for idx, message in enumerate(messages, 1):
        prompt += f"{idx}. {message.strip()}\n"

    return prompt

In [107]:
# File paths
categories_file = "./categories.json"

# Load categories file
categories = load_categories_from_file(categories_file)

### testing
# Get the list of tokens

prompt = build_summary_prompt(categories, '')
tokens = encoding.encode(prompt)

# Count the number of tokens
num_tokens = len(tokens)

print(num_tokens)

prompt = build_summary_prompt(categories, combined_content)
tokens = encoding.encode(prompt)

# Count the number of tokens
num_tokens = len(tokens)

print(num_tokens)
### testing


914
2895


In [108]:
print(combined_content)

['Mn55245: 想請問有無香港律師介紹', 'Produnkcan: 英國利息將跌至2.75%, Goldeman Sachs預測 \nhttps |//hkmigrate.com/thread/3301729599331 \n英國台 - 香港移民論壇', 'homelala: 💼Jobs for Hongkongers💼\n\nAn eight week programme offering personalised 1-1 support for all of your employment needs!\n\nThis programme will support you to | \n- Find work and work experience opportunities\n- Improve your CV, applications, and interview skills\n- Find new skills and training programmes\n\nThis programme is available to anyone who is living in the UK under the HKBNO Visa. \n\nFind out more and access suport | https |//www.jobsforhongkongers.org.uk/', 'MMCDR: 有房出租', 'nan: 農曆年Sheffield 有粵劇呀', 'lee_cheongsan: 【用Octopus的用戶有福了！】\n\n自家製一Click即睇實時Argile電費。\n\n唔使Login，唔使左襟右襟，真正一Click即睇！係一Click即睇呀呀呀呀。\n\n仲可以睇埋隔離區價錢添！\n\nhttps |//play.google.com/store/apps/details?id=octopus.energy.pricechecker', 'nan: 有時間去睇呢個展覽', 'nan: 住薄扶林的朋友仔睇下似唔似\nhttps |//youtu.be/75KlRM69eG4?si=_E1YGzikyhPW17Ag', 'kevinlaw1984: on 9, 交成晚稅\n我立即問稅局，佢話我文件吾齊。\n咁我就同太太重申填好

In [109]:
print(prompt)


    
    The following is a collection of messages and their associated categories. Please analyze the content and provide a detailed generalization.
    
    Your task is to:
    1. Identify the main points and concerns expressed in the messages.
    2. Not all the messages are related to living problems; focus only on relevant ones.
    3. Provide a balanced overview of the categories and subcategories, ensuring no critical insights are missed, even if mentioned only once.
    4. Summarize the key topics or themes across the messages, focusing on their meaning and relevance rather than repetition.
    5. Group related issues into broader themes or perspectives to create a cohesive analysis.
    
    Here are the categories for living problems:
    
    Housing and Living Arrangements
    • Rising Rent and Housing Costs
    • Finding Affordable and Safe Accommodation
    • Shared Housing and Roommate Dynamics
    • Adjusting to Different Housing Standards
    • Living with Extended F

In [110]:
prompt = build_prompt(categories, combined_content)

client = openai
updated_category_response = process_batch(client, prompt)

print(updated_category_response)

1. Legal and Bureaucratic Challenges
    • Accessing Legal Aid or Advocacy Services
2. Uncategorized
3. Employment and Economic Opportunities
    • Job Market Competition
4. Uncategorized
5. Social and Cultural Adjustment
    • Celebrating and Preserving Cultural Traditions
6. Uncategorized
7. Uncategorized
8. Uncategorized
9. Legal and Bureaucratic Challenges
    • Understanding Tax Obligations
10. Uncategorized
11. Uncategorized
12. Uncategorized
13. Employment and Economic Opportunities
    • Skill and Credential Recognition
14. Transportation and Mobility
    • Learning to Drive in a New Environment
15. Transportation and Mobility
    • Learning to Drive in a New Environment
16. Transportation and Mobility
    • Learning to Drive in a New Environment
17. Transportation and Mobility
    • Learning to Drive in a New Environment
18. Transportation and Mobility
    • Learning to Drive in a New Environment
19. Transportation and Mobility
    • Learning to Drive in a New Environment
20. 

In [111]:
prompt = build_summary_prompt(categories, combined_content)

client = openai
response = process_batch(client, prompt)

print(response)

Analyzing the messages, they fall into the following categories:

Housing and Living Arrangements: 
Message 4 is about housing availability. This speaks to the overall housing situation and noting the cost or standards of the housing isn't specified.

Employment and Economic Opportunities: 
Messages 3, 12, 13, and 22 revolve around employment opportunities and the job market competition. This ranges from providing employment support for visa holders, job specific discussions (IT and military roles), and discussions on an individual's salary. 

Legal and Bureaucratic Challenges:
Message 9 is an example of this category, dealing with tax obligations and issues with filing documentation. 

Education and Personal Development: 
Messages 14 to 21, and 25 to 29 focus on personal development. They discuss the process of preparing for and passing a driving theory test, evidencing challenges in adjusting to new traffic rules and regulations and learning to drive.

Social and Cultural Adjustment:

In [114]:
def build_detial_summary_prompt(message):
    prompt = """
    
    Analyze the following messages to identify the general themes and underlying needs of the individuals. Summarize your findings with a general conclusion that captures the overall context and key insights
    
    """
    
    prompt += message

    return prompt

In [115]:
summary_prompt = build_detial_summary_prompt(response)
detial_summary_response = process_batch(client, summary_prompt)

print(detial_summary_response)

You provided an extensive analysis of the individuals' messages falling within various categories, which mainly include Housing and Living Arrangements, Employment and Economic Opportunities, Legal and Bureaucratic Challenges, Education and Personal Development, Social and Cultural Adjustment, Personal Care and Appearance, and Political Environment and Governance.

Upon examining these messages, the most common themes are Employment and Economic Opportunities, Education and Personal Development, and Social and Cultural Adjustment. The messages suggest that people are actively seeking employment and opportunities for economic advancement. They are also keen on personal development, as seen in their endeavors to prepare for and pass professional examinations.

Furthermore, the individuals appear to be making significant efforts to adjust socially and culturally to their environment. This is illustrated by their attempts to form social groups and potentially negotiate the challenges of ad