# new approaches

In [1]:
import os
import json
import pandas as pd
import openai

In [2]:
# Initialize the OpenAI client


## 1. Initialize or Load Categories

In [3]:
def load_categories_from_file(file_path):
    """
    Load the category -> subcategories mapping from a JSON file.

    If the file does not exist, or exists but does not contain valid JSON,
    it is (re)created through ``initialize_empty_file`` and the mapping
    returned by that helper is returned instead.

    Args:
        file_path: Path of the JSON categories file.

    Returns:
        The parsed JSON content, or the freshly initialized mapping when the
        file was missing or invalid.
    """
    if not os.path.exists(file_path):
        return initialize_empty_file(file_path)

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return json.load(file)
    except json.JSONDecodeError:
        # The ``with`` block has already closed the read handle here, so the
        # rewrite below never overlaps with an open handle on the same file.
        print(f"Invalid JSON detected in {file_path}. Reinitializing the file.")
        return initialize_empty_file(file_path)


def initialize_empty_file(file_path):
    """
    Create the categories JSON file seeded with the built-in default taxonomy.

    Despite the name, the file is not left empty: the default categories
    below are written through ``save_categories_to_file`` and then returned.

    Args:
        file_path: Path where the categories file is written.

    Returns:
        A newly built dict mapping each default category to its list of
        subcategories.
    """
    default_taxonomy = {
        "Housing and Living Arrangements": [
            "Rising Rent and Housing Costs",
            "Finding Affordable and Safe Accommodation",
            "Shared Housing and Roommate Dynamics",
            "Adjusting to Different Housing Standards",
            "Living with Extended Family or Parents",
        ],
        "Employment and Economic Opportunities": [
            "Job Market Competition",
            "Skill and Credential Recognition",
            "Overqualification and Career Downgrades",
            "Workplace Culture and Integration",
            "Access to Professional Networking Opportunities",
            # NOTE(review): this entry is also listed under
            # "Social and Cultural Adjustment" -- confirm it is intended here.
            "Overcoming Language Barriers in Everyday Life",
        ],
        "Healthcare and Well-Being": [
            "Understanding Local Healthcare Systems",
            "Access to Primary and Emergency Care",
            "Mental Health Resources Availability",
            "Financial Barriers to Healthcare",
            "Language and Cultural Barriers in Health Services",
        ],
        "Social and Cultural Adjustment": [
            "Building Community and Social Connections",
            "Adapting to New Social Norms and Etiquette",
            "Overcoming Language Barriers in Everyday Life",
            "Facing and Addressing Discrimination",
            "Parenting Challenges in a New Cultural Environment",
        ],
        "Legal and Bureaucratic Challenges": [
            "Navigating Immigration and Residency Requirements",
            "Understanding Tax Obligations",
            "Securing Visas and Work Permits",
            "Accessing Legal Aid or Advocacy Services",
            "Filing Necessary Documentation for Families",
        ],
        "Education and Personal Development": [
            "Accessing Education for Children and Adults",
            "Recognition of Previous Educational Credentials",
            "Enrolling in Language and Integration Programs",
            "Financial Barriers to Education and Training",
            "Balancing Education with Work or Family Responsibilities",
        ],
        "Transportation and Mobility": [
            "Navigating Public Transportation Systems",
            "Obtaining Driver\u2019s Licenses or Vehicle Registration",
            "Cost and Accessibility of Transportation",
            "Challenges in Rural or Suburban Mobility",
            "Adjusting to New Traffic Rules and Regulations",
        ],
        "Financial and Budgeting Challenges": [
            "Setting Up Bank Accounts and Building Credit",
            "Understanding Local Taxes and Financial Systems",
            "Managing Cost of Living in High-Expense Areas",
            "Sending Money Abroad to Family",
            "Saving for Long-Term Goals",
        ],
        "Family Dynamics and Support": [
            "Adjusting to Changing Family Roles",
            "Reuniting with Family Across Borders",
            "Supporting Children\u2019s Educational and Social Needs",
            "Caring for Aging Parents Remotely",
            "Managing Relationships in Cross-Cultural Marriages",
        ],
        "Identity and Emotional Well-Being": [
            "Coping with Culture Shock and Loneliness",
            "Balancing Old and New Cultural Identities",
            "Addressing Feelings of Isolation or Marginalization",
            "Finding Support Networks for Emotional Health",
            "Building a Sense of Belonging in the New Country",
        ],
        "Uncategorized": [
            "Uncategorized",
        ],
    }

    # Persist first, then report, then hand the mapping back to the caller.
    save_categories_to_file(default_taxonomy, file_path)
    print(f"Created a new categories file at: {file_path}")
    return default_taxonomy


def save_categories_to_file(categories, file_path):
    """
    Save the categories mapping to a JSON file, skipping the write when the
    file already holds identical content.

    A missing or unreadable file (invalid JSON or not valid UTF-8) never
    counts as "identical", so it is always (re)written -- even when
    ``categories`` is an empty dictionary.

    Args:
        categories: JSON-serializable mapping of category -> subcategories.
        file_path: Destination path of the JSON file.
    """
    # Unique marker for "no usable content on disk". It cannot compare equal
    # to any categories value, which forces the write below.
    no_content = object()
    existing_categories = no_content

    if os.path.exists(file_path):
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                existing_categories = json.load(file)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Corrupt file: treat it as having no content so it gets replaced.
            existing_categories = no_content

    if existing_categories is no_content or categories != existing_categories:
        with open(file_path, "w", encoding="utf-8") as file:
            json.dump(categories, file, ensure_ascii=False, indent=4)
        print(f"Updated categories saved to {file_path}")
    else:
        print("No changes detected in categories. Skipping save.")

def reanalyze_existing_categories(client, categories_file):
    """
    Perform a one-time reanalysis of existing categories at the start of the program.

    The current categories are loaded, sent to the model with consolidation
    instructions, and the parsed result is saved back to ``categories_file``.
    If the model call fails or the parsed result is empty, the existing
    categories are kept and the file is left untouched.

    Args:
        client: OpenAI-style client forwarded to ``process_batch``.
        categories_file: Path of the JSON categories file.

    Returns:
        The consolidated categories, or the existing categories when there
        was nothing to reanalyze or the reanalysis did not succeed.
    """
    categories = load_categories_from_file(categories_file)

    # If no categories exist, skip reanalysis
    if not categories:
        return categories

    # Build a prompt listing every category with its subcategories
    prompt_parts = [
        "You are consolidating categories and subcategories for consistency.\n\n",
        "Here are the current categories and subcategories:\n",
    ]
    for category, subcategories in categories.items():
        prompt_parts.append(f"{category}:\n")
        prompt_parts.extend(f"    • {subcategory}\n" for subcategory in subcategories)

    prompt_parts.append("""
    Instructions:
    - If subcategories belong to the same category, merge them under one category.
    - If categories are duplicates, consolidate them into one category.
    - Provide the final consistent structure.
    """)
    prompt = "".join(prompt_parts)

    # Call GPT to reanalyze the categories
    response = process_batch(client, prompt)
    print(f"Reanalysis response:\n{response}")

    # process_batch signals a failed API call with the literal string "Error".
    # Parsing and saving that would replace the categories file with garbage.
    if response == "Error":
        print("Reanalysis failed. Keeping existing categories.")
        return categories

    # Parse the response to get consolidated categories.
    # NOTE(review): parse_consolidation_response is defined outside this
    # section; it is assumed to return a category -> subcategories mapping.
    consolidated_categories = parse_consolidation_response(response)

    # An empty parse result would wipe the categories file, so keep what we have.
    if not consolidated_categories:
        print("Reanalysis produced no categories. Keeping existing categories.")
        return categories

    # Save the consolidated categories back to the file
    save_categories_to_file(consolidated_categories, categories_file)
    return consolidated_categories

## 2. Build the Prompt

In [4]:
def build_prompt(categories, messages):
    """
    Assemble the categorization prompt for one batch of messages.

    The prompt consists of a fixed introduction, the current categories with
    their bulleted subcategories, the batch messages numbered from 1, and a
    fixed block of instructions.

    Args:
        categories: Mapping of category name -> iterable of subcategory names.
        messages: Iterable of message texts to categorize.

    Returns:
        The complete prompt as a single string.
    """
    intro = (
        "You are categorizing chat messages into predefined categories and subcategories about living problems.\n\n"
        "Here are the current categories and subcategories:\n"
    )

    # One line per category, followed by one bulleted line per subcategory
    taxonomy_lines = []
    for name, subs in categories.items():
        taxonomy_lines.append(f"{name}\n")
        taxonomy_lines.extend(f"    • {sub}\n" for sub in subs)

    # Messages are numbered starting at 1
    numbered_messages = [
        f"{position}. {text}\n" for position, text in enumerate(messages, start=1)
    ]

    instructions = """
    Important Instructions:
    1. Categorize each message individually.
    2. If the message introduces a new subcategory, add it under the appropriate main category.
    3. If the message introduces a new main category, specify the main category and its first subcategory.
    4. If the message is unrelated to living problems, categorize it as "Uncategorized."
    """

    return "".join(
        [
            intro,
            *taxonomy_lines,
            "\nCurrent Batch of Messages:\n",
            *numbered_messages,
            instructions,
        ]
    )

## 3. Process a Batch

In [5]:
def process_batch(client, prompt, model="gpt-4"):
    """
    Send one prompt to a chat-completions model and return its reply.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        prompt: Text sent as the user message.
        model: Name of the model to query. Defaults to "gpt-4".

    Returns:
        The content of the first choice with surrounding whitespace stripped,
        or the literal string "Error" if anything goes wrong. Callers compare
        against that sentinel, so it must stay unchanged.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Deliberately broad: any failure (API error, malformed or empty
        # reply) is reported and mapped to the "Error" sentinel.
        print(f"Error processing batch: {e}")
        return "Error"

## 4. Update Categories

In [6]:
def update_categories_from_response(response, categories):
    """
    Merge category/subcategory pairs found in a GPT response into ``categories``.

    Only lines containing both "Category: " and "Subcategory: " are used.
    Pairs where either part is "Uncategorized" are ignored. ``categories`` is
    modified in place and also returned.

    Args:
        response: Raw response text, one candidate pair per line.
        categories: Mapping of category name -> list of subcategory names.

    Returns:
        The same ``categories`` object, including any newly added entries.
    """
    category_marker = "Category: "
    subcategory_marker = "Subcategory: "
    ignored_label = "Uncategorized"

    for raw_line in response.split("\n"):
        # Lines missing either marker carry no pair
        if category_marker not in raw_line or subcategory_marker not in raw_line:
            continue

        # Category: text after the first category marker, up to the subcategory marker
        after_category = raw_line.split(category_marker)[1]
        category_name = after_category.split(subcategory_marker)[0].strip()
        subcategory_name = raw_line.split(subcategory_marker)[1].strip()

        if ignored_label in (category_name, subcategory_name):
            continue

        # Create the category on first sight, then add the subcategory once
        known_subcategories = categories.setdefault(category_name, [])
        if subcategory_name not in known_subcategories:
            known_subcategories.append(subcategory_name)

    return categories

## 5. Batch Processing

In [7]:
def process_messages_in_batches(client, df, categories_file, batch_size=50, overlap=20):
    """
    Categorize the "Chat Content" column of ``df`` in overlapping batches.

    For each batch a prompt is built from the current categories and sent to
    the model. Response line ``i`` is matched to batch message ``i``; lines of
    the form ``category -> subcategory`` assign that pair to the row. When a
    line introduces a category or subcategory that is not known yet, it is
    added to the categories, the categories file is saved, and the rest of
    the current batch is abandoned (the triggering row and the rows after it
    are not assigned by this batch). Rows that never receive a pair stay
    ("Uncategorized", "Uncategorized").

    Args:
        client: OpenAI-style client forwarded to ``process_batch``.
        df: DataFrame with a "Chat Content" column; "Category" and
            "Subcategory" columns are written to it.
        categories_file: Path of the JSON categories file.
        batch_size: Number of rows sent per request.
        overlap: Number of rows shared between consecutive batches; must be
            smaller than ``batch_size``.

    Returns:
        Tuple ``(df, categories)`` with the updated DataFrame and mapping.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``batch_size``.
    """
    # A non-positive step would either make range() fail (step 0) or silently
    # process nothing at all (negative step), so reject it up front.
    step = batch_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than batch_size.")

    categories = load_categories_from_file(categories_file)
    results = [("Uncategorized", "Uncategorized")] * len(df)

    for start in range(0, len(df), step):
        end = min(start + batch_size, len(df))
        batch_messages = df["Chat Content"].iloc[start:end].tolist()
        # Non-string cells (e.g. missing values) are sent as empty messages
        batch_messages = [str(msg) if isinstance(msg, str) else "" for msg in batch_messages]

        # Build prompt for the current batch
        prompt = build_prompt(categories, batch_messages)
        response = process_batch(client, prompt)

        # "Error" is the failure sentinel returned by process_batch
        if response != "Error":
            # Split once per batch instead of once per row
            response_lines = response.split("\n")

            for i, _message in enumerate(batch_messages):
                try:
                    # Response line i is taken as the answer for message i
                    line = response_lines[i]
                    if "->" in line:
                        # Split on the first arrow only: a line with several
                        # arrows would otherwise fail to unpack into two names.
                        category, subcategory = map(str.strip, line.split("->", 1))
                        category = normalize_category(category)
                        subcategory = normalize_category(subcategory)

                        # Check if it's a new category or subcategory
                        if category not in categories or subcategory not in categories.get(category, []):
                            # Look back 20 rows to ensure consistency
                            lookback_start = max(start - 20, 0)
                            lookback_end = start
                            lookback_messages = df["Chat Content"].iloc[lookback_start:lookback_end].tolist()
                            lookback_messages = [
                                str(msg) if isinstance(msg, str) else "" for msg in lookback_messages
                            ]

                            # Skip previous row updates; just log the new category
                            if lookback_messages:
                                print(f"Checking context for new category: {category} -> {subcategory}")
                                print(f"Lookback range: {lookback_start}-{lookback_end}")
                            else:
                                print("No valid messages in the lookback range.")

                            # Add new category or subcategory
                            if category not in categories:
                                categories[category] = []
                            if subcategory not in categories[category]:
                                categories[category].append(subcategory)

                            # Save updated categories and stop the batch
                            save_categories_to_file(categories, categories_file)
                            print(f"New category added: {category} -> {subcategory}")
                            print("Stopping current batch to ensure consistency.")
                            break

                        # Assign current row's category and subcategory
                        results[start + i] = (category, subcategory)
                except IndexError:
                    # Fewer response lines than messages. Every row already
                    # defaults to "Uncategorized", so leave it as it is rather
                    # than wiping a result an earlier overlapping batch assigned.
                    continue

        print(f"Processed messages {start} to {end}")

    # Apply results to the DataFrame
    df["Category"] = [result[0] for result in results]
    df["Subcategory"] = [result[1] for result in results]

    return df, categories

## 6. Summarize Groups

In [12]:
# def summarize_group(client, messages, group_name):
#     """
#     Summarize relevant concerns for a category.
#     """
#     if not messages:
#         return f"No concerns found in {group_name}."

#     messages_text = "\n".join(messages)
#     prompt = f"""
#     Summarize the concerns related to {group_name} based on the following chat messages from individuals in the UK.

#     Include only the struggles and challenges people face in this category. Exclude any irrelevant or unrelated content.

#     Messages:
#     {messages_text}

#     Provide a concise summary of the struggles and challenges faced.
#     """
#     try:
#         response = client.chat.completions.create(
#             model="gpt-4",
#             messages=[
#                 {"role": "system", "content": "You are a helpful assistant."},
#                 {"role": "user", "content": prompt}
#             ]
#         )
#         return response.choices[0].message.content.strip()
#     except Exception as e:
#         print(f"Error summarizing group {group_name}: {e}")
#         return "Error in summarization."

## 7. Filtering Data

In [None]:
def preprocess_messages_with_usernames(df):
    """
    Add a 'Processed Content' column built from the 'User' and 'Chat Content' columns.

    Each row's value is produced by ``preprocess_message(user, chat_content)``.
    The DataFrame is modified in place and also returned.

    Args:
        df: DataFrame containing 'User' and 'Chat Content' columns.

    Returns:
        The same DataFrame with the 'Processed Content' column added.

    Raises:
        ValueError: If either required column is missing.
    """
    # Ensure the required columns exist
    if "User" not in df.columns or "Chat Content" not in df.columns:
        raise ValueError("DataFrame must contain 'User' and 'Chat Content' columns.")

    # Pair the two columns directly instead of a row-wise apply(axis=1):
    # on an empty frame apply returns a DataFrame rather than a column of
    # values, which cannot be assigned to a single column.
    df["Processed Content"] = [
        preprocess_message(user, message)
        for user, message in zip(df["User"], df["Chat Content"])
    ]
    return df


def preprocess_message(user, message):
    """
    Format a single message as "<user>: <message>" with sanitized text.

    Non-string messages become an empty string. Characters that cannot be
    encoded as UTF-8 (lone surrogates) are replaced with "?". Colons inside
    the message are replaced with " |" so the only colon left is the one
    separating the username from the text.

    Args:
        user: Username; a falsy value, ``None`` or NaN means "no username".
        message: Message text; anything that is not a ``str`` is treated as empty.

    Returns:
        "<user>: <message>" when a username is present, otherwise just the
        sanitized message.
    """
    if not isinstance(message, str):
        message = ""

    # A plain encode/decode round trip is a no-op that raises on lone
    # surrogates; errors="replace" makes it an actual sanitizing step.
    message = message.encode("utf-8", errors="replace").decode("utf-8")

    # Replace existing colons to avoid ambiguity with the username separator
    message = message.replace(":", " |")

    # NaN is truthy, so a missing username read from a CSV would otherwise
    # produce a "nan: ..." prefix. NaN is the only float not equal to itself.
    user_is_nan = isinstance(user, float) and user != user
    if user is None or user_is_nan or not user:
        return message

    return f"{user}: {message}"

## 8. Run the Script

In [11]:
def main():
    """
    Run the categorization pipeline end to end.

    Loads the categories file and the messages CSV, preprocesses the
    messages, keeps rows 200-299, reanalyzes the existing categories,
    categorizes the rows in batches, and writes the result to a CSV file.
    """
    # File paths
    categories_file = "./categories_prompt.json"
    messages_file = "./filtered_messages_nov.csv"
    output_file = "./categorized_messages.csv"

    # Load categories file
    categories = load_categories_from_file(categories_file)
    print("Loaded categories:", categories)

    # Load messages
    df = pd.read_csv(messages_file)

    # Preprocess messages: Add usernames and normalize UTF-8
    df = preprocess_messages_with_usernames(df)

    # Filter rows for processing. Take an explicit copy: columns are assigned
    # on this frame below, and assigning on a slice of another frame triggers
    # pandas' SettingWithCopyWarning.
    df = df.iloc[200:300].copy()

    # Ensure DataFrame has Category and Subcategory columns
    if "Category" not in df.columns:
        df["Category"] = ""
    if "Subcategory" not in df.columns:
        df["Subcategory"] = ""

    # Perform initial reanalysis of existing categories
    client = openai  # Replace with your OpenAI client
    categories = reanalyze_existing_categories(client, categories_file)

    # Process messages in batches
    df, categories = process_messages_in_batches(
        client=client,
        df=df,
        categories_file=categories_file,
        batch_size=50,
        overlap=5
    )

    # Save updated DataFrame to a CSV file
    df.to_csv(output_file, index=False)
    print(f"\nUpdated CSV saved to {output_file}")

Estimated token count for batch 0-50: 914
Error processing batch: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
GPT response for batch 0-50:
Error

Processed messages 0 to 50
Estimated token count for batch 30-80: 914
Error processing batch: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
GPT response for batch 30-80:
Error

Processed messages 30 to 80
Estimated token count for batch 60-110: 776
Error processing batch: Error code: 429 - {'error': {'message': 'You exce

In [13]:
import json

# Path of the categories file to inspect
file_path = "./categories_prompt.json"

# The categories file is written as UTF-8 with non-ASCII characters kept
# verbatim, so read it with the same encoding rather than the platform default.
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Print the JSON content
print(json.dumps(data, indent=4))  # Pretty print the JSON with indentation

{
    "Housing and Living Arrangements": [
        "Rising Rent and Housing Costs",
        "Finding Affordable and Safe Accommodation",
        "Shared Housing and Roommate Dynamics",
        "Adjusting to Different Housing Standards",
        "Living with Extended Family or Parents"
    ],
    "Employment and Economic Opportunities": [
        "Job Market Competition",
        "Skill and Credential Recognition",
        "Overqualification and Career Downgrades",
        "Workplace Culture and Integration",
        "Access to Professional Networking Opportunities",
        "Overcoming Language Barriers in Everyday Life"
    ],
    "Healthcare and Well-Being": [
        "Understanding Local Healthcare Systems",
        "Access to Primary and Emergency Care",
        "Mental Health Resources Availability",
        "Financial Barriers to Healthcare",
        "Language and Cultural Barriers in Health Services"
    ],
    "Social and Cultural Adjustment": [
        "Building Community and 