<a href="https://colab.research.google.com/github/Laney048/Startup/blob/main/Startup_Proposal_Evaluation_Script_(Refactored_Prompts)3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

"""Startup Proposal Evaluation Script (Enhanced with Temperature and Evaluation Method Controls)

This script evaluates startup proposals using three main criteria: Feasibility, Potential for Impact, and Innovative Approach.
All evaluations are performed by the OpenAI API.

Key Features:
- Evaluates proposals based on user-defined criteria and scoring scale.
- Supports different prompt forms ("default", "no_harsh", "no_rubric", "no_examples", "without_everything") for varied instructions and tones.
- In 'simultaneous' mode, every supported form uses a single API call to get all three criteria evaluations in one JSON object; in 'sequential' mode, each criterion is evaluated in its own API call.
- Includes dynamic few-shot examples (optional) in the prompt.
- Calculates an overall weighted score based on user-defined percentages.
- Generates heatmaps to visualize proposal scores.
- Processes one selected prompt form at a time, generating specific outputs for that form.

ENHANCED FEATURES:
1. Temperature Control: Configurable temperature values (0.1, 0.3, 0.5, 0.7, 1.0) for controlled experiments
2. Evaluation Method Control:
   - 'simultaneous': Evaluate all 3 criteria in one API call (original method, potential halo effect)
   - 'sequential': Evaluate each criterion separately in 3 API calls (reduced halo effect)

Prompt Content Philosophy:
- Each prompt form has its own independent system message, instruction, challenge description, and criteria definitions.
- 'SCORING_SCALE_TEXT' and 'JSON_OUTPUT_FORMAT_PROMPT_SNIPPET' remain universal where applicable.
- Example inclusion is finely controlled by `include_static_examples_rationales` and `use_dynamic_few_shot_examples` flags.
"""

# --- 1. Imports and Setup ---
# Import necessary libraries
import pandas as pd
import re
import os
import json
import numpy as np # Import numpy for NaN
# google.colab.files is used for downloading files in Google Colab environment
from google.colab import files
# tqdm is used for displaying progress bars
from tqdm import tqdm
# OpenAI library for interacting with GPT models
from openai import OpenAI
from openai import RateLimitError, APIConnectionError, APIStatusError # Import specific API error types
# Matplotlib and Seaborn for plotting visualizations
import matplotlib.pyplot as plt
import seaborn as sns
# Requests is a common library for making HTTP requests (general purpose, not specifically used for API calls here as openai client handles it)
import requests
# concurrent.futures for parallel execution of API calls
from concurrent.futures import ThreadPoolExecutor, as_completed
tqdm.pandas() # Enable tqdm progress bar for pandas operations

#User Configuration settings

In [None]:
'''
Modify the following two variables to run different experiments:
1. EXPERIMENT_TEMPERATURE: Controlling the randomness of AI responses
2. EXPERIMENT_EVALUATION_METHOD: Control assessment method
'''
# Temperature values accepted by validate_experiment_config(); anything else is rejected.
TEMPERATURE_VALUES = [0.1, 0.3, 0.5, 0.7, 1.0]

# Supported evaluation methods, keyed by method name. The value is a human-readable
# description that get_experiment_config() reports as 'method_description'.
EVALUATION_METHODS = {
    'simultaneous': 'Evaluate all 3 criteria in one API call (faster, potential halo effect)',
    'sequential': 'Evaluate each criterion separately in 3 calls (slower, reduces halo effect)'
}

# ===== USER CONFIGURATION SETTINGS =====
# Modify these two values to run different experiments
# The integer 1 compares equal to 1.0, so it passes the membership check against TEMPERATURE_VALUES.
EXPERIMENT_TEMPERATURE = 1 # Choose from TEMPERATURE_VALUES: 0.1, 0.3, 0.5, 0.7, or 1.0
EXPERIMENT_EVALUATION_METHOD = 'sequential'  # Choose: 'simultaneous' or 'sequential'


def validate_experiment_config():
    """Check the two experiment settings against their allowed values.

    Returns:
        True when both settings are acceptable.

    Raises:
        ValueError: if EXPERIMENT_TEMPERATURE is not in TEMPERATURE_VALUES, or if
            EXPERIMENT_EVALUATION_METHOD is not a key of EVALUATION_METHODS.
    """
    temperature_is_allowed = EXPERIMENT_TEMPERATURE in TEMPERATURE_VALUES
    if not temperature_is_allowed:
        raise ValueError(f"Temperature must be one of {TEMPERATURE_VALUES}. Got: {EXPERIMENT_TEMPERATURE}")

    method_is_allowed = EXPERIMENT_EVALUATION_METHOD in EVALUATION_METHODS
    if not method_is_allowed:
        raise ValueError(f"Evaluation method must be one of {list(EVALUATION_METHODS.keys())}. Got: {EXPERIMENT_EVALUATION_METHOD}")

    return True

def get_experiment_config():
    """Snapshot the active experiment settings.

    Returns:
        dict with the keys 'temperature', 'evaluation_method' and 'method_description'
        (in that order). The description falls back to 'Unknown' when the configured
        method is not a key of EVALUATION_METHODS.
    """
    method_name = EXPERIMENT_EVALUATION_METHOD
    snapshot = {}
    snapshot['temperature'] = EXPERIMENT_TEMPERATURE
    snapshot['evaluation_method'] = method_name
    snapshot['method_description'] = EVALUATION_METHODS.get(method_name, 'Unknown')
    return snapshot

def print_experiment_config():
    """Write the current experiment settings to stdout as aligned label/value lines."""
    # The banner is printed before the settings are looked up.
    print("EXPERIMENT CONFIGURATION")

    settings = get_experiment_config()
    report_lines = [
        f"Temperature:        {settings['temperature']}",
        f"Evaluation Method:  {settings['evaluation_method']}",
        f"Description:        {settings['method_description']}",
        "",  # trailing blank line separates the report from later output
    ]
    for line in report_lines:
        print(line)

# Validate configuration on module load.
# An invalid setting is reported with guidance and then re-raised, so execution stops
# here instead of continuing with an unsupported temperature or evaluation method.
try:
    validate_experiment_config()
    print("Experiment configuration validated successfully")
    print_experiment_config()
except ValueError as e:
    print(f"Configuration Error: {e}")
    print("Please fix the EXPERIMENT_TEMPERATURE and EXPERIMENT_EVALUATION_METHOD variables")
    raise


#Universal Prompt Components

In [None]:
# These components are extracted into independent variables to avoid duplication in PROMPT_CONTENT_CONFIG

# Universal Scoring Scale - Applies to all prompt forms that use a scale.
# Defines the five anchor scores (1-5) plus general strictness guidance.
# NOTE(review): the text is internally inconsistent about granularity. The header allows
# "scores between them" and one bullet mentions "half-steps" (with an example that is
# actually the whole number 4), while another bullet says "Use these exact values" and
# the JSON output snippets in this file request an integer between 1-5. Confirm the
# intended behaviour before changing the wording, since this string is sent to the model.
SCORING_SCALE_TEXT = """
Passing Probability Scale (use these exact values or scores between them):
- 1: Strongly Disagree - The submission clearly fails on this dimension.
- 2: Somewhat Disagree - Significant weaknesses outweigh strengths.
- 3: Neither Agree nor Disagree - Mixed evidence; meets only minimum expectations.
- 4: Somewhat Agree - Strong overall, with minor limitations.
- 5: Strongly Agree - Excellent across this dimension with negligible gaps.


Important:
• Use these exact values.
• Most good—but not exceptional—answers will earn 3 (“Neither Agree nor Disagree”).
• Reserve 5 for *clear* excellence with no substantial weaknesses.
• Use half-steps sparingly (e.g., 4 = “Somewhat Agree”) when the solution
  almost—but not fully—reaches the next level.
• Give low scores to solutions that could be significantly improved
• If information needed to assess a point is absent, assume it is weak and
  score downward.
• A solution that fails to provide detailed and comprehensive answers to all required elements and demonstrates aspirational rather than evidence-based claims should receive low scores.
• When unsure, err on the side of the lower score.
• If any substantial part of the application is not in English, it should be immediately given low scores.

"""

**JSON Output Format simultaneous or sequential**

In [None]:
# Universal JSON Output Format Snippet - Used for simultaneous evaluation (all criteria at once).
# Requests one JSON object with the top-level keys "potential_of_impact", "feasibility" and
# "innovative_approach", each holding "detailed_reasoning", "summary_rationale" and
# "passing_probability".
# NOTE(review): the braces are doubled ({{ }}), presumably because this text is later passed
# through str.format() - confirm at the call site before editing them.
# The "innovative_approach" score description differs from the other two: it additionally
# asks for an average of any relevant sub-scores.
JSON_OUTPUT_FORMAT_PROMPT_SNIPPET = """
Your response MUST be in a single JSON object. Do not include any text, explanation, or markdown before or after the JSON. The JSON should have the following top-level keys: "potential_of_impact", "feasibility", and "innovative_approach".

Here is the expected JSON structure with definitions for each criterion:
{{
  "potential_of_impact": {{
    "detailed_reasoning": "Your highly detailed step-by-step reasoning and assessment for Potential for Impact. Evaluate thoroughly.",
    "summary_rationale": "Summary reasoning in less than 200 words. Be helpful, clear, and highly informative. Provide specific supported evidence from the solution that an external reviewer could check to verify your assessment. Describe both sides if an equal case can be made for both pass or fail.",
    "passing_probability": "Integer between 1-5 using the passing probability scale"
  }},
  "feasibility": {{
    "detailed_reasoning": "Your highly detailed step-by-step reasoning and assessment for Feasibility. Evaluate thoroughly.",
    "summary_rationale": "Summary reasoning in less than 200 words. Be helpful, clear, and extremely informative. Provide specific supported evidence from the solution that an external reviewer could check to verify your assessment. Describe both sides if an equal case can be made for both pass or fail.",
    "passing_probability": "Integer between 1-5 using the passing probability scale"
  }},
  "innovative_approach": {{
    "detailed_reasoning": "Your highly detailed step-by-step reasoning and assessment for Innovative Approach. Evaluate thoroughly.",
    "summary_rationale": "Summary reasoning in less than 200 words. Be helpful, clear, and highly informative. Provide specific supported evidence from the solution that an external reviewer could check to verify your assessment. Describe both sides if an equal case can be made for both pass or fail.",
    "passing_probability": "Integer between 1-5 representing the overall score for Innovative Approach, averaging any relevant sub-scores if applicable."
  }}
}}
"""

# JSON Output Format for Sequential Evaluation (Single Criterion at a time).
# Requests a flat JSON object ("detailed_reasoning", "summary_rationale",
# "passing_probability") for exactly one criterion, and explicitly forbids scoring the
# other two criteria even though they are shown for context.
# NOTE(review): the braces are doubled ({{ }}), presumably because this text is later passed
# through str.format() - confirm at the call site before editing them.
JSON_OUTPUT_FORMAT_SEQUENTIAL_SINGLE = """
Your response MUST be in a single JSON object for the ONE criterion being evaluated.

CRITICAL INSTRUCTIONS:
- You have been shown all three criteria for context
- You must evaluate and score ONLY the ONE criterion specified above
- Do NOT provide scores, reasoning, or rationale for the other two criteria
- Do NOT include the other criteria names in your JSON response
- Your JSON should contain ONLY the evaluation for the requested criterion

Do not include any text, explanation, or markdown before or after the JSON.

Here is the expected JSON structure:
{{
  "detailed_reasoning": "Your highly detailed step-by-step reasoning and assessment for THIS SPECIFIC criterion only. Evaluate thoroughly but focus only on the requested criterion.",
  "summary_rationale": "Summary reasoning in less than 200 words for this criterion only. Be helpful, clear, and highly informative. Provide specific supported evidence from the solution.",
  "passing_probability": "Integer between 1-5 using the passing probability scale for this criterion only"
}}

Remember: Evaluate ONLY the one criterion specified. Ignore the others.
"""

**Extract Common Prompt Components into Independent Variables**

In [None]:

# These are text blocks that are shared or have only minor differences between different prompt forms.

# Core part of the base system message: evaluator role plus the list of inputs the model
# is told it will receive. Used on its own or with HARSH_SYSTEM_MESSAGE_PART appended.
# NOTE(review): the input list promises one high-scoring and one low-scoring example, but
# the "no_examples" and "without_everything" forms in PROMPT_CONTENT_CONFIG reuse this
# message while setting use_dynamic_few_shot_examples to False - confirm this is intended.
BASE_SYSTEM_MESSAGE_CORE = """
You are a finalist selection evaluator for MIT Solve 2025 Global Health Challenge.
Your role is to assess each solution against the three official judging criteria described below.
Assess each aspect according to the criterion specified.

You will receive:
- the instructions,
- the challenge description,
- the criterion to evaluate,
- one solution example that gets high scores,
- one solution example that gets low scores,
- the solution that you are going to evaluate,
- the required format of your answer output.

Your primary responsibility is to identify the best solutions for the finalists.
"""

# Harsh part of the system message: tells the model that most solutions should score low
# and demands strict, independent assessment of each criterion. Appended to the core
# message by the prompt forms that keep the strict tone.
HARSH_SYSTEM_MESSAGE_PART = """
Many solutions (>50%) should get low scores. Your default assumption should be that a solution will not get high scores unless it clearly demonstrates excellence.

CRITICAL INSTRUCTION: You must evaluate each criterion INDEPENDENTLY, STRICTLY, CRITICALLY,FAIRLY, and be DEMANDING in your assessments.
"""

# Combined task instruction and "important evaluation instructions" block.
# Shared unchanged by every prompt form in PROMPT_CONTENT_CONFIG. It contains its own
# strictness guidance ("err on the side of the lower score") independent of
# HARSH_SYSTEM_MESSAGE_PART.
BASE_INSTRUCTION_CORE = """
Your task:
Overall, you are evaluating solutions to a highly competitive global challenge. Most solutions will have significant weaknesses. Only truly exceptional solutions that thoroughly meet all criteria should be selected. When unsure, err on the side of the lower score.

1. Carefully read the criterion and understand what it's asking for.
2. Analyze the proposed solution in light of this criterion.
3. Determine whether the proposed solution meets the criterion based on its own merits.
4. Provide your evaluation, explaining your reasoning step-by-step clearly. You will identify strengths and critically list all the ways in which the solution can be improved with regards to the criterion, including weaknesses, limitations, gaps, or risks. Avoid generic praise.
Each application must meet all the elements of the established criterion. A solution needs to provide detailed and comprehensive answers to all required elements and demonstrates evidence-based claims.
The application must be entirely in English — this is a strict requirement. The only exception is for cities and locations or potentially relevant information that is not in English. Use your best judgment.
5. Provide a grade between 1-5 using the grading scale that will be provided.

IMPORTANT EVALUATION INSTRUCTIONS:
1. Assess each criterion completely separately from the others
2. Do not let strengths in one criterion influence your evaluation of other criteria
"""


# Challenge description: background on the health challenge and the three focus areas
# MIT Solve lists for 2025. Shared unchanged by every prompt form in PROMPT_CONTENT_CONFIG.
BASE_CHALLENGE_DESCRIPTION = """
Challenge Description: Every person deserves to experience good health and well-being. While there has been some progress towards these goals over the last two decades, much of that progress has now slowed or reversed. Currently, half the world lacks access to comprehensive health services. Two billion people face financial hardship due to out-of-pocket healthcare costs. Inequalities are increasing worldwide, particularly in low- and middle-income countries.

Technology and innovation have an important role to play in improving health and well-being for all. New technologies can improve health outcomes and access when deployed effectively. Innovation provides the business models, decreased costs, and community-focused design necessary for lasting change and efficient scale. These changes can appear across healthcare systems and affect many areas of care including primary care, mental health, or infectious diseases.

MIT Solve seeks exceptional solutions that leverage technology to increase access to good health and healthcare with a particular interest for 2025 in solutions that address one or more of the following or more generally relate to health equity:
- Ensure health-related data is collected ethically and effectively and that AI and other insights are accurate, targeted, and actionable in the real world.
- Increase capacity and resilience of health systems, including workforce, supply chains, and other infrastructure.
- Increase access to and quality of health services for all communities.
"""

# Core criterion definitions, one per judging criterion (Potential for Impact, Feasibility,
# Innovative Approach). Each is the first segment of the matching "criteria_*_definition"
# entry in PROMPT_CONTENT_CONFIG, optionally followed by a rubric part and always followed
# by a focus suffix.
BASE_CRITERIA_IMPACT_DEFINITION_CORE = """
Criterion to assess: Potential for Impact

The planned solution implementation has the potential to impact the intended population.

Consider whether the solution has the potential to impact the intended population. Is the description of how the team expects the solution to impact the problem (the theory of change) logical? Has the team tested the assumptions underlying it? Is there evidence from other contexts that strongly suggests that the solution, in its proposed form, can have the intended impact?

Some solution teams that have fully launched their solution may provide evidence from an evaluation of their solution. In these cases, the potential for impact is likely to be high, but note that solutions that haven’t yet been launched or evaluated may also score highly on this criterion.
"""

BASE_CRITERIA_FEASIBILITY_DEFINITION_CORE = """
Criterion to assess: Feasibility

The team has a realistic, practical plan for implementing the solution, and it is feasible in the given context.

Consider whether it is feasible to implement the solution in the given context. Does the team have a realistic, practical plan for implementation that takes into account the political, economic, geographic, and cultural realities in the context? Assuming the necessary funding is acquired, do the necessary conditions exist for the team to carry out their plan? Since many solutions are early-stage, we know that plans will change, so consider whether the team has what it takes - in terms of grit, determination, and expertise - to succeed. This criterion is not about whether the team can attract the funding they need to scale, but rather about the practicalities of implementing the solution.
"""

BASE_CRITERIA_INNOVATIVE_APPROACH_DEFINITION_CORE = """
Criterion to assess: Innovative Approach

The solution includes a new technology, a new application of technology, a new business model, or a new process for solving the Challenge.

Consider whether the team is proposing a new or significantly improved approach to address the Challenge. Is the solution leveraging a new technology, a new application of an existing technology, or a new business model or process? If it leverages an existing technology or business model, does it involve novel context-specific modifications or methods? Is it sufficiently different from what competitors in the market are offering? Does the team convincingly explain how the solution could change the market or enable broader positive impacts from others in this space?

Even if a technology already exists in one environment (e.g. urban areas), we consider the expansion and appropriate adaptation of an existing technology in a new environment (e.g. rural areas) to be innovative.
"""

# Rubric parts, one per criterion: each states what should push the score lower and what
# should push it higher. In PROMPT_CONTENT_CONFIG they sit between the core definition and
# the focus suffix, and are omitted by the "no_rubric" and "without_everything" forms.
RUBRIC_CRITERIA_PART_IMPACT = """
A solution should score lower on Potential for Impact if the theory of how it could change lives does not make logical sense, or if there is existing evidence that it will not work.

A solution should score higher on Potential for Impact if the theory of how it could change the lives of the intended population makes sense and the applicant provides evidence that it is likely to have the intended impact (either from evaluations of the solution itself or from an existing body of evidence about similar interventions).
"""

RUBRIC_CRITERIA_PART_FEASIBILITY = """
A solution should score lower on Feasibility if the team does not have a realistic plan for implementation, or if the plan is unlikely to succeed (even if funding is acquired).

A solution should score higher on Feasibility if the team has a realistic plan for implementation that accounts for the political, economic, geographic, and cultural context, and the team has the necessary skills to implement that plan.
"""

RUBRIC_CRITERIA_PART_INNOVATIVE_APPROACH = """
A solution should score lower on Innovative Approach if it is an implementation of an existing approach without any context-specific modifications.

A solution should score higher on Innovative Approach if it is a truly novel approach, or a novel, context-appropriate application of an existing approach
"""

# Focus suffixes, one per criterion: each lists the application questions the model should
# concentrate on for that criterion. Every prompt form in PROMPT_CONTENT_CONFIG appends the
# matching suffix as the last segment of its criterion definition.
BASE_CRITERIA_FOCUS_SUFFIX_IMPACT = """
Focus on these aspects of the solution in your evaluation:
- What specific problem are you solving?
- What is your solution?
- Who does your solution serve, and in what ways will the solution impact their lives? OR Which Indigenous community(s) does your solution benefit? In what ways will your solution benefit this community?
- Describe in simple terms how and why you expect your solution to have an impact on the problem.
"""

BASE_CRITERIA_FOCUS_SUFFIX_FEASIBILITY = """
Focus on these aspects of the solution in your evaluation:
- What type of organization is your solution team?
- How are you and your team well-positioned to deliver this solution?
- What are your impact goals for your solution and how are you measuring your progress towards them?
- In which countries do you currently operate? OR In which parts of the US and/or Canada do you currently operate?
- Which, if any, additional countries will you be operating in within the next year? OR Which, if any, additional parts of the US or Canada will you be operating in within the next year?
- What is your business model?
"""

BASE_CRITERIA_FOCUS_SUFFIX_INNOVATIVE_APPROACH = """
Focus on these aspects of the solution in your evaluation:
- Which of the following categories best describes your solution?
- What makes your solution innovative?
- Describe the core technology that powers your solution.
"""


#Prompt Content Configuration

In [None]:
# This dictionary stores all the unique prompt components for each defined prompt form.
# Each form has its own system_message, instruction, challenge_description, criteria definitions,
# and flags to control the inclusion of dynamic examples.

def _make_prompt_form(*, harsh, rubric, dynamic_examples):
    """Assemble one prompt-form entry from the shared text components.

    Args:
        harsh: when true, HARSH_SYSTEM_MESSAGE_PART is appended to the core system message.
        rubric: when true, the per-criterion rubric text is placed between each core
            criterion definition and its focus suffix.
        dynamic_examples: value stored under "use_dynamic_few_shot_examples".

    Returns:
        dict with the keys system_message, instruction, challenge_description, the three
        criteria_*_definition entries and use_dynamic_few_shot_examples.
    """
    if harsh:
        system_message = BASE_SYSTEM_MESSAGE_CORE + HARSH_SYSTEM_MESSAGE_PART
    else:
        system_message = BASE_SYSTEM_MESSAGE_CORE

    # An empty rubric segment leaves "core + focus suffix", i.e. the rubric-free definition.
    impact_rubric = RUBRIC_CRITERIA_PART_IMPACT if rubric else ""
    feasibility_rubric = RUBRIC_CRITERIA_PART_FEASIBILITY if rubric else ""
    innovation_rubric = RUBRIC_CRITERIA_PART_INNOVATIVE_APPROACH if rubric else ""

    return {
        "system_message": system_message,
        "instruction": BASE_INSTRUCTION_CORE,
        "challenge_description": BASE_CHALLENGE_DESCRIPTION,
        "criteria_impact_definition": (
            BASE_CRITERIA_IMPACT_DEFINITION_CORE + impact_rubric + BASE_CRITERIA_FOCUS_SUFFIX_IMPACT
        ),
        "criteria_feasibility_definition": (
            BASE_CRITERIA_FEASIBILITY_DEFINITION_CORE + feasibility_rubric + BASE_CRITERIA_FOCUS_SUFFIX_FEASIBILITY
        ),
        "criteria_innovative_approach_definition": (
            BASE_CRITERIA_INNOVATIVE_APPROACH_DEFINITION_CORE
            + innovation_rubric
            + BASE_CRITERIA_FOCUS_SUFFIX_INNOVATIVE_APPROACH
        ),
        "use_dynamic_few_shot_examples": dynamic_examples,
    }


# Each form toggles three independent ingredients: the harsh system-message addition,
# the scoring rubric inside the criterion definitions, and dynamic few-shot examples.
PROMPT_CONTENT_CONFIG = {
    # Everything enabled.
    "default": _make_prompt_form(harsh=True, rubric=True, dynamic_examples=True),
    # Core system message only; rubric and examples kept.
    "no_harsh": _make_prompt_form(harsh=False, rubric=True, dynamic_examples=True),
    # Harsh tone and examples kept; rubric text removed from every criterion.
    "no_rubric": _make_prompt_form(harsh=True, rubric=False, dynamic_examples=True),
    # Harsh tone and rubric kept; dynamic examples explicitly excluded.
    "no_examples": _make_prompt_form(harsh=True, rubric=True, dynamic_examples=False),
    # No harsh tone, no rubric, no dynamic examples.
    "without_everything": _make_prompt_form(harsh=False, rubric=False, dynamic_examples=False),
}

#OpenAI API Configuration

In [None]:
# --- 4. OpenAI API Configuration ---
from google.colab import userdata

# Copy the Colab secret into the environment so configure_openai_client() can read it.
# A missing or inaccessible secret must not abort the cell here: configure_openai_client()
# already reports a missing key and skips API calls, and os.environ only accepts str
# values, so nothing is assigned unless a non-empty key was returned.
try:
    _openai_api_key_secret = userdata.get('OPENAI_API_KEY')
except Exception as e:  # e.g. the secret is not defined or notebook access was not granted
    print(f"Could not read OPENAI_API_KEY from Colab secrets: {e}")
    _openai_api_key_secret = None
if _openai_api_key_secret:
    os.environ['OPENAI_API_KEY'] = _openai_api_key_secret

# Shared client handle; stays None until configure_openai_client() succeeds.
openai_client = None
# Model name reported by configure_openai_client() when the client is initialized.
selected_openai_model_name = "gpt-5-nano"

def configure_openai_client():
    """Initializes the module-level OpenAI client from the OPENAI_API_KEY environment variable.

    On success the global ``openai_client`` is replaced with a new ``OpenAI`` instance.
    When the key is missing, the global is left untouched and a notice is printed saying
    that API calls will be skipped. Errors raised during setup are reported on stdout
    rather than propagated.

    Returns:
        None.
    """
    global openai_client
    try:
        openai_api_key = os.environ.get("OPENAI_API_KEY")
        if not openai_api_key:
            print("OPENAI_API_KEY not found in environment variables. Please set it.")
            # get_ipython only exists inside IPython/Colab; in a plain interpreter the
            # lookup raises NameError, which must not mask the missing-key notice below.
            try:
                running_in_colab = 'google.colab' in str(get_ipython())
            except NameError:
                running_in_colab = False
            if running_in_colab:
                print("In Google Colab, consider setting it as a secret: from google.colab import userdata; os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')")
            print("OpenAI API calls will be skipped due to missing API key.")
            return

        openai_client = OpenAI(api_key=openai_api_key)
        print(f"OpenAI API client initialized successfully using model: {selected_openai_model_name}.")
    except Exception as e:
        # Top-level boundary for notebook setup: report the problem instead of aborting the cell.
        print(f"An unexpected error occurred during OpenAI API configuration: {e}")
        print("Please ensure the 'openai' library is installed and your API key is correctly configured.")

# Call configuration at script start
configure_openai_client()

#Data Loading and Preprocessing

In [None]:
def load_and_preprocess_data():
    """Read proposals from health.xlsx (or built-in sample rows) and add a 'Combined' text column.

    If health.xlsx cannot be found, five hard-coded demonstration proposals are used
    instead. Administrative columns are dropped when present, the remaining columns of
    each row are joined into one "column: value" text (Solution ID first), and anything
    that looks like an HTML tag is stripped from that text.

    Note: a later cell in this file defines another function with the same name, which
    replaces this one once that cell runs.

    Returns:
        pandas.DataFrame: the retained columns plus the 'Combined' column.
    """
    try:
        raw_df = pd.read_excel("health.xlsx")
        tqdm.write("Successfully loaded health.xlsx")
    except FileNotFoundError:
        tqdm.write("health.xlsx not found. Using dummy sample data for demonstration.")
        fallback_records = {
            'Solution ID': [1, 2, 3, 4, 5],
            'Problem': ['Lack of clean water', 'Poor education access', 'Food waste', 'Mental health stigma', 'Lack of digital literacy'],
            'Solution': ['Solar water purifier for rural areas', 'Interactive online learning platform', 'AI-powered food distribution system', 'Community-based mental health support', 'Free coding bootcamps for youth'],
            'Description': [
                'Our solar water purifier uses advanced filtration to provide clean drinking water to remote villages. It is low-cost, easy to maintain, but requires initial funding for widespread deployment.',
                'An adaptive online platform that uses gamification and personalized learning paths to improve educational outcomes for underserved communities. Needs strong internet infrastructure.',
                'This system uses machine learning to predict food surplus and efficiently redistribute it to food banks, reducing waste and feeding the needy. Requires partnerships with food producers and retailers.',
                'A peer-support network combined with accessible therapy sessions to combat mental health stigma and provide immediate support. Relies heavily on volunteer commitment and community trust.',
                'Intensive bootcamps teaching essential coding skills to unemployed youth, preparing them for tech jobs in a rapidly evolving economy.',
            ],
            'Team': ['Aqua Innovators', 'EdTech Pioneers', 'ZeroWaste Solutions', 'Mindful Minds', 'CodeUp'],
            'Created': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'],
            'Updated': ['2024-06-01', '2024-06-01', '2024-06-01', '2024-06-01', '2024-06-01'],
            'User / Team': ['UserA', 'UserB', 'UserC', 'UserD', 'UserE'],
            'Name': ['Project Alpha', 'Project Beta', 'Project Gamma', 'Project Delta', 'Project Epsilon'],
            'Team Leader': ['LeaderA', 'LeaderB', 'LeaderC', 'LeaderD', 'LeaderE'],
            'Team Leader Email': ['a@example.com', 'b@example.com', 'c@example.com', 'd@example.com', 'e@example.com'],
            'Status': ['Submitted', 'Submitted', 'Submitted', 'Submitted', 'Submitted'],
            'Submitted At': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'],
            'Title': ['Clean Water for All', 'Accessible Education', 'Combatting Food Waste', 'Breaking Mental Health Stigma', 'Digital Skills for Youth'],
            'Terms Accepted': ['Yes', 'Yes', 'Yes', 'Yes', 'Yes'],
        }
        raw_df = pd.DataFrame(fallback_records)

    # Administrative columns that should not reach the combined text; only those
    # actually present in the loaded data are dropped.
    metadata_columns = (
        'Created', 'Updated', 'User / Team', 'Name',
        'Team Leader', 'Team Leader Email', 'Status', 'Submitted At', 'Title',
        'Terms Accepted',
    )
    present_metadata = [name for name in metadata_columns if name in raw_df.columns]
    df_processed = raw_df.copy().drop(columns=present_metadata)

    def combine_columns(row):
        # Solution ID leads the text; every other column follows in its original order.
        lines = []
        if "Solution ID" in row.index:
            lines.append(f"Solution ID: {row['Solution ID']}")
        lines.extend(f"{col}: {row[col]}" for col in row.index if col != "Solution ID")
        return '\n '.join(lines)

    def strip_tags(text):
        # Remove every shortest "<...>" run from the stringified text.
        return re.sub('<.*?>', '', str(text))

    df_processed['Combined'] = df_processed.apply(combine_columns, axis=1)
    df_processed["Combined"] = df_processed["Combined"].apply(strip_tags)
    return df_processed


#Chosen cases

Filter the matched case

In [None]:
# Cell purpose: find the Solution IDs present in both workbooks and show the review-round
# rows for those IDs. Errors are reported on stdout instead of stopping the notebook.
import pandas as pd

# Define the file paths
file_path_1 = "health.xlsx"
file_path_2 = "Health Review Round.xlsx"

try:
    # Load the two Excel files into pandas DataFrames
    df1 = pd.read_excel(file_path_1)
    df2 = pd.read_excel(file_path_2)

    # Assuming 'Solution ID' is the column containing the IDs in both files
    # Get the Solution IDs from each DataFrame (as sets, so duplicates collapse)
    solution_ids_1 = set(df1['Solution ID'].tolist())
    solution_ids_2 = set(df2['Solution ID'].tolist())

    # Find the common Solution IDs (intersection of the two sets).
    # The resulting list comes from a set, so its order is not sorted.
    matching_solution_ids = list(solution_ids_1.intersection(solution_ids_2))

    # Display the matching Solution IDs
    print("Matching Solution IDs found in both files:")
    print(matching_solution_ids)

    # Filter df2 (Health Review Round.xlsx) based on the matching_solution_ids;
    # .copy() makes filtered_df2 independent of df2.
    filtered_df2 = df2[df2['Solution ID'].isin(matching_solution_ids)].copy()
    print("\nFiltered rows from Health Review Round.xlsx based on matching Solution IDs:")
    # display is not imported in this file; presumably the notebook (IPython) built-in - confirm.
    display(filtered_df2)


except FileNotFoundError as e:
    print(f"Error: One of the files not found - {e}")
except KeyError as e:
    # Any KeyError raised in the block above is reported as a missing 'Solution ID' column.
    print(f"Error: 'Solution ID' column not found in one of the files - {e}")
except Exception as e:
    # Catch-all so the notebook cell reports the problem rather than raising.
    print(f"An unexpected error occurred: {e}")

In [None]:
def load_and_preprocess_data():
    """
    Load proposals from health.xlsx, keep the selected Solution IDs, and build the LLM input text.

    Reads "health.xlsx" from the working directory and keeps only the rows
    whose 'Solution ID' is in the hard-coded selection below. Requested IDs
    that are absent from the workbook are reported in the log. If the workbook
    itself is missing, a five-row dummy dataset is used instead (unfiltered).

    Administrative columns are then dropped and a 'Combined' column is added.
    It holds every remaining column as "<column>: <value>" lines, Solution ID
    first, with anything matching the pattern <...> removed.

    Returns:
        pandas.DataFrame: The processed proposals, including the 'Combined' column.

    Raises:
        KeyError: If health.xlsx loads but has no 'Solution ID' column.
    """
    # The Solution IDs selected for evaluation (43 entries).
    specific_solution_ids = [
        95105, 93186, 94339, 96129, 97666, 94606, 96399, 95356, 92944, 93842,
        95635, 94618, 94620, 93347, 95014, 97703, 93736, 93235, 93367, 93113,
        94267, 95296, 97613, 94030, 94674, 96722, 94548, 95958, 97882, 95709,
        95838, 93662, 94819, 98021, 96614, 95847, 94181, 95721, 95729, 96631,
        97530, 95227, 94972,
    ]

    try:
        # Only the read can raise FileNotFoundError, so only it sits in the try.
        df = pd.read_excel("health.xlsx")
    except FileNotFoundError:
        tqdm.write("health.xlsx not found. Using dummy sample data for demonstration.")
        # Note: the dummy data does not contain any of the selected IDs and is
        # used as-is, without filtering.
        sample_data = {
            'Solution ID': [1, 2, 3, 4, 5],
            'Problem': ['Lack of clean water', 'Poor education access', 'Food waste', 'Mental health stigma', 'Lack of digital literacy'],
            'Solution': ['Solar water purifier for rural areas', 'Interactive online learning platform', 'AI-powered food distribution system', 'Community-based mental health support', 'Free coding bootcamps for youth'],
            'Description': [
                'Our solar water purifier uses advanced filtration to provide clean drinking water to remote villages. It is low-cost, easy to maintain, but requires initial funding for widespread deployment.',
                'An adaptive online platform that uses gamification and personalized learning paths to improve educational outcomes for underserved communities. Needs strong internet infrastructure.',
                'This system uses machine learning to predict food surplus and efficiently redistribute it to food banks, reducing waste and feeding the needy. Requires partnerships with food producers and retailers.',
                'A peer-support network combined with accessible therapy sessions to combat mental health stigma and provide immediate support. Relies heavily on volunteer commitment and community trust.',
                'Intensive bootcamps teaching essential coding skills to unemployed youth, preparing them for tech jobs in a rapidly evolving economy.'
            ],
            'Team': ['Aqua Innovators', 'EdTech Pioneers', 'ZeroWaste Solutions', 'Mindful Minds', 'CodeUp'],
            'Created': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'],
            'Updated': ['2024-06-01', '2024-06-01', '2024-06-01', '2024-06-01', '2024-06-01'],
            'User / Team': ['UserA', 'UserB', 'UserC', 'UserD', 'UserE'],
            'Name': ['Project Alpha', 'Project Beta', 'Project Gamma', 'Project Delta', 'Project Epsilon'],
            'Team Leader': ['LeaderA', 'LeaderB', 'LeaderC', 'LeaderD', 'LeaderE'],
            'Team Leader Email': ['a@example.com', 'b@example.com', 'c@example.com', 'd@example.com', 'e@example.com'],
            'Status': ['Submitted', 'Submitted', 'Submitted', 'Submitted', 'Submitted'],
            'Submitted At': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'],
            'Title': ['Clean Water for All', 'Accessible Education', 'Combatting Food Waste', 'Breaking Mental Health Stigma', 'Digital Skills for Youth'],
            'Terms Accepted': ['Yes', 'Yes', 'Yes', 'Yes', 'Yes']
        }
        df_filtered = pd.DataFrame(sample_data)
    else:
        tqdm.write("Successfully loaded health.xlsx")

        # Keep only the selected proposals; .copy() detaches the result from df
        # so later column assignments do not trigger SettingWithCopyWarning.
        df_filtered = df[df['Solution ID'].isin(specific_solution_ids)].copy()

        tqdm.write(f"Filtered to include only the following Solution IDs: {specific_solution_ids}")
        tqdm.write(f"Number of proposals after filtering: {len(df_filtered)}")

        # Surface requested IDs that the workbook does not contain, so a short
        # result set is not mistaken for a complete one.
        found_ids = set(df_filtered['Solution ID'].tolist())
        missing_ids = [sol_id for sol_id in specific_solution_ids if sol_id not in found_ids]
        if missing_ids:
            tqdm.write(f"Warning: {len(missing_ids)} requested Solution ID(s) not found in health.xlsx: {missing_ids}")

    # Administrative columns that should not reach the prompt. errors='ignore'
    # skips any that the current dataset does not have.
    dropped_cols = ['Created', 'Updated', 'User / Team', 'Name',
                    'Team Leader', 'Team Leader Email', 'Status', 'Submitted At', 'Title',
                    'Terms Accepted']
    df_processed = df_filtered.drop(columns=dropped_cols, errors='ignore')

    def combine_columns(row):
        """Render one row as "<column>: <value>" lines with Solution ID first."""
        parts = []
        if "Solution ID" in row.index:
            parts.append(f"Solution ID: {row['Solution ID']}")
        # Solution ID is skipped here because it was already emitted above.
        parts.extend(f"{col}: {row[col]}" for col in row.index if col != "Solution ID")
        return '\n '.join(parts)

    # Non-greedy <...> match; '.' does not cross newlines, so only tags that
    # sit on a single line are removed.
    html_tag_pattern = re.compile(r'<.*?>')

    df_processed['Combined'] = df_processed.apply(combine_columns, axis=1)
    df_processed["Combined"] = df_processed["Combined"].apply(lambda text: html_tag_pattern.sub('', str(text)))
    return df_processed


# Prompt Construction Logic (example for score)

In [None]:
# This function is the core of dynamic prompt generation based on the selected form.

def get_prompts_for_form(prompt_form_name, example_dict=None):
    """
    Build the system prompt and the user-prompt template for one prompt form.

    The form's entry in PROMPT_CONTENT_CONFIG supplies the system message,
    instruction, challenge description and the three criterion definitions.
    The system prompt is assembled in this order: base components with
    SCORING_SCALE_TEXT, optional few-shot examples plus a reminder, the JSON
    output snippet, and a form-specific closing instruction.

    Args:
        prompt_form_name (str): Key into PROMPT_CONTENT_CONFIG.
        example_dict (dict, optional): Few-shot examples keyed by tier
            ('high_scores', 'medium_scores', 'low_scores',
            'criteria1_low_score', 'criteria2_low_score', 'criteria3_low_score').
            Each value is an iterable of (solution_id, solution_text) pairs.
            Only used when the form enables dynamic few-shot examples.

    Returns:
        tuple: (system_prompt_content, user_prompt_template_content,
        response_is_json_expected). The user template still contains a literal
        "{combined_proposal_text}" placeholder for the caller to fill in; the
        last element is always True.

    Raises:
        ValueError: If the form is missing from PROMPT_CONTENT_CONFIG (or its
            entry is empty).
    """
    config = PROMPT_CONTENT_CONFIG.get(prompt_form_name)
    if not config:
        raise ValueError(f"Prompt form '{prompt_form_name}' not found in PROMPT_CONTENT_CONFIG.")

    system_msg = config["system_message"]
    instruction_txt = config["instruction"]
    challenge_desc = config["challenge_description"]
    use_dynamic_examples = config["use_dynamic_few_shot_examples"]

    # --- System prompt: collected as pieces and joined once at the end ---
    system_parts = [f"\n{system_msg}\n{instruction_txt}\n{SCORING_SCALE_TEXT}\n{challenge_desc}\n"]

    if use_dynamic_examples and example_dict and any(example_dict.values()):
        system_parts.append(
            "\n--- Examples for Reference ---\nIMPORTANT: The following examples are for reference only. Do not directly compare the proposed solution to these examples. Instead, use them to understand the criteria and how they should be applied.\n\n"
        )

        # (example_dict key, label before the ID, label after the ID), in the
        # order the tiers appear in the prompt: overall tiers first, then the
        # per-criterion low-score tiers.
        example_tiers = (
            ('high_scores', "Example of high-scoring solution", "typically scores 5 across criteria"),
            ('medium_scores', "Example of medium-scoring solution", "typically scores 3 across criteria"),
            ('low_scores', "Example of low-scoring solution overall", "typically scores 2 across multiple criteria"),
            ('criteria1_low_score', "Example with low score on potential_of_impact", "typically scores 1-2 on this criterion"),
            ('criteria2_low_score', "Example with low score on feasibility", "typically scores 1-2 on this criterion"),
            ('criteria3_low_score', "Example with low score on innovative_approach", "typically scores 1-2 on this criterion"),
        )
        for tier_key, lead_in, typical_scores in example_tiers:
            # Missing or empty tiers contribute nothing.
            for sol_id, sol_text in example_dict.get(tier_key) or ():
                system_parts.append(f"{lead_in} (ID: {sol_id}) - {typical_scores}: \n{sol_text}\n\n")

        system_parts.append("""REMINDER: The above examples are provided to clarify the extreme ends of the 5-point scoring scale and illustrate the type of proposals that receive very high or very low scores. Specifically:
- Solutions marked as 'high_scores' are genuinely excellent proposals that meet all criteria exceptionally well. These proposals should serve as the benchmark for a score of 5.
- Solutions marked as 'low_scores' or 'criteria_low_score' are genuinely weak proposals with significant, demonstrable flaws on a specific criterion or across the board. These proposals should serve as the benchmark for scores of 1 or 2.
Your evaluation must be based solely on the merits of the proposed solution you are currently assessing. Do not assign high scores unless the solution is truly outstanding and clearly meets the high-scoring criteria demonstrated by the examples. Conversely, you MUST be prepared to assign low scores (1s and 2s) if the solution is weak, as illustrated in the examples.
You must use the full 1-5 scoring scale: 1 (Strongly Disagree), 2 (Somewhat Disagree), 3 (Neither Agree nor Disagree), 4 (Somewhat Agree), 5 (Strongly Agree). Evaluate each criterion independently and assign scores based on the solution's actual merits. Do not be generous with your scoring.
""")

    # Every form asks for the unified JSON output.
    system_parts.append(f"{JSON_OUTPUT_FORMAT_PROMPT_SNIPPET}")

    # Form-specific closing instruction; forms not listed here get none.
    closing_instruction_by_form = {
        "default": "Remember the IMPORTANT EVALUATION INSTRUCTIONS. Be critical, rigorous, and demanding in your assessments.\n",
        "no_harsh": "Remember the IMPORTANT EVALUATION INSTRUCTIONS. (Note: This version is less harsh in its tone compared to 'default' in some aspects, but still rigorous.)\n",
        "no_rubric": "You are an evaluator for startup proposals. Provide your assessment for all three criteria in the specified JSON format.\n",
        "no_examples": (
            "You are an evaluator for startup proposals. Focus solely on the provided instructions and challenge description. Do not generate examples."
            "\nProvide your assessment in the specified JSON format.\n"
        ),
        "without_everything": "Provide your assessment in the specified JSON format.",
    }
    system_parts.append(closing_instruction_by_form.get(prompt_form_name, ""))

    system_prompt_content = "".join(system_parts)

    # --- User prompt template ---
    # The criterion definitions are filled in now; the proposal text stays a
    # "{combined_proposal_text}" placeholder that the caller formats later.
    impact_def = config["criteria_impact_definition"]
    feasibility_def = config["criteria_feasibility_definition"]
    innovative_def = config["criteria_innovative_approach_definition"]
    user_prompt_template_content = (
        "\n"
        "Evaluate the following startup proposal against all three criteria (Potential for Impact, Feasibility, and Innovative Approach).\n"
        "Provide your comprehensive evaluation in the specified JSON format.\n"
        "\n"
        "Startup Proposal:\n"
        "{combined_proposal_text}\n"
        "\n"
        "---\n"
        f"{impact_def}\n"
        "---\n"
        f"{feasibility_def}\n"
        "---\n"
        f"{innovative_def}\n"
    )

    # All forms aim for unified JSON output, so the flag is always True.
    response_is_json_expected = True
    return system_prompt_content, user_prompt_template_content, response_is_json_expected

**OpenAI Evaluation API Call Function for SIMULTANEOUS evaluation**

In [None]:
# This function executes a single API call for one proposal.

def perform_openai_api_call(combined_text, prompt_form_name, example_dict=None):
    """
    Evaluate one proposal on all three criteria with a single chat-completion call.

    Prompts come from get_prompts_for_form(). The request is sent to the model
    named by `selected_openai_model_name` at `EXPERIMENT_TEMPERATURE`, asking
    for a JSON-object response.

    Retry policy: connection errors, rate limits (429), 408/409 and 5xx
    responses are retried with exponential backoff (1s, 2s, 4s, 8s), for at
    most 5 attempts. Other HTTP status errors (e.g. 400/401/403/404) cannot
    succeed on a retry and fail immediately.

    Args:
        combined_text (str): Proposal text; a "Solution ID: <digits>" fragment,
            if present, is used for log messages only.
        prompt_form_name (str): Prompt form passed to get_prompts_for_form().
        example_dict (dict, optional): Few-shot examples passed through to
            get_prompts_for_form().

    Returns:
        dict: Keys "feasibility", "potential_of_impact", "innovative_approach"
        and "raw_response_content". On success the three criterion entries are
        whatever the model returned under those keys ({} if absent). On failure
        each is a dict with "detailed_reasoning", "summary_rationale" and
        "passing_probability". The probability is None, except when a non-JSON
        reply contained a score that could be extracted by regex.
    """
    import time  # function-scope import, as in the original cell

    # Without a client there is nothing to call; return the placeholder shape.
    if openai_client is None:
        tqdm.write(f"Skipping OpenAI API call for {prompt_form_name}. API client not initialized.")
        return {
            "feasibility": {"detailed_reasoning": "API skipped.", "summary_rationale": "API skipped.", "passing_probability": None},
            "potential_of_impact": {"detailed_reasoning": "API skipped.", "summary_rationale": "API skipped.", "passing_probability": None},
            "innovative_approach": {"detailed_reasoning": "API skipped.", "summary_rationale": "API skipped.", "passing_probability": None},
            "raw_response_content": "API skipped due to no client."
        }

    # The third tuple element (JSON expected) is always True and not needed here.
    system_prompt, user_prompt_template, _ = get_prompts_for_form(prompt_form_name, example_dict=example_dict)
    # NOTE(review): str.format re-parses the whole template, so a literal brace
    # inside a criterion definition would raise here - confirm the config
    # definitions contain none.
    full_user_prompt = user_prompt_template.format(combined_proposal_text=combined_text)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": full_user_prompt}
    ]

    # Pull the Solution ID out of the proposal text for log messages only.
    solution_id_match = re.search(r'Solution ID: (\d+)', combined_text)
    solution_id_for_log = solution_id_match.group(1) if solution_id_match else "N/A"
    tqdm.write(f"Making OpenAI API call for proposal ID {solution_id_for_log} (Form: {prompt_form_name})...")

    # --- API call with retry logic ---
    max_retries = 5
    base_delay = 1  # seconds
    for attempt in range(max_retries):
        try:
            response = openai_client.chat.completions.create(
                model=selected_openai_model_name,
                messages=messages,
                temperature=EXPERIMENT_TEMPERATURE,  # experiment-controlled sampling temperature
                response_format={"type": "json_object"},  # every form expects JSON
            )

            total_tokens = response.usage.total_tokens if response.usage else 0
            tqdm.write(f"API call for {solution_id_for_log} used {total_tokens} tokens.")

            # A missing choice/message, or a None content, becomes "" so it is
            # handled by the JSON-decode fallback below instead of surfacing
            # as a generic TypeError.
            response_content = ""
            if response.choices and response.choices[0].message:
                response_content = response.choices[0].message.content or ""

            try:
                parsed_response = json.loads(response_content)
                return {
                    "feasibility": parsed_response.get('feasibility', {}),
                    "potential_of_impact": parsed_response.get('potential_of_impact', {}),
                    "innovative_approach": parsed_response.get('innovative_approach', {}),
                    "raw_response_content": response_content
                }
            except json.JSONDecodeError as json_e:
                # The model did not return valid JSON; log it and fall back to a
                # best-effort regex scan of the raw text for the three scores.
                tqdm.write(f"JSONDecodeError for proposal ID {solution_id_for_log} (Form: {prompt_form_name}). Error: {json_e}. Raw response: {response_content[:200]}...")

                impact_match = re.search(r'(?:Impact|potential of impact)(?:\s*(?:score|passing probability))?:\s*(\d(?:\.\d)?)', response_content, re.IGNORECASE)
                feasibility_match = re.search(r'(?:Feasibility|feasibility)(?:\s*(?:score|passing probability))?:\s*(\d(?:\.\d)?)', response_content, re.IGNORECASE)
                innovative_match = re.search(r'(?:Innovative Approach|innovative approach)(?:\s*(?:score|passing probability))?:\s*(\d(?:\.\d)?)', response_content, re.IGNORECASE)

                # The capture group only admits "d" or "d.d", so float() cannot fail.
                impact_score = float(impact_match.group(1)) if impact_match else None
                feasibility_score = float(feasibility_match.group(1)) if feasibility_match else None
                innovative_approach_score = float(innovative_match.group(1)) if innovative_match else None

                return {
                    "feasibility": {"detailed_reasoning": f"JSON parse error or unstructured response: {response_content}", "summary_rationale": f"JSON parse error or unstructured response: {response_content[:200]}...", "passing_probability": feasibility_score},
                    "potential_of_impact": {"detailed_reasoning": f"JSON parse error or unstructured response: {response_content}", "summary_rationale": f"JSON parse error or unstructured response. Raw: {response_content[:200]}...", "passing_probability": impact_score},
                    "innovative_approach": {"detailed_reasoning": f"JSON parse error or unstructured response: {response_content}", "summary_rationale": f"JSON parse error or unstructured response. Raw: {response_content[:200]}...", "passing_probability": innovative_approach_score},
                    "raw_response_content": response_content
                }

        except (RateLimitError, APIConnectionError, APIStatusError) as e:
            # Only transient failures are worth retrying: connection problems,
            # 408/409/429 and 5xx. Any other HTTP status (bad request, auth,
            # not found, ...) will fail the same way again.
            status_code = getattr(e, "status_code", None)
            is_retryable = (
                not isinstance(e, APIStatusError)
                or status_code is None
                or status_code in (408, 409, 429)
                or status_code >= 500
            )
            will_retry = is_retryable and attempt < max_retries - 1
            delay = base_delay * (2 ** attempt)  # exponential backoff

            if isinstance(e, RateLimitError):
                error_label, status_detail = "RateLimitError", ""
            elif isinstance(e, APIConnectionError):
                error_label, status_detail = "APIConnectionError", ""
            else:
                error_label, status_detail = "APIStatusError", f" Error: {status_code}."
            # Only announce a retry when one will actually happen.
            next_step = f"Retrying in {delay} seconds..." if will_retry else "No further retries."
            tqdm.write(f"{error_label} for proposal ID {solution_id_for_log} (Form: {prompt_form_name}) on attempt {attempt + 1}/{max_retries}.{status_detail} {next_step}")

            if will_retry:
                time.sleep(delay)
                continue

            if is_retryable:
                tqdm.write(f"Max retries reached for proposal ID {solution_id_for_log} (Form: {prompt_form_name}). Failing...")
            else:
                tqdm.write(f"Non-retryable API error for proposal ID {solution_id_for_log} (Form: {prompt_form_name}). Failing...")
            # Return a failure record (rather than raising) so the caller can
            # log it alongside the other proposals. The wording is kept the
            # same for both failure paths.
            return {
                "feasibility": {"detailed_reasoning": f"API call failed after retries: {e}", "summary_rationale": f"API error: {e}", "passing_probability": None},
                "potential_of_impact": {"detailed_reasoning": f"API error: {e}", "summary_rationale": f"API error: {e}", "passing_probability": None},
                "innovative_approach": {"detailed_reasoning": f"API error: {e}", "summary_rationale": f"API error: {e}", "passing_probability": None},
                "raw_response_content": f"API call failed after retries: {e}"
            }

        except Exception as e:
            # Anything else (including a JSON reply that is not an object) is
            # treated as non-retryable: log it and return a failure record.
            tqdm.write(f"An unexpected error occurred for proposal ID {solution_id_for_log} (Form: {prompt_form_name}). Error: {e}")
            return {
                "feasibility": {"detailed_reasoning": f"API call error: {e}", "summary_rationale": f"API call error: {e}", "passing_probability": None},
                "potential_of_impact": {"detailed_reasoning": f"API call error: {e}", "summary_rationale": f"API error: {e}", "passing_probability": None},
                "innovative_approach": {"detailed_reasoning": f"API call error: {e}", "summary_rationale": f"API error: {e}", "passing_probability": None},
                "raw_response_content": f"API call failed: {e}"
            }

**SEQUENTIAL Evaluation Functions**

In [None]:
def build_single_criterion_prompt(criterion_name, combined_text, prompt_form_name, all_criteria_definitions, example_dict=None):
    """
    Build the (system, user) messages for scoring ONE criterion of a proposal.

    The prompt mirrors the simultaneous-evaluation prompt: the same base
    components, optional few-shot examples, and all three criterion
    definitions. It then adds explicit instructions, in both the system and
    the user message, to evaluate only `criterion_name`. This keeps the
    context identical between the two methods so their scores can be compared
    when testing for a halo effect.

    Args:
        criterion_name: Criterion to evaluate ('potential_of_impact',
            'feasibility' or 'innovative_approach'). It is upper-cased, with
            underscores turned into spaces, wherever it appears in the prompt.
        combined_text: The proposal text to evaluate.
        prompt_form_name: Key into PROMPT_CONTENT_CONFIG ('default', 'no_harsh', ...).
        all_criteria_definitions: Dict holding the definitions under the keys
            'potential_of_impact', 'feasibility' and 'innovative_approach'.
        example_dict: Optional few-shot examples keyed by tier; each value is
            an iterable of (solution_id, solution_text) pairs.

    Returns:
        tuple: (system_message, user_message) for the API call.

    Raises:
        ValueError: If the form is missing from PROMPT_CONTENT_CONFIG (or its
            entry is empty).
    """
    config = PROMPT_CONTENT_CONFIG.get(prompt_form_name)
    if not config:
        raise ValueError(f"Prompt form '{prompt_form_name}' not found in PROMPT_CONTENT_CONFIG.")

    system_msg = config["system_message"]
    instruction_txt = config["instruction"]
    challenge_desc = config["challenge_description"]
    use_dynamic_examples = config["use_dynamic_few_shot_examples"]

    # --- System message: collected as pieces and joined once at the end ---
    system_parts = [f"\n{system_msg}\n{instruction_txt}\n{SCORING_SCALE_TEXT}\n{challenge_desc}\n"]

    if use_dynamic_examples and example_dict and any(example_dict.values()):
        system_parts.append(
            "\n--- Examples for Reference ---\nIMPORTANT: The following examples are for reference only. Do not directly compare the proposed solution to these examples. Instead, use them to understand the criteria and how they should be applied.\n\n"
        )

        # (example_dict key, label before the ID, label after the ID), in the
        # order the tiers appear in the prompt: overall tiers first, then the
        # per-criterion low-score tiers.
        example_tiers = (
            ('high_scores', "Example of high-scoring solution", "typically scores 5 across criteria"),
            ('medium_scores', "Example of medium-scoring solution", "typically scores 3 across criteria"),
            ('low_scores', "Example of low-scoring solution overall", "typically scores 2 across multiple criteria"),
            ('criteria1_low_score', "Example with low score on potential_of_impact", "typically scores 1-2 on this criterion"),
            ('criteria2_low_score', "Example with low score on feasibility", "typically scores 1-2 on this criterion"),
            ('criteria3_low_score', "Example with low score on innovative_approach", "typically scores 1-2 on this criterion"),
        )
        for tier_key, lead_in, typical_scores in example_tiers:
            # Missing or empty tiers contribute nothing.
            for sol_id, sol_text in example_dict.get(tier_key) or ():
                system_parts.append(f"{lead_in} (ID: {sol_id}) - {typical_scores}: \n{sol_text}\n\n")

        system_parts.append("""REMINDER: The above examples are provided to clarify the extreme ends of the 5-point scoring scale and illustrate the type of proposals that receive very high or very low scores. Specifically:
- Solutions marked as 'high_scores' are genuinely excellent proposals that meet all criteria exceptionally well. These proposals should serve as the benchmark for a score of 5.
- Solutions marked as 'low_scores' or 'criteria_low_score' are genuinely weak proposals with significant, demonstrable flaws on a specific criterion or across the board. These proposals should serve as the benchmark for scores of 1 or 2.
Your evaluation must be based solely on the merits of the proposed solution you are currently assessing. Do not assign high scores unless the solution is truly outstanding and clearly meets the high-scoring criteria demonstrated by the examples. Conversely, you MUST be prepared to assign low scores (1s and 2s) if the solution is weak, as illustrated in the examples.
You must use the full 1-5 scoring scale: 1 (Strongly Disagree), 2 (Somewhat Disagree), 3 (Neither Agree nor Disagree), 4 (Somewhat Agree), 5 (Strongly Agree). Evaluate each criterion independently and assign scores based on the solution's actual merits. Do not be generous with your scoring.
""")

    # Display form of the target criterion, e.g. "POTENTIAL OF IMPACT".
    criterion_label = criterion_name.upper().replace('_', ' ')

    # The instruction that turns the shared prompt into a single-criterion one.
    system_parts.append("\n\n### CRITICAL INSTRUCTION FOR THIS EVALUATION ###")
    system_parts.append(f"\nYou will see ALL THREE criteria below, but you must evaluate ONLY: {criterion_label}")
    system_parts.append("\nIgnore the other two criteria completely. Do not score them. Do not mention them.")
    system_parts.append(f"\nFocus exclusively on {criterion_label}, but be aware of the full context.\n")

    # JSON output format for a single criterion.
    system_parts.append(f"{JSON_OUTPUT_FORMAT_SEQUENTIAL_SINGLE}")

    # Form-specific closing instruction; forms not listed here get none.
    closing_instruction_by_form = {
        "default": "Remember the IMPORTANT EVALUATION INSTRUCTIONS. Be critical, rigorous, and demanding in your assessments.\n",
        "no_harsh": "Remember the IMPORTANT EVALUATION INSTRUCTIONS. (Note: This version is less harsh in its tone compared to 'default' in some aspects, but still rigorous.)\n",
        "no_rubric": "You are an evaluator for startup proposals. Provide your assessment in the specified JSON format.\n",
        "no_examples": (
            "You are an evaluator for startup proposals. Focus solely on the provided instructions and challenge description. Do not generate examples."
            "\nProvide your assessment in the specified JSON format.\n"
        ),
        "without_everything": "Provide your assessment in the specified JSON format.",
    }
    system_parts.append(closing_instruction_by_form.get(prompt_form_name, ""))

    system_message = "".join(system_parts)

    # --- User message: same layout as the simultaneous prompt, followed by
    # the single-criterion task statement ---
    impact_def = all_criteria_definitions['potential_of_impact']
    feasibility_def = all_criteria_definitions['feasibility']
    innovative_def = all_criteria_definitions['innovative_approach']
    user_parts = [
        "\n"
        "Evaluate the following startup proposal against all three criteria (Potential for Impact, Feasibility, and Innovative Approach).\n"
        "Provide your comprehensive evaluation in the specified JSON format.\n"
        "\n"
        "Startup Proposal:\n"
        f"{combined_text}\n"
        "\n"
        "---\n"
        f"{impact_def}\n"
        "---\n"
        f"{feasibility_def}\n"
        "---\n"
        f"{innovative_def}\n",
        f"\n### YOUR TASK: EVALUATE ONLY {criterion_label} ###\n\n",
        "While you have seen all three criteria above for context, you must ONLY evaluate and provide a score for:\n",
        f"**{criterion_label}**\n\n",
        "Do NOT provide scores or detailed reasoning for the other two criteria.\n\n",
    ]
    user_message = "".join(user_parts)

    return system_message, user_message

def perform_sequential_api_call_single_criterion(combined_text, criterion_name, prompt_form_name, all_criteria_definitions, example_dict=None):
    """
    Make one API call that evaluates a SINGLE criterion for one proposal.

    The prompt is produced by ``build_single_criterion_prompt`` from all three
    criteria definitions (and the optional few-shot examples); only the score
    for ``criterion_name`` is requested.

    Args:
        combined_text: Solution proposal text. A ``Solution ID: <digits>`` marker,
            if present, is used for log messages only.
        criterion_name: Which criterion to evaluate ('potential_of_impact',
            'feasibility', 'innovative_approach').
        prompt_form_name: Prompt form to use.
        all_criteria_definitions: Dict with all 3 criteria definitions.
        example_dict: Optional dict with few-shot examples.

    Returns:
        dict: Result for this single criterion with structure
              {"detailed_reasoning": ..., "summary_rationale": ..., "passing_probability": ...}.
              On any failure (client missing, unparsable response, API error) the
              same keys are returned with ``passing_probability`` set to None and
              the error text in the two rationale fields. This function does not
              raise for API or parsing problems.
    """
    import time  # local import: the block does not rely on a file-level `import time`

    def _error_result(detailed, summary):
        # Uniform failure payload so callers can always read the same three keys.
        return {"detailed_reasoning": detailed, "summary_rationale": summary, "passing_probability": None}

    if openai_client is None:
        tqdm.write(f"Skipping API call for {criterion_name}. API client not initialized.")
        return _error_result("API skipped.", "API skipped.")

    # Build the prompt (full context, single criterion requested)
    system_message, user_message = build_single_criterion_prompt(
        criterion_name, combined_text, prompt_form_name, all_criteria_definitions, example_dict
    )

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message}
    ]

    # Extract Solution ID for logging
    solution_id_match = re.search(r'Solution ID: (\d+)', combined_text)
    solution_id_for_log = solution_id_match.group(1) if solution_id_match else "N/A"

    # Log with info about examples
    examples_info = " (with examples)" if example_dict and any(example_dict.values()) else " (no examples)"
    tqdm.write(f"  └─ Evaluating {criterion_name} for proposal ID {solution_id_for_log}{examples_info} (using full prompt, requesting only 1 criterion)...")

    # API call with exponential backoff (1s, 2s, 4s, 8s between the 5 attempts)
    max_retries = 5
    base_delay = 1

    for attempt in range(max_retries):
        try:
            completion_args = {
                "model": selected_openai_model_name,
                "messages": messages,
                "temperature": EXPERIMENT_TEMPERATURE,  # Use configured temperature from global variable
                "response_format": {"type": "json_object"},
            }

            response = openai_client.chat.completions.create(**completion_args)
            total_tokens = response.usage.total_tokens if response.usage else 0
            tqdm.write(f"    └─ {criterion_name} evaluation used {total_tokens} tokens")

            first_message = response.choices[0].message if response.choices else None
            # `content` can be None; coalesce to "" so it is reported as a JSON
            # parse problem below instead of a TypeError labelled "API error".
            response_content = (first_message.content if first_message else "") or ""

            try:
                parsed_response = json.loads(response_content)
            except json.JSONDecodeError as json_e:
                tqdm.write(f"JSONDecodeError for {criterion_name} in proposal ID {solution_id_for_log}. Error: {json_e}")
                return _error_result(
                    f"JSON parse error: {response_content}",
                    f"JSON parse error: {response_content[:200]}..."
                )

            if not isinstance(parsed_response, dict):
                # Callers use .get() on the result, so anything but an object is unusable.
                tqdm.write(f"Non-object JSON for {criterion_name} in proposal ID {solution_id_for_log}.")
                return _error_result(
                    f"JSON parse error: {response_content}",
                    f"JSON parse error: {response_content[:200]}..."
                )

            # If the model nested the answer under the criterion key (the layout used
            # when all criteria are returned together), unwrap it so the documented
            # flat structure is returned. Only done when the flat key is absent.
            nested_result = parsed_response.get(criterion_name)
            if "passing_probability" not in parsed_response and isinstance(nested_result, dict):
                return nested_result
            return parsed_response

        except (RateLimitError, APIConnectionError, APIStatusError) as e:
            # Connection errors carry no status code; 408/409/429 and 5xx are transient.
            # Other 4xx responses (bad request, auth, not found) will not succeed on retry.
            status_code = getattr(e, "status_code", None)
            retryable = (
                not isinstance(status_code, int)
                or status_code in (408, 409, 429)
                or status_code >= 500
            )
            if retryable and attempt < max_retries - 1:
                wait_time = base_delay * (2**attempt)
                tqdm.write(f"API error for {criterion_name}, retrying in {wait_time}s...")
                time.sleep(wait_time)
                continue

            if retryable:
                tqdm.write(f"Max retries reached for {criterion_name}")
            else:
                tqdm.write(f"Non-retryable API error (status {status_code}) for {criterion_name}; not retrying.")
            return _error_result(f"API error after retries: {e}", f"API error: {e}")

        except Exception as e:
            # Best-effort boundary: one failed criterion must not abort the whole run.
            tqdm.write(f"Unexpected error for {criterion_name}: {e}")
            return _error_result(f"API error: {e}", f"API error: {e}")

def perform_sequential_evaluation(combined_text, prompt_form_name, example_dict=None):
    """
    Evaluate one proposal with three single-criterion API calls.

    Every call receives the same prompt inputs (all three criteria definitions
    plus the optional few-shot examples) and differs only in which criterion it
    asks the model to score. The three calls are submitted to a 3-worker thread
    pool, so they run concurrently rather than one after another.

    Experimental rationale (halo effect):
    - Simultaneous: the model scores all criteria in one response.
    - Sequential: the model sees the same information but scores one criterion
      per response.

    Args:
        combined_text: Solution proposal text
        prompt_form_name: Prompt form to use
        example_dict: Optional dict with few-shot examples

    Returns:
        dict: One entry per criterion ('potential_of_impact', 'feasibility',
              'innovative_approach') plus 'raw_response_content', a string that
              joins the per-criterion results in that fixed order.
    """
    id_match = re.search(r'Solution ID: (\d+)', combined_text)
    solution_id_for_log = "N/A" if not id_match else id_match.group(1)

    # Log info about examples
    if example_dict and any(example_dict.values()):
        examples_info = " (with examples)"
    else:
        examples_info = " (no examples)"
    tqdm.write(f"Making SEQUENTIAL evaluation for proposal ID {solution_id_for_log}{examples_info} (3 API calls, SAME prompt each time, different criterion requested)...")

    # (criterion name, config key, fallback definition) - listed in evaluation order
    form_config = PROMPT_CONTENT_CONFIG.get(prompt_form_name, {})
    definition_sources = (
        ('potential_of_impact', 'criteria_impact_definition', BASE_CRITERIA_IMPACT_DEFINITION_CORE),
        ('feasibility', 'criteria_feasibility_definition', BASE_CRITERIA_FEASIBILITY_DEFINITION_CORE),
        ('innovative_approach', 'criteria_innovative_approach_definition', BASE_CRITERIA_INNOVATIVE_APPROACH_DEFINITION_CORE),
    )
    all_criteria_definitions = {
        name: form_config.get(config_key, fallback)
        for name, config_key, fallback in definition_sources
    }
    criteria_order = [name for name, _, _ in definition_sources]

    results = {}
    raw_by_criterion = {}  # keyed by criterion so the final join can use a fixed order

    tqdm.write(f"  → Requesting ALL 3 criteria in parallel: {', '.join(criteria_order)}")

    with ThreadPoolExecutor(max_workers=3) as inner_executor:
        # Fan out: one task per criterion
        pending = {}
        for name in criteria_order:
            task = inner_executor.submit(
                perform_sequential_api_call_single_criterion,
                combined_text,
                name,
                prompt_form_name,
                all_criteria_definitions,
                example_dict
            )
            pending[task] = name

        # Fan in: handle each task as soon as it finishes
        for finished in as_completed(pending):
            name = pending[finished]
            try:
                outcome = finished.result()
                results[name] = outcome
                raw_by_criterion[name] = f"{name}: {json.dumps(outcome)}"
            except Exception as exc:
                tqdm.write(f"  ✗ {name} generated an exception: {exc}")
                results[name] = {
                    "detailed_reasoning": f"API error: {exc}",
                    "summary_rationale": f"API error: {exc}",
                    "passing_probability": None
                }
                raw_by_criterion[name] = f"{name}: ERROR - {exc}"

    # Completion order is arbitrary, so rebuild the raw log in criteria order
    results['raw_response_content'] = "\n\n".join(
        raw_by_criterion.get(name, "") for name in criteria_order
    )

    tqdm.write(f"  Completed sequential evaluation for proposal ID {solution_id_for_log}")
    tqdm.write(f"  Note: Each call used the SAME full prompt (all 3 criteria + examples), but requested only 1 criterion score")
    return results



In [None]:
    # NOTE(review): this cell has no enclosing `def` - it is indented, ends in
    # `return results`, and uses names (combined_text, prompt_form_name,
    # all_criteria_definitions, example_dict, solution_id_for_log) that are only
    # defined inside perform_sequential_evaluation. It appears to be a leftover
    # copy of that function's earlier one-call-at-a-time loop; confirm whether it
    # should be removed, since it cannot run as a standalone cell.

    # Evaluate each criterion separately, but with full context AND examples
    results = {}
    raw_responses = []

    # Order of evaluation - evaluate all 3 criteria
    criteria_order = ['potential_of_impact', 'feasibility', 'innovative_approach']

    # One blocking call per criterion, in the order listed above
    for criterion_name in criteria_order:
        tqdm.write(f"  → Requesting evaluation of: {criterion_name} (with full prompt context + examples)")
        result = perform_sequential_api_call_single_criterion(
            combined_text,
            criterion_name,
            prompt_form_name,
            all_criteria_definitions,  # Pass ALL criteria definitions, not just one
            example_dict  # Pass examples - SAME as simultaneous
        )
        results[criterion_name] = result
        raw_responses.append(f"{criterion_name}: {json.dumps(result)}")

    # Combine results in the same format as simultaneous evaluation
    results['raw_response_content'] = "\n\n".join(raw_responses)

    tqdm.write(f"  Completed sequential evaluation for proposal ID {solution_id_for_log}")
    tqdm.write(f"  Note: Each call used the SAME full prompt (all 3 criteria + examples), but requested only 1 criterion score")
    return results

# Evaluation Orchestration for a Single Prompt Form

In [None]:
def evaluate_proposals_for_form(df_processed_data, form_name, example_solution_ids=None):
    """
    Orchestrates the evaluation process for a single specified prompt form across all proposals.
    Submits one task per proposal to a thread pool and populates a DataFrame with the results.

    Two evaluation methods are supported (selected by EXPERIMENT_EVALUATION_METHOD):
    - 'simultaneous': one call to perform_openai_api_call per proposal
    - any other value (i.e. 'sequential'): one call to perform_sequential_evaluation per proposal

    Args:
        df_processed_data: DataFrame with at least the 'Combined' column (proposal text).
            A 'Solution ID' column is required when dynamic examples are used.
        form_name: Key into PROMPT_CONTENT_CONFIG for the prompt form to run.
        example_solution_ids: Optional dict mapping an example type to a list of
            Solution IDs. Only used when the form's config enables
            "use_dynamic_few_shot_examples"; IDs not found in the data are
            reported and skipped.

    Returns:
        DataFrame: a copy of ``df_processed_data`` with, for each criterion,
        '<criterion>_detailed_reasoning', '<criterion>_summary_rationale' and
        '<criterion>_passing_probability' columns, plus
        'unified_raw_response_content', 'overall_holistic_rationale' and an
        'overall_passing_probability' column initialised to None.
    """
    criteria_names = ('feasibility', 'potential_of_impact', 'innovative_approach')
    result_fields = ('detailed_reasoning', 'summary_rationale', 'passing_probability')

    def _summary_or_na(value):
        # The summary columns always exist, so a missing summary shows up as
        # None/NaN rather than as an absent key; map those to 'N/A'.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return 'N/A'
        return value

    output_df = df_processed_data.copy()

    # Display which evaluation method is being used
    eval_method = EXPERIMENT_EVALUATION_METHOD
    tqdm.write(f"\n--- Starting Evaluation for Prompt Form: '{form_name}' ---")
    tqdm.write(f"--- Evaluation Method: {eval_method.upper()} ({EVALUATION_METHODS[eval_method]}) ---")
    tqdm.write(f"--- Temperature: {EXPERIMENT_TEMPERATURE} ---\n")

    # Outer parallelism (proposals in flight at once). Each sequential task itself
    # fans out to 3 concurrent API calls inside perform_sequential_evaluation, so
    # 3 outer workers can mean up to 9 requests in flight; simultaneous tasks make
    # a single call each and therefore get more outer workers.
    if eval_method == 'sequential':
        max_workers = 3
        tqdm.write(f"Note: Using max_workers={max_workers} for sequential evaluation (3 PARALLEL API calls per proposal)")
    else:
        max_workers = 5

    # Prepare dynamic example texts if IDs are provided AND the form is configured to use them
    current_form_config = PROMPT_CONTENT_CONFIG.get(form_name)
    dynamic_example_dict_for_prompt = {}
    if current_form_config and current_form_config.get("use_dynamic_few_shot_examples") and example_solution_ids:
        for example_type, solution_ids in example_solution_ids.items():
            dynamic_example_dict_for_prompt[example_type] = []
            for sol_id in solution_ids:
                if sol_id in df_processed_data["Solution ID"].values:
                    # Retrieve the full combined text for the example solution
                    solution_text = df_processed_data[df_processed_data["Solution ID"] == sol_id].iloc[0]["Combined"]
                    dynamic_example_dict_for_prompt[example_type].append((sol_id, solution_text))
                else:
                    tqdm.write(f"Warning: Example Solution ID {sol_id} not found in the dataframe for '{example_type}'.")

    # The same evaluator is used for every row, so pick it once
    evaluator = perform_openai_api_call if eval_method == 'simultaneous' else perform_sequential_evaluation
    progress_bar_format = "{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"

    futures_list = []
    results_from_api_calls = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for index, row in tqdm(output_df.iterrows(), total=len(output_df), desc=f"Submitting tasks for {form_name}", unit="tasks", unit_scale=True, ncols=100, ascii=True, bar_format=progress_bar_format):
            combined_text = row['Combined']
            future_evaluation = executor.submit(
                evaluator,
                combined_text,
                form_name,
                example_dict=dynamic_example_dict_for_prompt  # same examples for both methods
            )
            futures_list.append((index, future_evaluation, combined_text))

        # Collect in submission order so results line up with the input rows
        for index, future_evaluation, combined_text in tqdm(futures_list, desc=f"Collecting results for {form_name}", unit="results", unit_scale=True, ncols=100, ascii=True, bar_format=progress_bar_format):
            row_results = {
                'original_index': index,
                'Combined': combined_text
            }

            try:
                unified_result = future_evaluation.result()
                for criterion in criteria_names:
                    criterion_result = unified_result.get(criterion, {})
                    # A non-dict value (e.g. JSON null or a bare string) would break the
                    # .get() lookups below and abort the whole run; treat it as "no data".
                    row_results[criterion] = criterion_result if isinstance(criterion_result, dict) else {}
                row_results['raw_response_content_unified'] = unified_result.get('raw_response_content')
            except Exception as exc:
                tqdm.write(f"Task for row {index} generated an exception: {exc}")
                for criterion in criteria_names:
                    row_results[criterion] = {
                        "detailed_reasoning": f"API error: {exc}",
                        "summary_rationale": f"API error: {exc}",
                        "passing_probability": None
                    }
                row_results['raw_response_content_unified'] = f"API call failed: {exc}"

            results_from_api_calls.append(row_results)

    tqdm.write(f"API calls complete for '{form_name}'. Populating DataFrame...")

    # Initialize all columns that will be populated to avoid KeyError
    for criterion in criteria_names:
        for field in result_fields:
            output_df[f'{criterion}_{field}'] = None
    output_df['unified_raw_response_content'] = None

    for r in results_from_api_calls:
        idx = r['original_index']
        for criterion in criteria_names:
            criterion_data = r.get(criterion, {})
            for field in result_fields:
                output_df.loc[idx, f'{criterion}_{field}'] = criterion_data.get(field)

        # Raw response content (always unified now)
        output_df.loc[idx, 'unified_raw_response_content'] = r.get('raw_response_content_unified')

    # Local overall assessment (synthesis from individual summaries).
    # Built from a plain list so that an empty input frame yields an empty column.
    output_df['overall_holistic_rationale'] = [
        f"Feasibility Summary: {_summary_or_na(feasibility_summary)}\n"
        f"Impact Summary: {_summary_or_na(impact_summary)}\n"
        f"Innovative Approach Summary: {_summary_or_na(innovation_summary)}"
        for feasibility_summary, impact_summary, innovation_summary in zip(
            output_df['feasibility_summary_rationale'],
            output_df['potential_of_impact_summary_rationale'],
            output_df['innovative_approach_summary_rationale'],
        )
    ]
    # Placeholder only; per the original note it is meant to be overwritten by a weighted score.
    output_df['overall_passing_probability'] = None

    return output_df


In [None]:
    # NOTE(review): this cell is indented but has no enclosing `def`, and
    # `eval_method` is only assigned inside evaluate_proposals_for_form. It looks
    # like an earlier version of that function's worker selection (2 workers
    # here, 3 there) - confirm whether it is stale and can be removed.
    if eval_method == 'sequential':
        max_workers = 2  # Reduced for sequential (3 calls per proposal)
        tqdm.write(f"Note: Using max_workers={max_workers} for sequential evaluation (3 API calls per proposal)")
    else:
        max_workers = 5  # Original value for simultaneous (1 call per proposal)


# Calculations and Plotting

In [None]:
def convert_scores_to_numeric(df_input):
    """
    Return a copy of ``df_input`` with numeric versions of the score columns.

    For each criterion, '<criterion>_passing_probability' is converted with
    ``pd.to_numeric(..., errors='coerce')`` and stored in
    '<criterion>_passing_probability_numeric'; values that cannot be parsed
    become NaN. The input DataFrame is left untouched.
    """
    converted = df_input.copy()

    for criterion in ('feasibility', 'potential_of_impact', 'innovative_approach'):
        source_column = f'{criterion}_passing_probability'
        converted[f'{source_column}_numeric'] = pd.to_numeric(converted[source_column], errors='coerce')

    return converted

def generate_heatmaps_logic(df_input, form_name):
    """
    Generate, save and display a heatmap of proposal scores across criteria.

    Rows are proposals (labelled by 'Solution ID' when that column exists) and
    columns are whichever of the three '*_passing_probability_numeric' columns
    are present in ``df_input``. When 'Solution ID' is available, rows with a
    missing score in any plotted column are dropped first.

    Only this one heatmap is produced; no correlation-matrix plot is generated here.

    Args:
        df_input: DataFrame with the numeric score columns (and ideally 'Solution ID').
        form_name: Label used in the plot title, log messages and output file name
            ('proposal_criteria_scores_heatmap_<form_name>.png').

    Returns:
        None. If there is nothing to plot, a message is logged and no file is written.
    """
    df = df_input.copy()  # Work on a copy

    tqdm.write(f"\nGenerating Proposals vs. Criteria Scores Heatmap for '{form_name}'...")

    # Source column -> axis label. Renaming by name (instead of assigning a fixed
    # list of three labels) keeps working when a score column is absent.
    score_column_labels = {
        'feasibility_passing_probability_numeric': 'Feasibility',
        'potential_of_impact_passing_probability_numeric': 'Impact',
        'innovative_approach_passing_probability_numeric': 'Innovative Approach',
    }
    score_columns = [col for col in score_column_labels if col in df.columns]
    has_solution_id = 'Solution ID' in df.columns

    selected_columns = (['Solution ID'] if has_solution_id else []) + score_columns
    plot_scores_df = df[selected_columns].copy()

    # Nothing to draw without rows or without at least one score column
    if plot_scores_df.empty or not score_columns:
        tqdm.write(f"Not enough valid numeric scores or Solution IDs to generate Proposal Scores Heatmap for '{form_name}'.")
        return

    if has_solution_id:
        # Drop incomplete rows *before* indexing so every plotted row has valid scores
        plot_scores_df = plot_scores_df.dropna(subset=score_columns)

        # The frame may have become empty after cleaning
        if plot_scores_df.empty:
            tqdm.write(f"Not enough valid numeric scores after cleaning to generate Proposal Scores Heatmap for '{form_name}'. Skipping heatmap.")
            return

        # String IDs become the Y-axis tick labels
        plot_scores_df['Solution ID'] = plot_scores_df['Solution ID'].astype(str)
        plot_scores_df = plot_scores_df.set_index('Solution ID')

    # Human-readable column names for the X axis
    plot_scores_df = plot_scores_df.rename(columns=score_column_labels)

    plt.figure(figsize=(12, max(6, len(plot_scores_df) * 0.4)))  # Dynamic height, with a minimum of 6 inches
    sns.heatmap(plot_scores_df, annot=True, cmap='viridis', fmt=".0f", linewidths=.5, vmin=1, vmax=5)  # vmin/vmax for 5-point scale
    plt.title(f'Proposal Scores Across Criteria and Total Score (Form: {form_name})')
    plt.xlabel('Evaluation Criteria')
    plt.ylabel('Proposal Solution ID')
    plt.tight_layout()  # Adjust plot to prevent labels overlapping
    # Save before show so the file is written from the live figure
    plt.savefig(f'proposal_criteria_scores_heatmap_{form_name}.png')
    plt.show()  # Display plot (for interactive environments like Colab)
    tqdm.write(f"Proposal scores heatmap for '{form_name}' saved as 'proposal_criteria_scores_heatmap_{form_name}.png'.")



# Main Execution

In [None]:
# This section orchestrates the entire evaluation workflow.

# Load and preprocess data once at the top level.
# The evaluation cell further down passes a copy of this DataFrame to
# evaluate_proposals_for_form, so this object itself is not modified there.
df_processed_data = load_and_preprocess_data()

# Example usage with dynamic few-shot examples for scoring

In [None]:
# Dynamic few-shot examples for the 5-point scoring scale.
# Each entry maps an example type to a list of Solution IDs; the IDs must exist
# in the loaded health.xlsx or dummy data.
# Criteria numbering: criteria1 = Potential for Impact, criteria2 = Feasibility,
# criteria3 = Innovative Approach.
CUSTOM_FEW_SHOT_EXAMPLES = dict(
    # High scores (5) across all criteria
    high_scores=[97613],
    # Medium scores (3) across criteria
    medium_scores=[97530],
    # Overall low scores (2) across multiple criteria
    low_scores=[97882],
    # Low score (1-2) on Potential for Impact
    criteria1_low_score=[96399],
    # Low score (1-2) on Feasibility
    criteria2_low_score=[95014],
    # Low score (1-2) on Innovative Approach
    criteria3_low_score=[95356],
)

In [None]:
# --- USER SELECTION: CHOOSE YOUR PROMPT FORM HERE ---
# Select the single prompt form to execute for this run.
# "default", "no_harsh", "no_rubric", "no_examples", "without_everything"
selected_prompt_form_name = "without_everything" # <--- CHANGE THIS VALUE TO SELECT A SINGLE PROMPT FORM FOR EXECUTION ### USER TO CUSTOMIZE ###

print(f"\n===== Starting Full Evaluation for Selected Prompt Form: '{selected_prompt_form_name}' =====")

# Pass the dynamic examples only when the selected form's configuration asks for them
current_form_config = PROMPT_CONTENT_CONFIG.get(selected_prompt_form_name)
examples_to_pass = None
if current_form_config and current_form_config.get("use_dynamic_few_shot_examples"):
    examples_to_pass = CUSTOM_FEW_SHOT_EXAMPLES


# 1. Execute the core evaluation process for the selected prompt form
# This function handles API calls, results collection, and initial DataFrame population.
final_df = evaluate_proposals_for_form(df_processed_data.copy(), selected_prompt_form_name, example_solution_ids=examples_to_pass)

# 2. Perform post-processing: convert scores to numeric format
final_df = convert_scores_to_numeric(final_df)

# 3. Display a snapshot of the final evaluation results
print(f"\n--- Final Evaluation Results for '{selected_prompt_form_name}' (First 3 rows for key columns) ---")

# Raw response column is always unified now
raw_response_cols_to_display = ['unified_raw_response_content']

display_columns = ['Solution ID', 'Combined',
                   'potential_of_impact_passing_probability',
                   'feasibility_passing_probability',
                   'innovative_approach_passing_probability',
                   'overall_holistic_rationale'] + raw_response_cols_to_display

print(final_df[display_columns].head(3))

# 4. Save the evaluation results to CSV file (not Excel) with experiment configuration in filename

# The experiment configuration is read once and reused for every filename below,
# so the names written and the names offered for download cannot drift apart.
config = get_experiment_config()
experiment_suffix = f"{selected_prompt_form_name}_temp{config['temperature']}_{config['evaluation_method']}"
output_csv = f"startup_evaluations_results_{experiment_suffix}.csv"

# Add experiment configuration columns to DataFrame for reference
final_df['experiment_temperature'] = config['temperature']
final_df['experiment_evaluation_method'] = config['evaluation_method']

final_df.to_csv(output_csv, index=False)
print(f"\nResults for '{selected_prompt_form_name}' saved to {output_csv}")
print(f"  Temperature: {config['temperature']}")
print(f"  Method: {config['evaluation_method']}")

# 5. Generate and save visualization plots (heatmaps) with experiment config in filename
# generate_heatmaps_logic uses its second argument as the file-name suffix.
heatmap_filename_suffix = experiment_suffix
generate_heatmaps_logic(final_df, heatmap_filename_suffix)

print(f"\nEvaluation for '{selected_prompt_form_name}' complete. Review generated files and plots.")

# --- Optional: Download all generated files in Colab environment ---
# get_ipython only exists inside IPython/Jupyter; when this code runs as a plain
# Python script the lookup raises NameError, which simply means "not in Colab".
try:
    running_in_colab = 'google.colab' in str(get_ipython())
except NameError:
    running_in_colab = False

if running_in_colab:
    print("\nInitiating downloads for generated files...")

    files_to_download = [
        output_csv,
        f'proposal_criteria_scores_heatmap_{heatmap_filename_suffix}.png',
    ]

    # The heatmap may have been skipped, so only offer files that were actually written
    for file_path in files_to_download:
        if os.path.exists(file_path):
            files.download(file_path)
    print("Downloads initiated. Check your downloads folder.")