In [5]:
import pandas as pd
from openai import OpenAI
import time

def _has_content(value) -> bool:
    """Return True when *value* carries usable content rather than a missing marker."""
    # Rows read through pandas represent absent fields as None/NaN/NA. NaN is
    # truthy and has no ``.strip()``, so a plain ``if value:`` check is not enough.
    if pd.api.types.is_scalar(value):
        if pd.isna(value):
            return False
        if isinstance(value, str):
            return bool(value.strip())
        return True
    # Non-scalar values (e.g. a list of options): non-empty means present.
    try:
        return len(value) > 0
    except TypeError:
        return True


def evaluate_solution(question: str, correct_solution: str, test_solution: str, 
                     correct_options: str, client: "OpenAI") -> str:
    """
    Ask the ``gpt-4o`` chat model to grade a test solution against a reference.

    The reference is ``correct_solution`` when it has content; otherwise
    ``correct_options`` is used. Missing values (``None``, NaN, empty or
    whitespace-only strings, empty containers) count as "not provided".

    Args:
        question: The question text inserted into the prompt.
        correct_solution: Reference solution; may be missing.
        test_solution: Candidate solution to grade; may be missing.
        correct_options: Reference answer options, used only when
            ``correct_solution`` is missing.
        client: Object exposing ``chat.completions.create`` (an OpenAI client).

    Returns:
        The model's stripped reply (the prompt asks for 'Correct' or
        'Incorrect'), ``"No solution provided"`` when ``test_solution`` is
        missing, ``"No reference solution or options available"`` when both
        references are missing, or ``"Error: <message>"`` when the API call
        raises.
    """
    if not _has_content(test_solution):
        return "No solution provided"

    if _has_content(correct_solution):
        prompt = f"""
        Question: {question}
        
        Correct Solution: {correct_solution}
        
        Test Solution: {test_solution}
        
        Please evaluate if the test solution is correct compared to the correct solution.
        Consider mathematical accuracy, approach, and final answer. 
        The text is in LaTeX format, so interpret and understand the LaTeX structure while performing the evaluation. 
        Respond with either 'Correct' or 'Incorrect'.
        """
    elif _has_content(correct_options):
        prompt = f"""
        Question: {question}
        
        Correct Options: {correct_options}
        
        Test Solution: {test_solution}
        
        Please evaluate if the test solution leads to the correct answer among the given options.
        The text is in LaTeX format, so interpret and understand the LaTeX structure while performing the evaluation.
        Respond with either 'Correct' or 'Incorrect'.
        """
    else:
        return "No reference solution or options available"

    # Best-effort by design: a failed call is recorded as an "Error: ..." string
    # so one bad request does not abort a long batch run.
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"API Error: {str(e)}")
        return f"Error: {str(e)}"

def process_evaluations(df: pd.DataFrame, client: OpenAI) -> pd.DataFrame:
    """
    Grade the zero-shot and few-shot solutions of every row and record the verdicts.

    Adds (or resets to ``''``) the columns ``zero_shot_evaluation`` and
    ``few_shot_evaluation`` on ``df`` in place, then fills them row by row with
    the result of ``evaluate_solution``. A verdict column stays ``''`` when its
    source column (``zero_shot_gpt_solution`` / ``few_shot_gpt_solution``) is
    absent from ``df``.

    Args:
        df: Frame with a ``transformed_question`` column and, optionally,
            ``transformed_solution`` and ``transformed_options`` reference columns.
        client: OpenAI client forwarded to ``evaluate_solution``.

    Returns:
        The same ``df`` object, with the evaluation columns populated.
    """
    evaluation_targets = (
        ('zero_shot_gpt_solution', 'zero_shot_evaluation'),
        ('few_shot_gpt_solution', 'few_shot_evaluation'),
    )
    for _, evaluation_column in evaluation_targets:
        df[evaluation_column] = ''

    # Column presence cannot change while iterating, so resolve it once and keep
    # the positional location of each verdict column for the writes below.
    active_targets = [
        (solution_column, df.columns.get_loc(evaluation_column))
        for solution_column, evaluation_column in evaluation_targets
        if solution_column in df.columns
    ]

    # Track the row position instead of doing arithmetic on the index label:
    # labels may be non-integer or duplicated, positions are always 0..n-1.
    for position, (_, row) in enumerate(df.iterrows()):
        print(f"Processing row {position + 1}")

        question = row['transformed_question']
        # Either reference may be absent; evaluate_solution falls back from the
        # solution to the options and reports when neither is available.
        correct_solution = row.get('transformed_solution')
        correct_options = row.get('transformed_options')

        # Written per row so partial results survive an interrupted run.
        for solution_column, evaluation_position in active_targets:
            df.iat[position, evaluation_position] = evaluate_solution(
                question,
                correct_solution,
                row[solution_column],
                correct_options,
                client
            )

    return df

# Main execution
def main(
    input_path: str = r'C:\Users\karth\OneDrive\Desktop\gpt_solutions\mathematics_data_unique.json',
    output_path: str = 'evaluation_results.csv',
):
    """
    Load the question set, grade every row, and write the results to CSV.

    The OpenAI API key is read from the ``OPENAI_API_KEY`` environment variable
    and must never be embedded in source. NOTE(review): a key was previously
    committed in this function; treat it as compromised and revoke it.

    Args:
        input_path: JSON file to load with ``pandas.read_json``.
        output_path: Destination CSV file for the graded rows.

    Returns:
        The DataFrame returned by ``process_evaluations``.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is unset or empty.
    """
    import os

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY environment variable is not set; "
            "export it before running the evaluation."
        )
    client = OpenAI(api_key=api_key)

    # Read JSON file into DataFrame
    df = pd.read_json(input_path)

    # Defensive: wrap a Series result into a one-row DataFrame
    if isinstance(df, pd.Series):
        df = pd.DataFrame([df])

    # Process evaluations
    df = process_evaluations(df, client)

    # Save results
    df.to_csv(output_path, index=False)
    print(f"Evaluation completed and saved to {output_path}")

    return df

# Script entry point: run the evaluation only when executed directly (not on
# import) and bind the returned DataFrame to the module-level name result_df.
if __name__ == "__main__":
    result_df = main()

Processing row 1
Processing row 2
Processing row 3
Processing row 4
Processing row 5
Processing row 6
Processing row 7
Processing row 8
Processing row 9
Processing row 10
Processing row 11
Processing row 12
Processing row 13
Processing row 14
Processing row 15
Processing row 16
Processing row 17
Processing row 18
Processing row 19
Processing row 20
Processing row 21
Processing row 22
Processing row 23
Processing row 24
Processing row 25
Processing row 26
Processing row 27
Processing row 28
Processing row 29
Processing row 30
Processing row 31
Processing row 32
Processing row 33
Processing row 34
Processing row 35
Processing row 36
Processing row 37
Processing row 38
Processing row 39
Processing row 40
Processing row 41
Processing row 42
Processing row 43
Processing row 44
Processing row 45
Processing row 46
Processing row 47
Processing row 48
Processing row 49
Processing row 50
Processing row 51
Processing row 52
Processing row 53
Processing row 54
Processing row 55
Processing row 56
P