# Answers and Feedback Evaluation

In [23]:
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from collections import Counter

## Load environment variables

In [None]:
# Read variables from the local .env file and report whether anything was loaded.
print(
    "Loading successful"
    if load_dotenv(".env")
    else "An error has occured. Make sure the file exists and is readable"
)

Loading successful


## Initialize LLM clients - OpenAI and DeepSeek

In [25]:
def initialize_llm_clients():
    """Create the OpenAI and DeepSeek chat clients from environment variables.

    Reads ``OPENAI_API_KEY`` and ``DEEPSEEK_API_KEY`` via ``os.getenv``. The
    DeepSeek client is an ``OpenAI`` instance pointed at the OpenRouter base
    URL. Problems are reported with ``print`` instead of being raised.

    Returns:
        tuple: ``(openai_client, deepseek_client)``. An element is ``None``
        when its API key is missing, when constructing it raised, or when the
        ``OpenAI`` class is not available in this module's globals.
    """
    # Default both to None so the final return never hits an unbound local
    # when a key is missing or initialization is skipped.
    openai_client = None
    deepseek_client = None

    # Check the imported object itself. Comparing the string literal 'OpenAI'
    # with None is always true and would never detect a failed import.
    if globals().get("OpenAI") is None:
        print("Skipping LLM initialization process. OpenAI library imported incorrectly")
        return openai_client, deepseek_client

    try:
        openai_api_key = os.getenv("OPENAI_API_KEY")
        if not openai_api_key:
            print("An error has occured. OpenAI API key not found.")
        else:
            openai_client = OpenAI(api_key=openai_api_key)
            print("OpenAI client succesfully initialized.")
    except Exception as e:
        print(f"An error has occured during OpenAi initialization process: {e}")

    try:
        deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
        if not deepseek_api_key:
            print("An error has occured. DeepSeek API key not found.")
        else:
            # DeepSeek is reached through OpenRouter's OpenAI-compatible endpoint.
            deepseek_client = OpenAI(
                base_url="https://openrouter.ai/api/v1",
                api_key=deepseek_api_key
            )
    except Exception as e:
        print(f"An error has occured during DeepSeek initialization process: {e}")

    return openai_client, deepseek_client

  if 'OpenAI' in globals() and 'OpenAI' is not None:


## Evaluate user's answers and provide feedback

##### Grade the user's answers and explain the incorrect ones

In [26]:
def evaluate_answers(model_choice, passage_content, questions_string, user_answers, client=None):
    """Ask an LLM to grade the user's answers against a reading passage.

    Args:
        model_choice: ``"GPT 4.1"`` or ``"DeepSeekR1"``; selects the API model
            id and which client is used.
        passage_content: Passage text embedded in the prompt.
        questions_string: Questions text embedded in the prompt.
        user_answers: The user's answers text embedded in the prompt.
        client: Optional client object exposing ``chat.completions.create``.
            When omitted, the module-level ``openai_client`` or
            ``deepseek_client`` (as returned by ``initialize_llm_clients``)
            is looked up instead.

    Returns:
        The content of the first choice's message, or ``None`` when an input
        is missing, the model choice is invalid, no client is available, or
        the API call raised. Failures are reported with ``print``.
    """
    if not all([passage_content, questions_string, user_answers]):
        print("Error: Missing passage, questions, or user answers for evaluation.")
        return None

    # model_choice -> (module-level client variable, provider label, API model id)
    model_registry = {
        "GPT 4.1": ("openai_client", "OpenAI", "gpt-4.1-2025-04-14"),
        "DeepSeekR1": ("deepseek_client", "DeepSeek", "deepseek/deepseek-r1:free"),
    }
    # isinstance guard keeps unhashable choices from raising during the lookup.
    model_spec = model_registry.get(model_choice) if isinstance(model_choice, str) else None
    if model_spec is None:
        print(f"An error has occured. Invalid model choice: {model_choice}. Please choose 'GPT 4.1' or 'DeepSeekR1'.")
        return None
    client_var, provider, model_id = model_spec

    prompt = f"""
    You are an IELTs Reading Expert. Your task is to evaluate the user's answers based on the provided passage and questions. 
    Provide the correct answers and grade the user's submission.

    Passage:
    \"\"\"{passage_content}\"\"\"

    Questions:
    \"\"\"{questions_string}\"\"\"

    User's answers:
    \"\"\"{user_answers}\"\"\"

    Please follow this structure:
    
    Instructions:
    1. Go through each question number found in the 'Questions' section.
    2. For each question number, find the corresponding answer in the 'User's answers' section.
    3. Evaluate if the user's answer is correct or incorrect based ONLY on the provided Passage.
    4. State the correct answer based ONLY on the provided passage.
    5. If the user's answer is incorrect, provide a brief explanation referencing the SPECIFIC part of the passage that supports the correct answer. 
    Keep explanations concise.
    6. After evaluating all questions, count the number of 'Correct' answers.
    7. Calculate the total number of questions evaluated based on the 'Questions' input section.
    8. Calculate the score percentage (Number Correct / Total Questions * 100), rounded to the nearest whole number.
    9. Format the output exactly as specified below, including the Detailed Evaluation and the Final Grade with the score percentage.

    Format the output clearly as shown below:
    ===DETAILED EVALUATION===
    Question 1:
    - Your answer: [user's answer for Question 1.]
    - Evaluation: [Correct/Incorrect.]
    - Correct answer: [Correct answer for Question 1.]
    - Explanation: [Brief explanation referencing the passage if the user's answer is incorrect.
    Write N/A if the answer is correct.]

    Question 2:
    - Your answer: [user's answer for Question 2.]
    - Evaluation: [Correct/Incorrect.]
    - Correct answer: [Correct answer for Question 2.]
    - Explanation: [Brief explanation referencing the passage if the user's answer is incorrect.
    Write N/A if the answer is correct.]

    (Repeat this process for ALL questions numbered in the 'Question' input)

    ===FINAL GRADE===
    Total questions answered correctly: [Number of correct questions] / [Total number of questions]
    Score Percentage: [Calculated Percentage]%

    """
    system_message = "You are an IELTs Reading tutor expert in evaluation and following output formats precisely."

    try:
        # globals().get() avoids a NameError when the clients were never
        # initialized, so the user sees the "not available" message instead.
        active_client = client if client is not None else globals().get(client_var)
        if active_client is None:
            raise ValueError(f"{provider} client is not available")

        response = active_client.chat.completions.create(
            model=model_id,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": prompt},
            ]
        )
        evaluation = response.choices[0].message.content

    except Exception as e:
        print(f"An error has occured during API call for {model_choice}: {e}. Please try again.")
        return None

    return evaluation

##### Provide feedback

In [27]:
def get_feedback(evaluation_results, questions_data):
    """Build a feedback message and follow-up menu from graded answers.

    Args:
        evaluation_results: List of dicts; each is read for an ``'evaluation'``
            verdict and a question ``'number'``. Items that are not dicts or
            lack ``'evaluation'`` are reported and skipped.
        questions_data: List of dicts; entries with both ``'number'`` and
            ``'type'`` are used to map question numbers to question types.

    Returns:
        tuple: ``(message, types_for_practice)``. ``message`` is the feedback
        text followed by the lettered follow-up menu (or a plain error string
        for invalid/empty input). ``types_for_practice`` is the sorted list of
        identified question types with at least one incorrect answer, or
        ``[]`` when none could be identified.
    """
    # Error handling for input
    if not isinstance(evaluation_results, list) or not isinstance(questions_data, list):
        print("An error has occured. Invalid input format for get_feedback. Expecting list")
        return "Error generating feedback due to invalid input format", []

    if not evaluation_results:
        if questions_data:
            return "Could not parse the evaluation results. Unable to provide feedback", []
        return "No question or results detected to be evaluate", []

    # Map question numbers to types
    questions_type_map = {
        q.get('number'): q.get('type')
        for q in questions_data
        if isinstance(q, dict) and 'number' in q and 'type' in q
    }

    unknown_placeholder = "Unknown types of questions"
    if questions_data and not questions_type_map:
        print("Question formatting error. Could not map question types.")

    # Count the incorrect answers by type
    incorrect_answers_count_by_type = Counter()
    valid_results = 0

    for result in evaluation_results:
        if not isinstance(result, dict) or 'evaluation' not in result:
            print(f"An error has occurred. Skipping invalid item in evaluation results: {result}")
            continue

        valid_results += 1
        # Tolerate the "Incorrect." form (trailing period / case / spaces)
        # that the grading prompt's output format asks the model to produce.
        verdict = str(result['evaluation']).strip().rstrip('.').strip().lower()
        if verdict != 'incorrect':
            continue

        q_num = result.get('number')
        q_type = questions_type_map.get(q_num) if q_num is not None else None
        # Missing number, unmapped number, or empty type all count as unknown.
        incorrect_answers_count_by_type[q_type or unknown_placeholder] += 1

    # Identified types only; unknown-type mistakes are reported separately.
    struggling_type_list = sorted(
        q_type for q_type in incorrect_answers_count_by_type
        if q_type != unknown_placeholder
    )
    struggling_questions_identified = bool(struggling_type_list)
    count_unknown = incorrect_answers_count_by_type[unknown_placeholder]

    # Feedback message generation:
    feedback = "===FEEDBACK===\n"

    if struggling_questions_identified:
        feedback += "Based on your performance, the types of questions you might want to practice more are: \n"
        for q_type in struggling_type_list:
            count = incorrect_answers_count_by_type[q_type]
            feedback += f"- {q_type} ({count} incorrect)\n"

        if count_unknown > 0:
            feedback += f"- (Also {count_unknown} incorrect answers where the type couldn't be identified.)\n"
        feedback += "\n"

    # Incorrect answers exist, but their types could not be identified
    elif count_unknown > 0:
        feedback += f"There are {count_unknown} incorrect answers. However, the types of questions cannot be specified for feedback this time\n"

    # At least one usable result and none of them incorrect
    elif valid_results:
        feedback += "Excellent! It looks like you have answered all questions correctly\n"

    # Every item was skipped as invalid, so nothing can be concluded
    else:
        feedback += "No specific areas for improvements identified from this evaluation\n"

    # Generate follow up question; options are lettered in order from 'A'
    menu_options = []

    # Practice with the types of questions the user is struggling with
    if struggling_questions_identified:
        type_names = [str(q_type) for q_type in struggling_type_list]
        if len(type_names) == 1:
            types_str = type_names[0]
        else:
            types_str = ", ".join(type_names[:-1]) + f" and {type_names[-1]}"
        menu_options.append(f"Practice weak areas ({types_str}) with new sets of questions")

    # Practice with a new passage and new problem sets
    menu_options.append("Continue practicing with a new passage and sets of questions?")
    # Practice the same passage again with different question sets
    menu_options.append("Retry the passage with different sets of questions?")
    # End the practice session (lettered so it is a selectable choice)
    menu_options.append("End the practice session")

    valid_options = [chr(ord('A') + i) for i in range(len(menu_options))]

    follow_up_questions = "Congratulation on completing the Reading passage. What would you like to do next?\n"
    for option_letter, option_text in zip(valid_options, menu_options):
        follow_up_questions += f"{option_letter} {option_text}\n"
    follow_up_questions += f"Please enter your choice ({', '.join(valid_options)}): \n"

    full_response = feedback + follow_up_questions
    # Returns the list of types needed for the "practice weak areas" option
    types_for_practice = struggling_type_list if struggling_questions_identified else []

    return full_response, types_for_practice

## Main Execution Block

In [None]:
# Demo run: initialize the clients, grade a small hard-coded example with the
# chosen model, and print the raw evaluation text.
if __name__ == "__main__":
    print("\nInitializing Answer Evaluation and Feedback process")

    # evaluate_answers reads these module-level clients by name.
    openai_client, deepseek_client = initialize_llm_clients()

    # Initiate the process using the example from query.ipynb:
    example_passage_content = "This is an example passage about photosynthesis. Plants use sunlight, water, and carbon dioxide to create their food (glucose) and release oxygen. This process occurs in chloroplasts using chlorophyll."

    example_questions_string = """
    Question 1: What do plants use to make food?
    Question 2: Where does photosynthesis occur?
    Question 3: True/False/Not Given: Plants release nitrogen during photosynthesis.
    """

    example_user_answers = """
    Answer 1: Sunlight, water, CO2
    Answer 2: Leaves
    Answer 3: False
    """

    # Structured form of the questions; not consumed yet because feedback
    # generation below is skipped until the evaluation text is parsed.
    example_questions_data = [
        {"number": 1, "type": "Short-answer questions", "text": "What do plants use..."},
        {"number": 2, "type": "Short-answer questions", "text": "Where does photosynthesis occur?"},
        {"number": 3, "type": "True/False/Not Given", "text": "True/False/Not Given: Plants release nitrogen..."}
    ]

    chosen_model = "GPT 4.1"

    print(f"\nEvaluating answers using {chosen_model}...")

    # Call the evaluation function
    evaluation_output_string = evaluate_answers(
        chosen_model,
        example_passage_content,
        example_questions_string,
        example_user_answers,
    )

    if evaluation_output_string:
        print("\n" + "="*30 + " EVALUATION RESULTS " + "="*30)
        print(evaluation_output_string)

        # --- Generate Feedback ---
        print("\nFeedback Generation (Requires Parsing)")
        print("(Skipping feedback generation as parsing the evaluation string is needed first)")

    else:
        print("\nFailed to get evaluation results")

    print("\n Answer Evaluation and Feedback Process completed")



Initializing Answer Evaluation and Feedback process
OpenAI client succesfully initialized.

Evaluating answers using GPT 4.1...

===DETAILED EVALUATION===
Question 1:
- Your answer: Sunlight, water, CO2
- Evaluation: Correct.
- Correct answer: Sunlight, water, and carbon dioxide
- Explanation: N/A

Question 2:
- Your answer: Leaves
- Evaluation: Incorrect.
- Correct answer: Chloroplasts
- Explanation: The passage states: "This process occurs in chloroplasts using chlorophyll." It does not mention 'leaves' explicitly.

Question 3:
- Your answer: False
- Evaluation: Correct.
- Correct answer: False
- Explanation: Plants release oxygen, not nitrogen, during photosynthesis, as stated in the passage.

===FINAL GRADE===
Total questions answered correctly: 2 / 3
Score Percentage: 67%

Feedback Generation (Requires Parsing)
(Skipping feedback generation as parsing the evaluation string is needed first)

 Answer Evaluation and Feedback Process completed
