In [2]:
# System prompt sent as the "system" message with every chat request
# (see global_mistral_api_call). It lists the nine evaluation criteria and the
# answer options allowed for each one. The criterion names match the entries of
# `metrics`, and the lower-cased options match `valid_options` in
# clean_multi_task_outputs, so keep the three in sync when editing.
system_prompt = """
You are an experienced instructor for a graduate-level data science course in an engineering program. 
You have experience creating assessments for the course. Your task is to assess the question based on certain evaluation criteria given. Output only the answer and nothing else and only from the options

Evaluation Criteria:
 - Understandable: Could you understand what the question is asking? Please check if the question is composed in such a way that the question can be comprehended easily; Options: yes, no
 - TopicRelated: Is the question related to the course topic given? Please check if the question pertains directly to the key themes or concepts of the given course topic; Options: yes, no, NA
 - Grammatical: Is the question grammatically well-formed? Please check if the question adheres to the rules of English grammar; Options: yes, no, NA
 - Clear: Is it clear what the question asks for? Please check if the phrasing of the question leaves any doubt about what is being asked. Also check if there's vagueness in the making it difficult to answer the question; Options: yes, more_or_less, no, NA
 - Rephrase: Could you rephrase the question to make it clearer and error-free? Please check if this question, as it is posed, can be reworded to improve clarity or correct errors while preserving its original meaning. If your answer is yes, rephrase the question; Options: yes, no, NA
 - Answerable: Can students answer the question with the information or context provided within? Please check if the question is answerable using the knowledge that the students are expected to have from the course material on the topic provided within the question itself. The course curriculum is a standard graduate-level data science course curriculum; Options: yes, no, NA
 - Central: Do you think being able to answer the question is important to work on the course topic given? Please check if answering the question requires an understanding of the key concepts that are critical to the subject matter; Options: yes, no, NA
 - WouldYouUseIt: If you were a teacher teaching the course topic, would you use this question or the rephrased version in the course? Please check if you would consider the question to be of practical value for teaching and learning, and if it is something that would be chosen for inclusion in course materials or assessments; Options: yes, maybe, no, NA
 - SkillLevel: What is the Bloom’s skill associated with the question? This should be aligned with the complexity and type of cognitive process assessed in the question, from simple recall of facts (remember) to more complex tasks like synthesizing new ideas (create); Options: remember, understand, apply, analyze, evaluate, create, NA
"""

In [3]:
# Template for the "user" message of each request. The three placeholders
# {question}, {topic} and {Hierarchical_evaluation_description} are filled in
# by fill_prompt_template; the evaluation loop sets the last one to
# "Evaluate only the metric <name>" so that each request covers one criterion.
user_prompt_template = """
Please make sure you read and understand the following content and instructions carefully. Evaluate it according to the instructions.

Question: {question}

Course topic: {topic}

Evaluation instructions: {Hierarchical_evaluation_description}
"""

In [4]:
# Names of the evaluation criteria; the evaluation loop issues one API request
# per entry for every question. The order matters: clean_multi_task_outputs
# validates the i-th answer against the i-th list of allowed options.
metrics = ["Understandable", "TopicRelated", "Grammatical", "Clear", "Rephrase", "Answerable", "Central", "WouldYouUseIt", "SkillLevel"]

In [8]:
def fill_prompt_template(template_text, values_dict):
    """
    Fill ``{key}`` placeholders in a prompt template.

    Every occurrence of ``{key}`` in ``template_text`` is replaced by the
    matching value from ``values_dict``. All placeholders are substituted in a
    single pass, so text inserted for one key is never re-scanned for other
    placeholders (a question that happens to contain the literal text
    ``{topic}`` is inserted verbatim instead of being rewritten).

    Parameters:
    - template_text (str): Template containing ``{key}`` placeholders.
    - values_dict (dict): Maps placeholder names to values. ``None`` is
      rendered as the string "None"; any other non-string value (for example a
      number read from a spreadsheet cell) is converted with ``str()``.

    Returns:
    - str: The template with all known placeholders filled in. Placeholders
      that have no entry in ``values_dict`` are left untouched.
    """
    import re  # local import: this cell sits above the notebook's `import re` cell

    if not values_dict:
        return template_text

    replacements = {
        "{" + str(key) + "}": "None" if value is None else str(value)
        for key, value in values_dict.items()
    }
    # Try the longest placeholder first so that a key containing "}" cannot be
    # shadowed by a shorter placeholder that is a prefix of it.
    pattern = re.compile(
        "|".join(re.escape(token) for token in sorted(replacements, key=len, reverse=True))
    )
    return pattern.sub(lambda match: replacements[match.group(0)], template_text)

In [10]:
# Course topics. For every spreadsheet row the evaluation loop picks the first
# entry of this list that occurs as a substring of the row's prompt text, so
# each string has to appear verbatim in the prompts it is meant to match.
TOPICS = [
    "Decision Tree Models",
    "Training, Validation, and Testing of Machine Learning Models",
    "Gradient Boosted Tree Models",
    "Linear Regression, Logistic Regression, and Multilayer Perceptron",
    "Stochastic Gradient Descent",
    "Backpropagation",
    "Foundations of Computer Vision and Convolutional Neural Networks",
    "Transfer Learning for Computer Vision",
    "Image Segmentation and Object Detection",
    "Data Pre-processing for Natural Language Processing Tasks",
    "Bag of Words Approach and Word Embedding",
    "Attention Mechanism in Transformers",
    "Neural Machine Translation using Transformers",
    "Encoder, Decoder, and Sequence-to-Sequence Transformers",
    "Pretraining, Finetuning, and Reinforcement Learning with Human Feedback",
    "Prompt Engineering and Chain of Thought Prompting",
    "Natural Language Processing Tasks and Transformer Architectures Used for the Tasks"
]

In [11]:
def get_merged_cell_value(ws, cell):
    """
    Return the value shown for ``cell``, resolving merged ranges.

    If the cell's coordinate lies inside one of the worksheet's merged ranges,
    the value is read from that range's top-left cell (``min_row``,
    ``min_col``); otherwise the cell's own value is returned.

    Parameters:
    - ws: Worksheet exposing ``merged_cells.ranges`` and ``cell(row, column)``.
    - cell: Cell exposing ``coordinate`` and ``value``.

    Returns:
    - The value of the anchor cell of the first matching merged range, or
      ``cell.value`` when the cell is not part of any merged range.
    """
    enclosing_range = next(
        (rng for rng in ws.merged_cells.ranges if cell.coordinate in rng),
        None,
    )
    if enclosing_range is None:
        return cell.value
    anchor = ws.cell(enclosing_range.min_row, enclosing_range.min_col)
    return anchor.value

In [12]:
import re

def clean_multi_task_outputs(outputs):
    """
    Normalise the nine raw LLM answers of one question to valid labels.

    For each answer only the first token is considered: the text is
    lower-cased, stripped, and cut at the first whitespace character or at the
    first of ``. , ! ?``. The token must equal one of the options allowed for
    that task exactly; anything else becomes "Did not answer". Responses that
    are not strings (for example ``None`` when the API returned no content)
    are also reported as "Did not answer" instead of raising.

    Parameters:
    - outputs (list of str): Raw outputs from the LLM for the 9 tasks, in the
      same order as the criteria in the system prompt.

    Returns:
    - list of str: Cleaned outputs with valid labels or "Did not answer".

    Raises:
    - ValueError: If ``outputs`` does not contain exactly 9 items.
    """
    if len(outputs) != 9:
        raise ValueError("Expected exactly 9 outputs.")

    # Allowed (lower-cased) labels per task, in criterion order.
    valid_options = [
        ["yes", "no"],  # 1 Understandable
        ["yes", "no", "na"],  # 2 TopicRelated
        ["yes", "no", "na"],  # 3 Grammatical
        ["yes", "more_or_less", "no", "na"],  # 4 Clear
        ["yes", "no", "na"],  # 5 Rephrase
        ["yes", "no", "na"],  # 6 Answerable
        ["yes", "no", "na"],  # 7 Central
        ["yes", "maybe", "no", "na"],  # 8 WouldYouUseIt
        ["remember", "understand", "apply", "analyze", "evaluate", "create", "na"],  # 9 SkillLevel
    ]

    cleaned_outputs = []
    for output, options in zip(outputs, valid_options):
        if not isinstance(output, str):
            # No usable text came back for this task.
            cleaned_outputs.append("Did not answer")
            continue

        # Only the beginning of the response matters; cap the work on long replies.
        output_lower = output.lower().strip()[:100]

        # First token, ending at whitespace or common punctuation
        # ("yes." -> "yes", "no, because ..." -> "no").
        first_part = re.split(r'\s|[.,!?]', output_lower)[0]

        cleaned_outputs.append(first_part if first_part in options else "Did not answer")

    return cleaned_outputs


In [13]:
import openpyxl

# Open the Excel workbook with the questions to evaluate. Every sheet in it is
# processed by the evaluation loop further down, which writes one CSV per sheet.
workbook_path = 'Questions_generated.xlsx'  # Replace with your actual file path
wb = openpyxl.load_workbook(workbook_path)

In [14]:
!pip install mistralai

Defaulting to user installation because normal site-packages is not writeable
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [None]:
import os
from mistralai import Mistral

In [None]:
import random

def global_mistral_api_call(prompt, model = "mistral-large-2411"):
    """
    Send one evaluation prompt to the Mistral chat API and return the reply text.

    The request consists of the notebook-level ``system_prompt`` as the system
    message and ``prompt`` as the user message, sent with temperature 0.1.

    API keys are read from the environment rather than being stored in the
    notebook: ``MISTRAL_API_KEYS`` (comma-separated; one key is picked at
    random per call) or, if that is not set, ``MISTRAL_API_KEY``.

    Parameters:
    - prompt (str): Content of the user message.
    - model (str): Name of the Mistral model to query.

    Returns:
    - The ``content`` of the first choice's message in the chat response.

    Raises:
    - RuntimeError: If neither environment variable provides an API key.
    """
    raw_keys = os.environ.get("MISTRAL_API_KEYS") or os.environ.get("MISTRAL_API_KEY", "")
    api_keys = [key.strip() for key in raw_keys.split(",") if key.strip()]
    if not api_keys:
        raise RuntimeError(
            "No Mistral API key found. Set MISTRAL_API_KEYS (comma-separated) "
            "or MISTRAL_API_KEY in the environment."
        )

    # A random key per call spreads the requests over all configured keys.
    mistral_client = Mistral(api_key=random.choice(api_keys))
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    chat_response = mistral_client.chat.complete(
        model=model,
        messages=messages,
        temperature=0.1,
    )

    return chat_response.choices[0].message.content

In [17]:
import csv
import time
model = "mistral"  # only used as a suffix in the output file names
MAX_ROW = 715  # last spreadsheet row read from each sheet

# Iterate over all sheets in the workbook; each sheet gets its own CSV with one
# line of nine cleaned labels per evaluated question.
for sheet_name in wb.sheetnames:
    ws = wb[sheet_name]
    print(f"\nSheet: {sheet_name}")
    with open(f"{sheet_name}_{model}.csv", "w", newline="") as file:
        writer = csv.writer(file)

        # Rows start at 2 (row 1 is presumably a header - confirm against the workbook).
        for row in ws.iter_rows(min_row=2, max_col=4, max_row=MAX_ROW):
            index = row[0].value
            # Column B may be merged across several rows; read the value of the merged range.
            prompt = get_merged_cell_value(ws, row[1])
            question = row[3].value

            # Rows up to MAX_ROW are read even where the sheet holds no data, so blank
            # rows must be skipped rather than sent to the API.
            if prompt is None or question is None:
                if index is not None:
                    print(f"Index {index}: missing prompt or question; row skipped")
                continue

            matching_topics = [candidate for candidate in TOPICS if candidate in str(prompt)]
            if not matching_topics:
                # No CSV line is written for a skipped row; the message identifies it.
                print(f"Index {index}: no known topic found in prompt; row skipped")
                continue
            topic = matching_topics[0]
            print(f"Index {index} Topic: {topic}")

            raw_answers = []
            for metric in metrics:
                template_values = {
                    "question": question,
                    "topic": topic,
                    "Hierarchical_evaluation_description": f"Evaluate only the metric {metric}"
                }
                filled_prompt = fill_prompt_template(user_prompt_template, template_values)
                raw_answers.append(global_mistral_api_call(filled_prompt))
                time.sleep(3)  # pause between requests, presumably to stay within API rate limits
            writer.writerow(clean_multi_task_outputs(raw_answers))


Sheet: PS1
Index 1 Topic: Decision Tree Models
Index 2 Topic: Decision Tree Models
Index 3 Topic: Decision Tree Models
Index 4 Topic: Decision Tree Models
Index 5 Topic: Decision Tree Models
Index 6 Topic: Decision Tree Models
Index 7 Topic: Decision Tree Models
Index 8 Topic: Decision Tree Models
Index 9 Topic: Decision Tree Models
Index 10 Topic: Decision Tree Models
Index 11 Topic: Decision Tree Models
Index 12 Topic: Decision Tree Models
Index 13 Topic: Decision Tree Models
Index 14 Topic: Decision Tree Models
Index 15 Topic: Decision Tree Models
Index 16 Topic: Decision Tree Models
Index 17 Topic: Decision Tree Models
Index 18 Topic: Decision Tree Models
Index 19 Topic: Decision Tree Models
Index 20 Topic: Decision Tree Models
Index 21 Topic: Decision Tree Models
Index 22 Topic: Decision Tree Models
Index 23 Topic: Decision Tree Models
Index 24 Topic: Decision Tree Models
Index 25 Topic: Decision Tree Models
Index 26 Topic: Decision Tree Models
Index 27 Topic: Decision Tree Model