# Rating How Clearly Each Prompt States Its Objectives


In [2]:
import pandas as pd
import sqlite3
import os
import numpy as np


from helpers.normalization import remove_punctuation, remove_punctuation_and_newlines, remove_newlines
from helpers.statistical_tests import run_t_test_on_gender, compare_genders

# Path of the SQLite database, relative to the notebook's working directory.
db_path = "../../giicg.db"

# sqlite3.connect() would create an empty database file at a missing path,
# so only connect once the file is known to exist.
if os.path.exists(db_path):
    conn = sqlite3.connect(db_path)
else:
    raise FileNotFoundError(f"Database file does not exist: {db_path}")

# Every row of annotated_prompts, joined with the age and work experience
# of the user who wrote it.
prompts = pd.read_sql(
    sql="SELECT ep.*, u.age, u.work_exp_years FROM annotated_prompts ep JOIN users u ON ep.user_id = u.user_id",
    con=conn,
)

## Set up LLM

In [7]:
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate

# Load settings from a local .env file into the process environment
# (presumably where OPENAI_API_KEY is supplied — confirm against the project's .env).
load_dotenv()

# Schema for the judge's structured reply: the LLM output is parsed into this model.
# NOTE(review): documented with `#` comments rather than a class docstring because
# pydantic presumably copies a docstring into the generated schema description,
# which would change what is sent to the model — verify before adding one.
class OutputFormat(BaseModel):
    # Single integer rating. The rubric in the human prompt asks for 1-10,
    # but no bounds are enforced on this field.
    score: int = Field(description="Final score of the prompt")

# System message: sets the assistant's role for the conversation.
system_prompt = SystemMessagePromptTemplate.from_template(
    "You are an assistant that helps categorize prompts."
)

# Human message: the judging rubric. `{prompt}` is the only placeholder and is
# filled with the text being rated. The rubric asks for one 1-10 score on the
# "Objectives" criterion and tells the judge to credit explicit instructions only.
# NOTE: the leading indentation inside the triple-quoted string is part of the
# text sent to the model; do not reformat it without re-checking the scores.
user_prompt = HumanMessagePromptTemplate.from_template(
    """
    You are a highly experienced judge tasked with evaluating a code generation prompt on criteria.
    The prompt given to you is provided below:
    ---
    {prompt}
    ---
    Your task is to evaluate the above prompt on the following criterion on a scale of 1-10:
    Objectives: How well the prompt explicitly communicates the task objectives, including expected outputs, expected style of the output, formats constraints, audiences, relevant context and the programming language to use.
    The scoring system is provided below:
    - 1-2 (Poor): The prompt lacks any clear objectives or guidance.
    - 3-4 (Below Average): Vague or incomplete objectives.
    - 5-6 (Average): Outlines basic objectives but lacks depth.
    - 7-8 (Good): Clearly communicates objectives, may miss edge cases.
    - 9-10 (Excellent): Comprehensive and leaves no ambiguity.

    Your evaluations must focus on explicit instructions rather than implicit instructions.
    For example, if the prompt does not mention about the formats or constraints of the objectives
    then you should not assume that the prompt is effective in communicating the objectives.
    """,

# NOTE(review): `{prompt}` is presumably already inferred from the template text,
# which would make this argument redundant — confirm against the installed
# langchain version before removing it.
input_variables=["prompt"]
)

# Full chat prompt: the system message followed by the human rubric message.
complete_prompt = ChatPromptTemplate.from_messages([system_prompt, user_prompt])

# Settings for the LLM judge.
_JUDGE_MODEL = "gpt-4.1-2025-04-14"
_JUDGE_TEMPERATURE = 0.0

# Scoring chain shared by all calls; built lazily by _get_scoring_chain().
# Re-running this cell resets it to None, so edits to the prompt or the model
# settings above take effect on the next call.
_scoring_chain = None


def _get_scoring_chain():
    """Return the prompt -> structured-LLM chain, building it on first use."""
    global _scoring_chain
    if _scoring_chain is None:
        llm = ChatOpenAI(temperature=_JUDGE_TEMPERATURE, model=_JUDGE_MODEL)
        # The structured LLM parses the model reply into an OutputFormat instance.
        _scoring_chain = complete_prompt | llm.with_structured_output(OutputFormat)
    return _scoring_chain


def categorize_prompt(prompt):
    """Rate one prompt on the "Objectives" criterion using the LLM judge.

    The name is kept for backward compatibility; the function returns a score,
    not a category.

    Args:
        prompt: Text of the prompt to rate. It is substituted for the
            ``{prompt}`` placeholder of ``complete_prompt``.

    Returns:
        The ``score`` field of the structured model reply (declared as ``int``
        in ``OutputFormat``; the rubric asks for a value from 1 to 10).

    Raises:
        AttributeError: If the structured reply has no ``score`` attribute.
        Errors raised by the OpenAI client (for example a missing API key or a
        failed request) propagate unchanged.
    """
    print("evaluating next prompt")
    # One request per call; the client and chain are reused across calls
    # instead of being rebuilt for every row.
    result = _get_scoring_chain().invoke({"prompt": prompt})
    return result.score

In [8]:
# Score every message with the LLM judge (one call to categorize_prompt per row)
# and store the result next to the existing annotations.
objective_scores = prompts['message_text'].apply(categorize_prompt)
prompts['objectives_score_2'] = objective_scores

prompts

evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating 

Unnamed: 0,message_id,conversation_id,role,message_text,gender,user_id,language,ambiguity,distractive_content,examples,...,ask_itself,steps,abstract,flip_roles,questions_about_output,points_to_errors,age,work_exp_years,objectives_score,objectives_score_2
0,1,1,user,"parsing data from python iterator, how it coul...",Man (cisgender),6,en,1,0,0,...,0,0,0.0,0,0,0.0,19-25,3.0,3,3
1,730,32,user,Write python function to do operations with in...,Man (cisgender),6,en,0,0,0,...,0,1,0.0,0,0,0.0,19-25,3.0,5,5
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Man (cisgender),6,en,0,0,0,...,0,0,0.0,0,0,0.0,19-25,3.0,3,3
3,1135,55,user,what is FAISS,Man (cisgender),6,en,0,0,0,...,0,0,0.0,0,0,0.0,19-25,3.0,2,1
4,1137,55,user,Transform given code to process large .mbox file,Man (cisgender),6,en,0,0,0,...,0,0,0.0,0,0,0.0,19-25,3.0,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
748,1131,54,user,import pandas as pd\nimport numpy as np\nfrom ...,Man (cisgender),92,en,0,0,0,...,0,0,0.0,0,0,0.0,36-40,3.0,5,5
749,1532,71,user,"from transformers import AutoTokenizer, AutoMo...",Man (cisgender),92,en,0,0,0,...,0,1,0.0,0,0,0.0,36-40,3.0,4,4
750,1646,82,user,"def run_query(query, n_results):\n query_em...",Man (cisgender),92,en,1,0,0,...,0,0,0.0,0,0,,36-40,3.0,4,4
751,1849,2,user,\n I am working on the problem of reconstru...,Man (cisgender),8,en,0,0,0,...,0,0,0.0,0,0,0.0,19-25,1.0,8,7


## Save to Database

In [6]:
# Persist the scored prompts by replacing the whole `annotated_prompts` table.
# index=False: the frame's index is only the default row counter from read_sql;
# writing it (the to_sql default) adds a spurious `index` column to the table,
# which the next `SELECT ep.*` reads back in and which accumulates across runs.
# NOTE(review): `prompts` also carries `age` and `work_exp_years` joined from
# `users`, so they are written into `annotated_prompts` too — confirm this is intended.
prompts.to_sql("annotated_prompts", conn, if_exists="replace", index=False)

753