# Rating Objective of the Prompt


In [1]:
import pandas as pd
import sqlite3
import os
import numpy as np

from helpers.make_latex_table import create_latex_tables
from helpers.normalization import remove_punctuation, remove_punctuation_and_newlines, remove_newlines
from helpers.statistical_tests import run_t_test_on_gender, compare_genders

# Relative path to the SQLite database that holds the annotated prompts.
db_path = "../../giicg.db"
database_found = os.path.exists(db_path)
if not database_found:
    # Fail fast: sqlite3.connect() would otherwise create a new, empty database file.
    raise FileNotFoundError(f"Database file does not exist: {db_path}")

conn = sqlite3.connect(db_path)

# Every annotated prompt plus its position (message_order) within its conversation.
prompts_query = "SELECT ap.*, m.message_order from annotated_prompts ap JOIN messages m on ap.message_id = m.message_id "
prompts = pd.read_sql(prompts_query, conn)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset-wide means of the two `maj_*` score columns (computed over all prompts).
mean_obj_score, mean_comm_score = (
    prompts[score_column].mean()
    for score_column in ("maj_objectives_score", "maj_communication_score")
)
mean_obj_score

np.float64(2.538563829787234)

## Set up LLM

In [3]:
# from langchain_openai import ChatOpenAI
# from dotenv import load_dotenv
# from pydantic import BaseModel, Field
# from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
#
# load_dotenv()
#
# class OutputFormat(BaseModel):
#     score: int = Field(description="Final score of the prompt")
#
# system_prompt = SystemMessagePromptTemplate.from_template(
#     "You are an assistant that helps categorize prompts."
# )
#
# user_prompt = HumanMessagePromptTemplate.from_template(
#     """
#     You are a highly experienced judge tasked with evaluating a code generation prompt on criteria.
#     The prompt given to you is provided below:
#     ---
#     {prompt}
#     ---
#     Your task is to evaluate the above prompt on the following criterion on a scale of 1-10:
#     Objectives: How well the prompt explicitly communicates the task objectives, including expected outputs, expected style of the output, formats constraints, audiences, relevant context and the programming language to use.
#     The scoring system is provided below:
#     - 1-2 (Poor): The prompt lacks any clear objectives or guidance.
#     - 3-4 (Below Average): Vague or incomplete objectives.
#     - 5-6 (Average): Outlines basic objectives but lacks depth.
#     - 7-8 (Good): Clearly communicates objectives, may miss edge cases.
#     - 9-10 (Excellent): Comprehensive and leaves no ambiguity.
#
#     Your evaluations must focus on explicit instructions rather than implicit instructions.
#     For example, if the prompt does not mention about the formats or constraints of the objectives
#     then you should not assume that the prompt is effective in communicating the objectives.
#     """,
#
# input_variables=["prompt"]
# )
#
# complete_prompt = ChatPromptTemplate.from_messages([system_prompt, user_prompt])
#
# def categorize_prompt(prompt):
#     OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
#     openai_model = "gpt-4.1-2025-04-14"
#     llm = ChatOpenAI(temperature=0.0, model=openai_model)
#     structured_llm = llm.with_structured_output(OutputFormat)
#
#     print(f"evaluating next prompt")
#     chain_one = (
#             {"prompt": lambda x: x["prompt"]}
#             | complete_prompt
#             | structured_llm
#             | {"score": lambda x: x.score}
#     )
#
#     response =  chain_one.invoke({"prompt": prompt})
#
#     return response["score"]

In [4]:
# prompts['objectives_score_3'] = prompts['message_text'].apply(categorize_prompt)
# prompts

In [5]:
#prompts = prompts.loc[:, ~prompts.columns.duplicated()]


## Save to Database

In [6]:
#prompts.to_sql("annotated_prompts", conn, if_exists="replace")

In [7]:
# import pandas as pd
#
# def majority_vote(row):
#     values = [row['objectives_score'], row['objectives_score_2'], row['objectives_score_3']]
#     counts = pd.Series(values).value_counts()
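#     # Return the value with the highest count (majority). NOTE: ties are NOT resolved to the smallest value here; counts.index[0] is simply whichever tied value value_counts() lists first (implement explicit tie-breaking if needed).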
#     return int(counts.index[0])
#
#
# # Apply to DataFrame:
# prompts['maj_objectives_score'] = prompts.apply(majority_vote, axis=1)
# prompts

In [8]:
from helpers.statistical_tests_new import compare_groups
from statsmodels.stats.multitest import multipletests


def groupwise_comparison_with_correction(df, columns, one_sided=False, direction="greater",
                                         group_column="gender", group_x="Man (cisgender)",
                                         group_y="Woman (cisgender)", alpha=0.05, method='fdr_bh'):
    """Compare two groups on several columns and correct the p-values for multiple testing.

    ``compare_groups`` is called once per entry in ``columns``; the resulting
    p-values are then adjusted together with ``multipletests``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed through unchanged to ``compare_groups``.
    columns : iterable of str
        Names of the columns to test, one comparison per column.
    one_sided, direction
        Forwarded unchanged to ``compare_groups``.
    group_column, group_x, group_y
        Column holding the group label and the two labels to compare. The
        defaults reproduce the previously hard-coded gender comparison.
    alpha, method
        Forwarded to ``multipletests`` (defaults: 0.05, Benjamini-Hochberg FDR).

    Returns
    -------
    pandas.DataFrame
        One row per tested column. Columns suffixed ``_m`` hold the values
        ``compare_groups`` reports for ``group_x``; columns suffixed ``_f``
        hold those for ``group_y``. ``'p_value'`` is the raw p-value and
        ``'corrected p_value'`` the adjusted one.
    """
    columns = list(columns)
    comparisons = [
        compare_groups(df, col, group_column=group_column, group_x=group_x, group_y=group_y,
                       one_sided=one_sided, direction=direction)
        for col in columns
    ]

    def pluck(key):
        # One value per tested column, in the same order as `columns`.
        return [res[key] for res in comparisons]

    p_vals = pluck('p_value')
    if p_vals:
        _, pvals_corrected, _, _ = multipletests(p_vals, alpha=alpha, method=method)
    else:
        # Nothing to correct; skip multipletests so an empty `columns` yields an empty table.
        pvals_corrected = []

    return pd.DataFrame({
        'word': columns,
        'test type': pluck('test_type'),
        'direction': pluck('direction'),
        'one_sided': pluck('one_sided'),
        'df': pluck('df'),
        'stat_value': pluck('test_statistic'),
        'p_value': p_vals,
        'corrected p_value': pvals_corrected,
        'mean_f': pluck('mean_y'),
        'mean_m': pluck('mean_x'),
        'std_f': pluck('std_y'),
        'std_m': pluck('std_x'),
        'n_f': pluck('n_y'),
        'n_m': pluck('n_x'),
        'n_hits_f': pluck('hits_y'),
        'n_hits_m': pluck('hits_x'),
        'effsize': pluck('effect_size'),
        'effsize_type': pluck('effect_size_type'),
        'ci': pluck('effect_size_CI'),
    })


In [9]:
# Per-user mean objectives score over ALL of a user's prompts.
user_keys = ['user_id', 'gender', 'age', 'work_exp_years']

# groupby drops rows whose key is NaN, so missing work experience is replaced
# by the placeholder string 'None' to keep those users in the result.
prompts_filled = prompts.fillna({'work_exp_years': 'None'})
objective_means = prompts_filled.groupby(user_keys)['maj_objectives_score'].mean()

user_prompts = objective_means.reset_index()  # group keys back to ordinary columns
user_prompts

Unnamed: 0,user_id,gender,age,work_exp_years,maj_objectives_score
0,6,Man (cisgender),19-25,3.0,2.444444
1,8,Man (cisgender),19-25,1.0,4.5
2,11,Woman (cisgender),26-30,1.0,3.454545
3,15,Man (cisgender),26-30,6.0,2.333333
4,16,Woman (cisgender),19-25,,3.208333
5,25,Man (cisgender),26-30,6.0,3.25
6,28,Woman (cisgender),31-35,5.0,3.095238
7,29,Woman (cisgender),26-30,,5.0
8,30,Non-binary,26-30,,6.0
9,31,Man (cisgender),36-40,22.0,3.4


In [10]:
# Columns that identify a user; used both for aggregation and for joining the per-user tables.
USER_KEYS = ['user_id', 'gender', 'age', 'work_exp_years']


def _mean_score_per_user(frame, score_column):
    """Return one row per user with the mean of `score_column`.

    Missing `work_exp_years` is replaced by the string 'None' first, because
    groupby would otherwise drop rows whose key is NaN.
    """
    return (
        frame
        .fillna({'work_exp_years': 'None'})
        .groupby(USER_KEYS)[score_column]
        .mean()
        .reset_index()
    )


# Order prompts chronologically within each conversation.
prompts_sorted = prompts.sort_values(['conversation_id', 'message_order'])

# Get the first prompt (the first ROW) in each conversation.
# groupby(...).first() is deliberately avoided: it returns the first non-null value
# per column, which can stitch together values from different prompts.
first_prompts = (
    prompts_sorted
    .groupby('conversation_id')
    .head(1)
    .reset_index(drop=True)
)

# Objectives score: mean over each user's first prompts only.
user_first_prompts = _mean_score_per_user(first_prompts, 'maj_objectives_score')

# Communication score: mean over all of each user's prompts.
user_prompts_comms = _mean_score_per_user(prompts, 'maj_communication_score')

# Join on the user keys instead of concatenating by row position, so a score can never
# be attached to the wrong user; validate= raises if a user appears more than once.
combined = user_first_prompts.merge(
    user_prompts_comms,
    on=USER_KEYS,
    how='left',
    validate='one_to_one',
)
combined


Unnamed: 0,user_id,gender,age,work_exp_years,maj_objectives_score,maj_communication_score
0,6,Man (cisgender),19-25,3.0,3.666667,4.666667
1,8,Man (cisgender),19-25,1.0,7.0,5.5
2,11,Woman (cisgender),26-30,1.0,4.4,5.454545
3,15,Man (cisgender),26-30,6.0,2.0,4.0
4,16,Woman (cisgender),19-25,,4.5,4.875
5,25,Man (cisgender),26-30,6.0,4.0,5.25
6,28,Woman (cisgender),31-35,5.0,4.75,5.0
7,29,Woman (cisgender),26-30,,7.0,4.5
8,30,Non-binary,26-30,,6.0,5.0
9,31,Man (cisgender),36-40,22.0,4.666667,6.8


In [11]:
from helpers.make_latex_table import create_latex_tables
# Compare the groups on both per-user scores, then export the tables as LaTeX.
score_columns = ['maj_objectives_score', 'maj_communication_score']
result = groupwise_comparison_with_correction(combined, score_columns)
create_latex_tables(
    result,
    "latex/comms_obj_stats.tex",
    "latex/comms_obj_means_hits.tex",
)
result

Unnamed: 0,word,test type,direction,one_sided,df,stat_value,p_value,corrected p_value,mean_f,mean_m,std_f,std_m,n_f,n_m,n_hits_f,n_hits_m,effsize,effsize_type,ci
0,maj_objectives_score,Mann-Whitney U,greater,False,,126.0,0.19582,0.19582,3.666667,4.333333,1.074361,1.459993,13,15,13,15,0.292308,RBC,"[-0.14, 0.67]"
1,maj_communication_score,T-test,two-sided,False,25.778054,2.386308,0.024656,0.049311,4.265074,4.964862,0.753454,0.796817,13,15,13,15,0.900508,Cohen's d,"[0.26, 1.68]"
