# Qualitative Prompt Analysis

- manually annotated all prompts following an adapted schema from Long et al

In [13]:
import pandas as pd
import sqlite3

from sklearn.externals.array_api_extra import nunique

from helpers.statistical_tests import run_t_test_on_gender

conn = sqlite3.connect("../../giicg.db")
annotated_prompts = pd.read_sql("Select p.*, c.satisfaction, c.rated_complexity from annotated_prompts p JOIN conversations c on p.conversation_id == c.conversation_id", conn)
categorized_prompts = pd.read_sql("Select p.*, c.satisfaction, c.rated_complexity from categorized_prompts p JOIN conversations c on p.conversation_id == c.conversation_id", conn)
translated_prompts = pd.read_sql("Select p.*, c.satisfaction, c.rated_complexity from translated_prompts p JOIN conversations c on p.conversation_id == c.conversation_id", conn)

conn.close()

## Number of prompts in each category
in categorized prompts:
- Politeness (please, thank you) --> "please", "thanks"
- Directness (unpersonal_command, unpersonal_question, you_command, i_statement, we_command)


in annotated prompts:
- context: "context" (leave out, ill defined)
- programming language: "progr_language"
- redundant information: "redundant_info"
- abstract: "abstract"
- few-shot: "examples"
- cot: "CoT_style"
- steps: "steps"
- persona: "persona"
- penalty: "penalty"
- self monitoring: "self_verify"
- guide model to ask questions: "flip_roles"
- ask itself: "ask_itself"
- simple language: "simple_language"
- complex language: "complex_language"
- refine: "questions_about_output" OR "points_to_errors"
- output format : "text_style" OR "output_format"

Spelling --> each prompt that has one word from the corrections list
reward: no one did that

In [14]:
categorized_prompt_categories = {
    "Politeness": ["please", "thanks"],
    "Directness": [
        "unpersonal_command", "unpersonal_question",
        "you_command", "i_statement", "we_command"
    ],
    "Indirectness": ["can_you_question", "we_question", "i_question"]
}

annotated_prompt_categories = {
    "ambiguity": ["ambiguity"],
    "distractive content": ["distractive_content"],
    "context": ["context"],
    "programming language": ["progr_language"],
    "redundant information": ["redundant_info"],
    "abstract": ["abstract"],
    "few-shot": ["examples"],
    "cot": ["CoT_style"],
    "steps": ["steps"],
    "persona": ["persona"],
    "penalty": ["penalty"],
    "self monitoring": ["self_verify"],
    "guide model to ask questions": ["flip_roles"],
    "ask itself": ["ask_itself"],
    "simple language": ["simple_language"],
    "complex language": ["complex_language"],
    "questions output": ["questions_about_output", "ask_itself"],
    "points to errors": ["points_to_errors"],
    "output format": ["text_style", "output_format"]
}

translated_prompt_categories = {
    "spell errors": ["spell_error"]
}

def compute_category_counts(df, category_mapping):
    results = []
    total_female = df[df['gender'] == "Woman (cisgender)"]['user_id'].nunique()
    total_male = df[df['gender'] == "Man (cisgender)"]['user_id'].nunique()
    total_nonbinary = df[df['gender'] == "Non-binary"]['user_id'].nunique()
    total_another = df[df['gender'] == "Another gender"]['user_id'].nunique()

    for category, col_list in category_mapping.items():
        cat_mask = df[col_list].any(axis=1)
        subset = df[cat_mask]

        n_prompts = subset.shape[0]
        n_users = subset['user_id'].nunique()

        n_female_in_cat = subset[subset['gender'] == "Woman (cisgender)"]['user_id'].nunique()
        n_male_in_cat = subset[subset['gender'] == "Man (cisgender)"]['user_id'].nunique()
        n_nb_in_cat = subset[subset['gender'] == "Non-binary"]['user_id'].nunique()
        n_another_in_cat = subset[subset['gender'] == "Another gender"]['user_id'].nunique()

        rate_female = n_female_in_cat / total_female if total_female else 0
        rate_male = n_male_in_cat / total_male if total_male else 0
        rate_nonbinary = n_nb_in_cat / total_nonbinary if total_nonbinary else 0
        rate_another = n_another_in_cat / total_another if total_another else 0

        # Format: rate (count)
        results.append({
            "category": category,
            "num_prompts": n_prompts,
            "n_users": n_users,
            "rate_male": f"{rate_male:.2f} ({n_male_in_cat})",
            "rate_female": f"{rate_female:.2f} ({n_female_in_cat})",
            "rate_nb": f"{rate_nonbinary:.2f} ({n_nb_in_cat})",
            "rate_other": f"{rate_another:.2f} ({n_another_in_cat})",
        })

    return pd.DataFrame(results)


## Detect misspellings

In [15]:
import re

corrections = {
    'orthogogal': 'orthogonal',
    'Anothers' : 'Another',
    'follwoing' : 'following',
    'insted' : 'instead',
    'thats' : 'that\'s',
    'addtion' : 'addition',
    'develope' : 'develop',
    'familys' : 'families',
    'responsable' : 'responsible',
    'optimate' : 'optimize',
    'esier' : 'easier',
    'sentencs': 'sentences',
    'preveent' : 'prevent',
    'indexs': 'indices',
    'hundrets': 'hundreds',
    'orthotogonal' : 'orthogonal',
    'palatte' : 'plaette',
    'favoriting' : 'favoring',
    'orignal' : 'original',
    'resemblens' : 'resemblance',
    'appoinment' : 'appointment',
    'ealier' : 'earlier',
    'Rectlange' : 'Rectangle',
    'gesorächslauf' : 'chat history',
    'evry': 'every',
    'inlcude': 'include',
    'wrtie' : 'write',
    'tring' : 'trying',
    'im' : 'i\'m',
    'exaple' : 'example',
    'modalitieees': 'modalities',
    'doenst' : 'doesn\'t',
    'actualy': 'actually',
    'paremeter': 'parameter',
    'experiement': 'experiment',
    'concatination' : 'concatenation',
    'impliment' : 'implement',
    'impilmentation' : 'implementation',
    'nessacary' : 'necessary',
    'caclulated' : 'calculated',
    'matrixes' : 'matrices',
    'writie' : 'write',
    'follws' : 'follows',
    'inlcuded' : 'included',
    'origional' : 'original',
    'thta' : 'that',
    'heres' : 'here\'s',
    'laike' : 'like',
    'wehre' : 'where',
    'reuslts':'results',
    'intersetd' : 'interested',
    'concatition' : 'concatenation',
    'paramters' : 'parameters',
    'implyin' : 'implying',
    'guassion' : 'gaussian',
    'thigns' : 'things',
    'dockerfule' : 'dockerfile',
    'snipped'  : 'snippet',
    'viseos' : 'videos',
    'phne' : 'without',
    'deday' : 'decay',
    'seperate' : 'separate',
    'apporach' : 'approach',
    'indiviually' : 'individually',
    'allegeded' : 'alleged',
    'testset' : 'test set',
    'classificaiton' : 'classification',
    'optimice' : 'optimize',
    'adaptbefore' : 'adapt before',
    'onlz' : 'only',
    'sepearte' : 'separate',
    'informaiton' : 'information',
    'validaiton' : 'validation',
    'overfitiing' : 'overfitting',
    'eahc' : 'each',
    'directorys' : 'directories',
    'direcrtorys' : 'directories',
    'manipultaet' : 'manipulated',
    'aere' : 'are',
    'withoud' : 'without',
    'thorugh' : 'thorugh',
    'please provide example hot to use' : 'please provide example how to use',
    'rewertie' : 'rewrite',
    'inforamtion' : 'information',
    'visualiysing' : 'visualizing',
    'achive' : 'achieve',
    'sohuellete' : 'silhouette',
    'autimatically' : 'automatically',
    'entrys' : 'entries',
    'visualisaitons' : 'visualizations',
    'seachr' : 'search',
    'searhc' : 'search',
    'shoulndt' : 'shouldn\'t',
    'llookng' : 'looking',
    'clutering' : 'clustering',
    'restuks' : 'results',
    'mehtod' : 'method',
    'delimimetr' : 'delimiter',
    'strucuture' : 'structure',
    'neccessary' : 'necessary',
    'extractzion' : 'extraction',
    'suddently' : 'suddenly',
    'miminal' : 'minimal',
    'isnt' : 'isn\'t',
    'anomalious' : 'anomalous',
    'staistical' : 'statistical',
    'vlaues' : 'values',
    'calsses' : 'classes',
    'ommitted' : 'omitted',
    'approcheas' : 'approaches',
    'inthe' : 'in the',
    'sceintificly' : 'scientifically',
    'soonn' : 'soon',
    'dissappear' : 'disappear',
    'tryping' : 'trying',
    'undefinded' : 'undefined',
    'lenght' : 'length',
    'overritten' : 'overwritten',
    'exisiting' : 'existing',
    'caluclate' : 'calculate',
    'optimise' : 'optimize',
    'Fehlermeldungen' : 'error messages',
    'HEader' : 'Header',
    'nich tunterstrichen' : 'no underscore',
    'carussel': 'carousel',
    'adress' : 'address',
    'blancs' : 'blanks',
    'doesnt' : 'doesn\'t',
    'didnt' : 'didn\'t',
    'lets' : 'let\'s',
    'cant' : 'can\'t',
    'formated' : 'formatted',
    'unmanipluated': 'unmanipulated',
    'additonal': 'additional',

}

misspell_regex = {
    mis: re.compile(r'\b{}\b'.format(re.escape(mis)), flags=re.IGNORECASE)
    for mis in corrections.keys()
}

def contains_misspelling(text, misspell_regex):
    for pat in misspell_regex.values():
        if pat.search(text):
            return True
    return False


In [16]:
translated_prompts["spell_error"] = translated_prompts["conversational"].apply(
    lambda txt: int(contains_misspelling(txt, misspell_regex)) if isinstance(txt, str) else 0
)

translated_prompts


Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language,satisfaction,rated_complexity,spell_error
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en,neutral,neither complex nor simple,0
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en,somewhat satisfied,neither complex nor simple,1
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en,somewhat satisfied,relatively complex,0
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en,somewhat satisfied,relatively complex,0
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en,somewhat satisfied,relatively complex,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
750,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code. I want to: Get nodes and edge...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92,en,very satisfied,relatively simple,0
751,1845,37,user,\n nun möchte ich judgement balancing m...,Now I want to bring judgement balancing into t...,,,Woman (cisgender),29,de,neutral,relatively complex,0
752,1847,37,user,\n ich sehe keine veränderung im Plot. Was ...,I don't see any change in the plot.,,,Woman (cisgender),29,de,neutral,relatively complex,0
753,1849,2,user,\n I am working on the problem of reconstru...,\n I am working on the problem of reconstru...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8,en,somewhat satisfied,relatively complex,0


## Count category hits


In [17]:
df_cat = compute_category_counts(categorized_prompts, categorized_prompt_categories)
df_anno = compute_category_counts(annotated_prompts, annotated_prompt_categories)
df_trans = compute_category_counts(translated_prompts, translated_prompt_categories)
all_categories_df = pd.concat([df_cat, df_anno, df_trans], ignore_index=True).sort_values(
    by="rate_female", ascending=False
)
all_categories_df

Unnamed: 0,category,num_prompts,n_users,rate_male,rate_female,rate_nb,rate_other
1,Directness,574,30,1.00 (15),1.00 (13),1.00 (1),1.00 (1)
5,context,165,28,0.87 (13),1.00 (13),1.00 (1),1.00 (1)
17,simple language,736,30,1.00 (15),1.00 (13),1.00 (1),1.00 (1)
2,Indirectness,189,22,0.67 (10),0.85 (11),0.00 (0),1.00 (1)
6,programming language,129,25,0.80 (12),0.85 (11),1.00 (1),1.00 (1)
22,spell errors,131,19,0.53 (8),0.77 (10),0.00 (0),1.00 (1)
20,points to errors,137,15,0.40 (6),0.69 (9),0.00 (0),0.00 (0)
4,distractive content,61,14,0.33 (5),0.62 (8),0.00 (0),1.00 (1)
19,questions output,41,11,0.20 (3),0.62 (8),0.00 (0),0.00 (0)
11,steps,48,17,0.53 (8),0.54 (7),1.00 (1),1.00 (1)


## Rate of prompts per conversation that point to error

In [18]:
msg_counts = annotated_prompts.groupby('conversation_id').size().reset_index(name='num_msgs')
long_convos = msg_counts[msg_counts['num_msgs'] > 2]['conversation_id']
filtered = annotated_prompts[annotated_prompts['conversation_id'].isin(long_convos)].copy()

# 2. Group by conversation_id
grouped = filtered.groupby('conversation_id')

# 3. Compute rate of 'points_to_errors == 1' per conversation
rate_df = grouped.apply(
    lambda group: (group['points_to_errors'] == 1).sum() / len(group)
).reset_index(name='points_to_errors_rate')

rate_df


  rate_df = grouped.apply(


Unnamed: 0,conversation_id,points_to_errors_rate
0,5,0.0
1,6,0.111111
2,8,0.2
3,11,0.0
4,12,0.448276
5,13,0.0
6,14,0.333333
7,15,0.0
8,16,0.25
9,20,0.0
