# Key Problem II: LLMs as classification models
# Depending on concept ambiguity

In [1]:
import json
import os

import pandas as pd
from openai import OpenAI
from sklearn.metrics import f1_score
from tqdm import tqdm

In [2]:
# To run the cells that call the OpenAI API, provide your personal API key via the
# OPENAI_API_KEY environment variable so the secret never has to be saved in the notebook.
# If the variable is not set, the placeholder below is used instead (insert your key there
# only if you cannot set the environment variable, and do not commit it).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR API KEY HERE"))

## Defining global variables
In the following, we define the codebook for prompting GPT-4 and map the numerical codes to natural-language labels for the classes we'd like GPT-4 to identify.

In [3]:
# Codebook used as the system prompt: one quoted "'<code>: <label>'" entry per class,
# each followed by the annotation instruction for that class.
# The entry for class 1 previously lacked its opening quote, unlike the entries for
# classes 0 and 2; all three entries are now quoted the same way.
CODEBOOK = (
    "You are a data labeler, here are the labels and instructions of each label:\n\n"
    "'1: exclusionary about outgroup'; which is assigned if the text points out realistic "
    "or unrealistic threats from the outgroup, or if the text makes members of the "
    "outgroup look stupid\n"
    "'0: inclusionary about in/both groups'; which is assigned if the text highlights "
    "positive characteristics of the ingroup or justifies their actions, or if the "
    "text is pointing out common characteristics or challenges of in- and outgroup\n"
    "'2: other'; which is assigned if the text does not have in- or outgroup thinking, "
    "or if the text has signs of in- or outgroup thinking, but the speaker's affiliation "
    "is not apparent.\n"
)

In [4]:
# Natural-language label for each numerical class code; the position in the tuple is the code.
GOAL_MAP = dict(enumerate((
    "inclusionary about in/both groups",
    "exclusionary about out-group",
    "other",
)))

## Reading the data
The training dataset contains the samples from which we randomly draw examples to show GPT-4 in the prompt. The test dataset contains the samples that GPT-4 is supposed to annotate and for which we know the true label from human annotation.

In [5]:
def read_goal_data(data_dir="data"):
    """ Reads the training and test data from semicolon-delimited CSV files

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``goal_train_for_LLM.csv`` and ``goal_test_for_LLM.csv``.
        Defaults to ``"data"``, i.e. the location used throughout this notebook.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(training_data, test_data)``. Other cells of this notebook read the ``text``
        and ``label`` columns of both frames and the ``tweet_id`` column of the test frame.
    """
    training_data = pd.read_csv(os.path.join(data_dir, "goal_train_for_LLM.csv"), delimiter=";")
    test_data = pd.read_csv(os.path.join(data_dir, "goal_test_for_LLM.csv"), delimiter=";")

    return training_data, test_data

In [6]:
# Load both splits once; `train` and `test` are passed to the prompting example further below.
train, test = read_goal_data()

## Defining helper functions

In [7]:
def choose_random_training_sample(training_data):
    """ Draws one random row per class (labels 0, 1 and 2, in this order) from the training
    data and returns the three rows stacked in a single DataFrame for use as prompt examples """
    per_class_draws = [
        training_data.query(f"label=={class_code}").sample(n=1)
        for class_code in (0, 1, 2)
    ]

    return pd.concat(per_class_draws)

In [8]:
def choose_random_test_sample(test_data):
    """ Picks a single random row of the test data (returned as a one-row DataFrame) for labeling """
    return test_data.sample(n=1)

In [9]:
def infer_labels_from_response_text(resp_data):
    """ Extracts the numerical label (as a string) from one of GPT-4's response messages

    Parameters
    ----------
    resp_data : dict
        Record with a ``"response_message"`` entry holding the text of the model's reply.

    Returns
    -------
    str
        ``"0"``, ``"1"`` or ``"2"``.

    Raises
    ------
    ValueError
        If the reply contains none of the label digits or label keywords. Previously this
        case printed a message and then failed with an ``UnboundLocalError``.
    """
    # A missing reply (None) is treated like an empty one and ends in the ValueError below.
    message = resp_data["response_message"] or ""

    # Checked in this order; the first class whose digit OR keyword occurs anywhere in the
    # reply wins. This precedence is kept unchanged so that stored results score the same.
    # NOTE(review): this is plain substring matching, so e.g. a reply that merely mentions
    # "0" or "inclusionary" while choosing another class is still read as class 0.
    label_markers = (("0", "inclusionary"), ("1", "exclusionary"), ("2", "other"))
    for label, keyword in label_markers:
        if label in message or keyword in message:
            return label

    raise ValueError(f"Label not found! {message!r}")

In [10]:
def compute_F1_scores(filename):
    """ Prints the macro and micro F1 scores of GPT-4's predictions stored in a results file

    The file is read line by line; every line is parsed as one JSON record. The predicted
    label is inferred from the record's response text and compared with its "true_label".
    """
    with open(filename) as results_file:
        # map() is lazy, so each line is parsed and labeled before the next one is read
        label_pairs = [
            (infer_labels_from_response_text(record), record["true_label"])
            for record in map(json.loads, results_file)
        ]

    y_pred = [predicted for predicted, _ in label_pairs]
    y_true = [actual for _, actual in label_pairs]

    for averaging in ("macro", "micro"):
        print(averaging, f1_score(y_true, y_pred, average=averaging))

## Prompting example
To illustrate, let's print an example prompt for GPT-4 in a few-shot setting, without a codebook.

In [11]:
def few_shot_prompting_ex(training_data, test_data):
    """ Prints an example few-shot prompt (without codebook) together with the true label

    Parameters
    ----------
    training_data : pandas.DataFrame
        Frame with ``text`` and ``label`` columns; one random example per class is drawn from it.
    test_data : pandas.DataFrame
        Frame with ``text`` and ``label`` columns; one random row is drawn as the message to label.

    The frames passed by the caller are used as given. Previously they were overwritten by
    re-reading the CSV files inside the function, so the arguments had no effect.
    """
    random_few_shot_training = choose_random_training_sample(training_data)
    test_sample = choose_random_test_sample(test_data)

    print("You are a data labeler, here are the labels:\n\n"
          "0: inclusionary about in or both groups\n"
          "1: exclusionary about out-group\n"
          "2: other\n\n"
          "Here are a few examples:\n")

    for _, row in random_few_shot_training.iterrows():
        print(row["text"], f"// This is labeled as {GOAL_MAP[row['label']]}.")

    print("\nPlease label this message:", test_sample["text"].values[0])

    print("\n***TRUE LABEL***", test_sample["label"].values[0])

In [12]:
# Print one randomly assembled example prompt; no API call is made here.
few_shot_prompting_ex(train, test)

You are a data labeler, here are the labels:

0: inclusionary about in or both groups
1: exclusionary about out-group
2: other

Here are a few examples:

Quatsch. Europa fällt wegen der Flüchtlingsfrage auseinander. Schau mal richtig hin, Herr @xyz123. Alles geht den Bach runter. // This is labeled as inclusionary about in/both groups.
Für solche Aussagen gibt’s in der Türkei bald die Todesstrafe. Da ist dann mit Meinungsfreiheit echt Schluss. @randomUser123 // This is labeled as exclusionary about out-group.
1. Weil er @falschername die Trümmertruppe in Gefahr bringt.  
2. A. Murksel schafft es nicht, die gewählten Leute zu beschützen.  
3. Im Gegensatz zu Recep kauft er keine Panzer.  
4. Er muss erledigt werden – sonst könnten andere dem folgen.  
5. Er hat sich nicht als syrischer 14-jähriger Flüchtling ausgegeben. // This is labeled as other.

Please label this message: Die spd kann nicht nein sagen, weil rwe Jobs für Gewerkschaften bietet, sonst verlieren sie eigene Fans. ;) @coo

## Experiments
In the following, we run three experiments to evaluate GPT-4's performance in different settings: 1) zero-shot with codebook, 2) few-shot without codebook, 3) few-shot with codebook.

### Experiment 1: Zero shot, with codebook

In [13]:
def zero_shot_with_codebook():
    """ Labels one random test sample with GPT-4, given the codebook but no examples

    Reads the data files, draws one random test row, sends the codebook as system message
    and the text to label as user message, and prints the raw API response and the true label.

    Returns
    -------
    dict
        ``"response_message"`` (text of the model's reply), ``"true_label"`` and
        ``"tweet_id"`` (both converted to strings) of the drawn test row.
    """
    # Zero-shot setting: only the test split is needed. The few-shot examples that were
    # previously drawn from the training data here were never used and have been removed.
    _, test_data = read_goal_data()

    test_sample = choose_random_test_sample(test_data)

    extra_instr = "Using these labels and instructions please label the following text: "

    response = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {
                    "role": "system",
                    "content": CODEBOOK
                    },
                    {
                    "role": "user",
                    "content": extra_instr + test_sample["text"].values[0]
                    },
                ],
                temperature=1,
                max_tokens=256,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0
                )

    print(response)
    print("\n\n****TRUE LABEL***",test_sample["label"].values[0])

    return {"response_message": response.choices[0].message.content,
            "true_label": str(test_sample["label"].values[0]),
            "tweet_id": str(test_sample["tweet_id"].values[0])}

In [14]:
"""
Experiment 1: 100 trials
Zero Shot: codebook provided but no examples

Note: The following code reads the results from our experiments. To rerun the experiment,
un-comment the code. You might want to choose another file name to separate your results from
ours before running the code.
"""

# Number of API calls made when the loop below is re-enabled. Each call draws its own
# random test sample, so the same tweet can be drawn more than once.
trials = 100
# Results file: the loop appends one JSON record per line; compute_F1_scores reads it back.
output_file_name = "results/zero_shot_w_code.jsona"

# Disabled by default so that the stored results are scored without calling the API.
# Note that the file is opened in append mode: re-running adds to existing records.
#for i in tqdm(range(trials), total=trials):
#    predicted_label = zero_shot_with_codebook()
    
#    with open(output_file_name, "a") as f:
#        json.dump(predicted_label, f)
    
#    with open(output_file_name, "a") as f:
#        f.write("\n")

# Score the records currently stored in the results file
compute_F1_scores(output_file_name)

macro 0.5017320339563031
micro 0.6


In [15]:
# Run this cell for a single example call (prints the raw API response and the true label).
# This calls the OpenAI API; comment the call out if no API key is available.
zero_shot_with_codebook()

ChatCompletion(id='chatcmpl-CfwdNMHDHltbRGpa2mlYl5mKLUk3I', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='1: exclusionary about outgroup', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None))], created=1764112873, model='gpt-4-0613', object='chat.completion', service_tier='default', system_fingerprint=None, usage=CompletionUsage(completion_tokens=7, prompt_tokens=209, total_tokens=216, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))


****TRUE LABEL*** 1


{'response_message': '1: exclusionary about outgroup',
 'true_label': '1',
 'tweet_id': '859516817726943232'}

### Experiment 2: Few shot, without codebook

In [16]:
def few_shot_without_codebook():
    """ Labels one random test sample with GPT-4, given three random examples but no codebook

    Reads the data files, draws one random training example per class and one random test
    row, sends the short label list plus the examples as system message and the text to
    label as user message, and prints the raw API response and the true label.

    The system prompt has the same layout as the one printed by ``few_shot_prompting_ex``.
    Previously the backslash line continuations inside the indented string literal put
    long runs of indentation spaces in front of every label line, and the first example
    followed the "Here are a few examples:" header on the same line.

    Returns
    -------
    dict
        ``"response_message"`` (text of the model's reply), ``"true_label"`` and
        ``"tweet_id"`` (both converted to strings) of the drawn test row.
    """
    training_data, test_data = read_goal_data()

    random_few_shot_training = choose_random_training_sample(training_data)
    test_sample = choose_random_test_sample(test_data)

    # One line per example: "<text> // This is labeled as <natural-language label>."
    examples_str = "".join(
        row["text"] + f" // This is labeled as {GOAL_MAP[row['label']]}.\n"
        for _, row in random_few_shot_training.iterrows()
    )

    # Adjacent string literals are concatenated without any of the source indentation
    system_prompt = (
        "You are a data labeler, here are the labels:\n\n"
        "0: inclusionary about in or both groups\n"
        "1: exclusionary about out-group\n"
        "2: other\n\n"
        "Here are a few examples:\n\n"
    ) + examples_str

    response = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {
                    "role": "system",
                    "content": system_prompt
                    },
                    {
                    "role": "user",
                    "content": "Please label this message: " + test_sample["text"].values[0]
                    },
                ],
                temperature=1,
                max_tokens=256,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0
                )

    print(response)
    print("\n\n***TRUE LABEL***", test_sample["label"].values[0])

    return {"response_message": response.choices[0].message.content,
            "true_label": str(test_sample["label"].values[0]),
            "tweet_id": str(test_sample["tweet_id"].values[0])}

In [17]:
"""
Experiment 2: 100 trials
3 Random training examples are given with short labels (defined in func), no codebook

Note: The following code reads the results from our experiments. To rerun the experiment,
un-comment the code. You might want to choose another file name to separate your results from
ours before running the code.
"""

# Number of API calls made when the loop below is re-enabled. Each call draws its own
# random examples and test sample, so the same tweet can be drawn more than once.
trials = 100
# Results file: the loop appends one JSON record per line; compute_F1_scores reads it back.
output_file_name = "results/few_shot_no_code.jsona"

# Disabled by default so that the stored results are scored without calling the API.
# Note that the file is opened in append mode: re-running adds to existing records.
#for i in tqdm(range(trials),total=trials):
#    predicted_label = few_shot_without_codebook()
    
#    with open(output_file_name, "a") as f:
#        json.dump(predicted_label, f)
    
#    with open(output_file_name, "a") as f:
#        f.write("\n")

# Score the records currently stored in the results file
compute_F1_scores(output_file_name)

macro 0.625756831942399
micro 0.65


In [18]:
# Run this cell for a single example call (prints the raw API response and the true label).
# This calls the OpenAI API; comment the call out if no API key is available.
few_shot_without_codebook()

ChatCompletion(id='chatcmpl-CfwdPqj1PDvwCsyJ4VaftvHbiHEtr', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='This is labeled as exclusionary about out-group.', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None))], created=1764112875, model='gpt-4-0613', object='chat.completion', service_tier='default', system_fingerprint=None, usage=CompletionUsage(completion_tokens=10, prompt_tokens=290, total_tokens=300, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))


***TRUE LABEL*** 1


{'response_message': 'This is labeled as exclusionary about out-group.',
 'true_label': '1',
 'tweet_id': '574259885254307840'}

### Experiment 3: Few shot, with codebook

In [19]:
def few_shot_with_codebook():
    """ Labels one random test sample with GPT-4, given the codebook and three random examples

    Reads the data files, draws one random training example per class and one random test
    row, sends codebook plus examples as system message and the text to label as user
    message, and prints the raw API response and the true label.

    Returns a dict with the model's reply ("response_message") and the "true_label" and
    "tweet_id" of the drawn test row, both converted to strings.
    """
    training_data, test_data = read_goal_data()

    few_shot_rows = choose_random_training_sample(training_data)
    test_sample = choose_random_test_sample(test_data)

    # One line per example: "<text> // This is labeled as <natural-language label>."
    examples_str = "".join(
        row["text"] + f" // This is labeled as {GOAL_MAP[row['label']]}.\n"
        for _, row in few_shot_rows.iterrows()
    )

    system_message = {
        "role": "system",
        "content": CODEBOOK + "Here are a few examples: " + examples_str,
    }
    user_message = {
        "role": "user",
        "content": "Using these labels, instructions and examples please label the following text: "
                   + test_sample["text"].values[0],
    }

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[system_message, user_message],
        temperature=1,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    print(response)
    print("\n\n****TRUE LABEL***",test_sample["label"].values[0])

    true_label = str(test_sample["label"].values[0])
    tweet_id = str(test_sample["tweet_id"].values[0])
    return {"response_message": response.choices[0].message.content,
            "true_label": true_label,
            "tweet_id": tweet_id}

In [20]:
"""
Experiment 3: 100 trials
3 Random training examples are given with labels and codebook is provided as context

Note: The following code reads the results from our experiments. To rerun the experiment,
un-comment the code. You might want to choose another file name to separate your results from
ours before running the code.
"""

# Number of API calls made when the loop below is re-enabled. Each call draws its own
# random examples and test sample, so the same tweet can be drawn more than once.
trials = 100
# Results file: the loop appends one JSON record per line; compute_F1_scores reads it back.
output_file_name = "results/few_shot_w_code.jsona"

# Disabled by default so that the stored results are scored without calling the API.
# Note that the file is opened in append mode: re-running adds to existing records.
#for i in tqdm(range(trials),total=trials):
#    predicted_label = few_shot_with_codebook()
    
#    with open(output_file_name, "a") as f:
#        json.dump(predicted_label, f)
    
#    with open(output_file_name, "a") as f:
#        f.write("\n")

# Score the records currently stored in the results file
compute_F1_scores(output_file_name)

macro 0.4291982745798007
micro 0.6138613861386139


In [21]:
# Run this cell for a single example call (prints the raw API response and the true label).
# This calls the OpenAI API; comment the call out if no API key is available.
few_shot_with_codebook()

ChatCompletion(id='chatcmpl-CfwdR9GBlc98Ge10YvGcuCfHbJQ21', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="This is labeled as '2: other'.", refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None))], created=1764112877, model='gpt-4-0613', object='chat.completion', service_tier='default', system_fingerprint=None, usage=CompletionUsage(completion_tokens=9, prompt_tokens=362, total_tokens=371, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))


****TRUE LABEL*** 1


{'response_message': "This is labeled as '2: other'.",
 'true_label': '1',
 'tweet_id': '925828356700016643'}