# Evaluate the results of GPT-4 Turbo

In [1]:
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import f1_score, precision_score, recall_score, \
    accuracy_score

import re
from typing import List, Tuple

import pandas as pd


def is_contained(inner_start, inner_end, outer_start, outer_end):
    """
    Tells whether the inner span lies within the outer span.

    Both ends of the inner span must fall inside the (inclusive) bounds of the
    outer span, and the two spans must not be identical.

    Args:
        inner_start: Start of the candidate inner span.
        inner_end: End of the candidate inner span.
        outer_start: Start of the enclosing span.
        outer_end: End of the enclosing span.

    Returns:
        True if the inner span is contained in, but not equal to, the outer
        span; False otherwise.
    """
    if not (outer_start <= inner_start <= outer_end):
        return False
    if not (outer_start <= inner_end <= outer_end):
        return False
    # A span is never considered to be contained in itself.
    return (inner_start, inner_end) != (outer_start, outer_end)


def find_all_occurrences_indices(
    sentence: str,
    label: str,
) -> List[Tuple[int, int]]:
    """
    Finds all occurrences of a label in a sentence and returns a list of tuples
    with the start and end indices of the occurrences.

    The label is matched literally: it is escaped before being handed to the
    regex engine, so characters such as '.', '(' or '?' in a label only match
    themselves. Matching is case-sensitive and occurrences do not overlap.

    Args:
        sentence: The sentence to search in.
        label: The label to search for.

    Returns:
        A list of (start, end) tuples, in order of appearance, where end is the
        index one past the last matched character.
    """
    # re.escape keeps regex metacharacters in the label from being interpreted
    # as a pattern (or from raising re.error on e.g. an unbalanced bracket).
    pattern = re.escape(label)
    return [
        (int(match.start()), int(match.end()))
        for match in re.finditer(pattern, sentence)
    ]


def find_contained_labels(labels):
    """
    Maps every label to the other labels it is a substring of.

    Args:
        labels: A list of labels.

    Returns:
        A dictionary keyed by label. The value for a key is the list of all
        different labels that contain the key label (for example 'BIASED' maps
        to ['NOT BIASED']); it is empty if no other label contains the key.
    """
    containers_by_label = {}
    for candidate in labels:
        containers_by_label[candidate] = [
            other
            for other in labels
            if candidate in other and candidate != other
        ]
    return containers_by_label


def find_labels_in_sentence(
    sentence: str,
    labels: List[str],
) -> List[List[Tuple[int, int]]]:
    """
    Locates every label in a sentence, ignoring case.

    An occurrence of a label is dropped when it lies inside an occurrence of a
    longer label that contains it (e.g. the 'BIASED' inside 'NOT BIASED').

    Args:
        sentence: The sentence to search in.
        labels: The labels to search for.

    Returns:
        One list per label, in the order of ``labels``, holding the remaining
        (start, end) positions of that label. An empty list is returned when
        ``labels`` is empty.
    """
    if not labels:
        return []

    # Which labels contain which (decided on the labels as given, not lowered).
    supersets = find_contained_labels(labels)
    lowered = sentence.lower()
    spans_per_label = [
        find_all_occurrences_indices(lowered, name.lower())
        for name in labels
    ]
    spans_by_label = dict(zip(labels, spans_per_label))

    remaining = []
    for name, spans in zip(labels, spans_per_label):
        # All positions of every longer label that contains this one.
        outer_spans = [
            outer
            for bigger in supersets.get(name, set())
            for outer in spans_by_label.get(bigger, list())
        ]
        shadowed = {
            (start, end)
            for start, end in spans
            if any(
                is_contained(start, end, outer_start, outer_end)
                for outer_start, outer_end in outer_spans
            )
        }
        remaining.append(list(set(spans) - shadowed))

    return remaining


def find_label(
    sentence: str,
    labels: List[str],
    default_label: str = "?",
) -> str:
    """
    Picks the single label that a sentence mentions.

    The sentence is searched for all given labels. Exactly one label having
    at least one occurrence yields that label; zero or several matching labels
    yield the default label.

    Args:
        sentence: The sentence to search in.
        labels: The labels to search for.
        default_label: The value to return when no label, or more than one
            distinct label, occurs in the sentence.

    Returns:
        The only label found in the sentence, otherwise ``default_label``.
    """
    hits_per_label = find_labels_in_sentence(sentence=sentence, labels=labels)
    matched = [
        position for position, hits in enumerate(hits_per_label) if hits
    ]
    if len(matched) != 1:
        return default_label
    return labels[matched[0]]


def _soft_parse(
    df: pd.DataFrame,
    in_col: str,
    parsed_col: str,
    labels: List[str] = None,
) -> pd.DataFrame:
    """
    Parses a free-text column into labels and stores them in a new column.

    Every value of ``df[in_col]`` is passed through ``find_label``; the result
    is written to ``df[parsed_col]``. The frame is modified in place and also
    returned, as the return annotation promises.

    Args:
        df: The frame holding the text to parse; it is mutated.
        in_col: Name of the column with the raw text.
        parsed_col: Name of the column that receives the parsed label.
        labels: The labels to search for. Must be given.

    Returns:
        The same ``df`` object, now containing ``parsed_col``.

    Raises:
        ValueError: If ``labels`` is None.
    """
    if labels is None:
        raise ValueError("Labels are not set!")

    df[parsed_col] = df[in_col].apply(
        lambda x: find_label(x, labels),
    )
    # Previously the function fell off the end and returned None despite the
    # declared return type; callers that ignore the result are unaffected.
    return df


In [140]:

# load
model = 'gpt-4-1106-preview'

# load results
# TODO maybe rerun zeroshot and system zero
# NOTE: both zero-shot runs are read from fixed gpt-4.csv files, the few-shot
# runs from "<model>.csv".
zero_shot = pd.read_csv("../zero-shot/data/gpt-4.csv")
zero_shot_with_system = pd.read_csv(
    "../zero-shot-system_prompt/data/gpt-4.csv")
#zero_shot_cot = pd.read_csv(f"../zero-shot-cot/data/{model}.csv")
two_shot = pd.read_csv(f"../2-shot/data/{model}.csv")
four_shot = pd.read_csv(f"../4-shot/data/{model}.csv")
eight_shot = pd.read_csv(f"../8-shot/data/{model}.csv")

two_shot_cot = pd.read_csv(f"../2-shot-CoT/data/{model}.csv")
four_shot_cot = pd.read_csv(f"../4-shot-CoT/data/{model}.csv")
eight_shot_cot = pd.read_csv(f"../8-shot-CoT/data/{model}.csv")

#load pool
pool = load_dataset('mediabiasgroup/BABE-icl-pool')['train'].to_pandas()


def _without_pool(results):
    """Return the rows of *results* whose 'text' has no match in pool['text']."""
    # Left merge with an indicator column, then keep only the unmatched rows.
    flagged = results.merge(pool['text'], on='text', how='left',
                            indicator=True)
    unmatched = flagged.query('_merge == "left_only"')
    return unmatched.drop('_merge', axis=1)


# exclude pool from model (if needed)
zero_shot = _without_pool(zero_shot)
zero_shot_with_system = _without_pool(zero_shot_with_system)
#zero_shot_cot = _without_pool(zero_shot_cot)
two_shot = _without_pool(two_shot)
four_shot = _without_pool(four_shot)
eight_shot = _without_pool(eight_shot)
two_shot_cot = _without_pool(two_shot_cot)
four_shot_cot = _without_pool(four_shot_cot)
eight_shot_cot = _without_pool(eight_shot_cot)


#load babe
dataset = load_dataset('mediabiasgroup/BABE-v4')
df_babe = pd.DataFrame(dataset['train'])

# Both accumulators start out as the BABE frame; each section below merges its
# own label column into them.
df_merge_all_runs = df_babe
df_merge_all_runs_with_errors = df_babe
def find_first_occurrence(string, instruction="Instruction:",
                          statement1="The answer is BIASED",
                          statement2="The answer is NOT BIASED"
                          ):
    """
    Returns whichever of two statements appears first in a string.

    A statement only counts if it appears before the first occurrence of
    ``instruction`` (or if ``instruction`` does not occur at all).

    Args:
        string: The text to search in.
        instruction: Marker text; a statement found at or after it is
            rejected.
        statement1: First statement to look for.
        statement2: Second statement to look for.

    Returns:
        ``statement1`` or ``statement2`` when that statement is the earliest
        one found and precedes the instruction marker (``statement2`` wins if
        both start at the same index); "After Instruction" when the earliest
        statement does not precede the marker; otherwise one of two
        explanatory messages saying that no statement was found.
    """
    index_instruction = string.find(instruction)
    index1 = string.find(statement1)
    index2 = string.find(statement2)

    if index1 == -1 and index2 == -1:
        # The two messages used to be guarded by the same condition, which
        # made the second one unreachable; tell the cases apart by whether
        # the instruction marker is present.
        if index_instruction == -1:
            return ("Neither instruction nor statements found in the given "
                    "string.")
        return "Neither statement found in the given string."

    # Pick the earliest statement that is present.
    if index2 == -1 or (index1 != -1 and index1 < index2):
        first_index, first_statement = index1, statement1
    else:
        first_index, first_statement = index2, statement2

    if index_instruction == -1 or first_index < index_instruction:
        return f"{first_statement}"
    return "After Instruction"


# Zero shot

In [23]:
#_soft_parse(zero_shot, 'response', 'label',
#            ['BIASED', 'NOT BIASED'])
# Show the zero-shot rows whose label is '?'.
zero_shot[zero_shot['label'] == '?']

Unnamed: 0,text,label,raw_data,query


In [15]:
# preprocessing

In [16]:
# Rename the label column and encode it numerically: BIASED -> 1,
# NOT BIASED -> 0. Any '?' value is left untouched.
zero_shot = zero_shot.rename(columns={"label": "0_shot_label"})
zero_shot['0_shot_label'] = zero_shot['0_shot_label'].replace('BIASED', 1)
zero_shot['0_shot_label'] = zero_shot['0_shot_label'].replace('NOT BIASED', 0)

# Rows still labelled '?' are excluded from the scored frames but kept in the
# "with errors" frame.
zero_shot_parsed = zero_shot[zero_shot['0_shot_label'] != '?'][['text', '0_shot_label']]
df_merge = df_babe.merge(zero_shot_parsed, on='text')
df_merge_all_runs = df_merge_all_runs.merge(zero_shot_parsed, on='text')
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(zero_shot[['text', '0_shot_label']], on='text')

# Cast explicitly, as the 4-shot and 8-shot sections do: the metrics need
# integer labels, and replace() may leave the column with object dtype.
ground_truth = df_merge['label'].astype(int)
zero_shot_label = df_merge['0_shot_label'].astype(int)

In [17]:
# Report the zero-shot scores against the BABE ground truth.
for caption, metric in (
    ("F1-Score with GPT 4 turbo: ", f1_score),
    ("Precision with GPT 4 turbo: ", precision_score),
    ("Recall with GPT 4 turbo: ", recall_score),
    ("Accuracy with GPT 4 turbo: ", accuracy_score),
):
    print(caption, metric(ground_truth, zero_shot_label))

F1-Score with GPT 4 turbo:  0.7739199575934269
Precision with GPT 4 turbo:  0.819304152637486
Recall with GPT 4 turbo:  0.7332998493219488
Accuracy with GPT 4 turbo:  0.7878637154936583


# zero shot with system prompt

In [26]:
# Show the system-prompt zero-shot rows whose label is '?'.
zero_shot_with_system[zero_shot_with_system['label'] == '?']

Unnamed: 0,text,response,raw_data,query,label


In [28]:
# Rename the label column and encode it numerically: BIASED -> 1,
# NOT BIASED -> 0. Any '?' value is left untouched.
zero_shot_with_system = zero_shot_with_system.rename(
    columns={"label": "0_shot_with_system_label"})
zero_shot_with_system['0_shot_with_system_label'] = zero_shot_with_system[
    '0_shot_with_system_label'].replace('BIASED', 1)
zero_shot_with_system['0_shot_with_system_label'] = zero_shot_with_system[
    '0_shot_with_system_label'].replace('NOT BIASED', 0)

# Rows still labelled '?' are excluded from the scored frames but kept in the
# "with errors" frame.
zero_shot_with_system_parsed = zero_shot_with_system[
    zero_shot_with_system['0_shot_with_system_label'] != '?'
][['text', '0_shot_with_system_label']]
df_merge = df_babe.merge(zero_shot_with_system_parsed, on='text')
df_merge_all_runs = df_merge_all_runs.merge(zero_shot_with_system_parsed, on='text')
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(zero_shot_with_system[['text', '0_shot_with_system_label']], on='text')

# Cast explicitly, as the 4-shot and 8-shot sections do: the metrics need
# integer labels, and replace() may leave the column with object dtype.
ground_truth = df_merge['label'].astype(int)
zero_shot_with_system_label = df_merge['0_shot_with_system_label'].astype(int)

In [29]:
# Report the scores of the zero-shot run that used a system prompt.
for caption, metric in (
    ("F1-Score with GPT 4 turbo with System Prompt: ", f1_score),
    ("Precision with GPT 4 turbo with System Prompt: ", precision_score),
    ("Recall with GPT 4 turbo with System Prompt: ", recall_score),
    ("Accuracy with GPT 4 turbo with System Prompt: ", accuracy_score),
):
    print(caption, metric(ground_truth, zero_shot_with_system_label))

F1-Score with GPT 4 turbo with System Prompt:  0.780759493670886
Precision with GPT 4 turbo with System Prompt:  0.7871362940275651
Recall with GPT 4 turbo with System Prompt:  0.7744851833249623
Accuracy with GPT 4 turbo with System Prompt:  0.7846306888833623


# zero shot CoT

In [21]:
# Show the zero-shot CoT rows whose label is '?'.
# NOTE: the read_csv call for zero_shot_cot is commented out in the load cell,
# so this name is undefined unless that line is restored.
zero_shot_cot[zero_shot_cot['label'] == '?']

NameError: name 'zero_shot_cot' is not defined

In [None]:
# preprocessing

: 

In [None]:
# NOTE: this cell needs zero_shot_cot, whose read_csv call is commented out in
# the load cell; it cannot run until that line is restored.
# Rename the label column and encode it numerically: BIASED -> 1,
# NOT BIASED -> 0.
zero_shot_cot = zero_shot_cot.rename(columns={"label": "0_shot_cot_label"})
zero_shot_cot['0_shot_cot_label'] = zero_shot_cot['0_shot_cot_label'].replace(
    'BIASED', 1)
zero_shot_cot['0_shot_cot_label'] = zero_shot_cot['0_shot_cot_label'].replace(
    'NOT BIASED', 0)

# NOTE(review): unlike the other sections, this merges into the current
# df_merge (not df_babe), does not filter '?' rows, and does not update the
# df_merge_all_runs* frames - confirm whether that is intended.
df_merge = pd.merge(df_merge, zero_shot_cot[['text', '0_shot_cot_label']],
                    on='text')
ground_truth = df_merge['label']
zero_shot_cot_label = df_merge['0_shot_cot_label']

: 

In [None]:
# Report the zero-shot CoT scores.
for caption, metric in (
    ("F1-Score with GPT 4 turbo with CoT: ", f1_score),
    ("Precision with GPT 4 turbo with CoT: ", precision_score),
    ("Recall with GPT 4 turbo with CoT: ", recall_score),
    ("Accuracy with GPT 4 turbo with CoT: ", accuracy_score),
):
    print(caption, metric(ground_truth, zero_shot_cot_label))

: 

# 2-shot

In [30]:
# Show the 2-shot rows whose label is '?'.
two_shot[two_shot['label'] == '?']

Unnamed: 0,text,response,raw_data,query,label
46,But research like Clayton’s and Jyoti’s is an ...,NOT BIASED\n\nThe sentence presents a viewpoin...,"{\n ""id"": ""chatcmpl-8UtY0GCNJSr0souklVOkWAbYs...","[{'role': 'system', 'content': 'You are an exp...",?
353,The number of women behind the camera on Holly...,BIASED\n\nInstruction: 'The number of women be...,"{\n ""id"": ""chatcmpl-8UtjEWmc90IorY72U54AOjuDg...","[{'role': 'system', 'content': 'You are an exp...",?
438,"In February, Breitbart News’s John Binder repo...",NOT BIASED\n\nThe sentence appears to be a fac...,"{\n ""id"": ""chatcmpl-8Utnn9jev5zREHaq9z683amnB...","[{'role': 'system', 'content': 'You are an exp...",?
533,"Never, and I've seen a lot.","NOT BIASED\n\nThe sentence ""Never, and I've se...","{\n ""id"": ""chatcmpl-8Utr8OnY5nW5OuWKkqtErm86t...","[{'role': 'system', 'content': 'You are an exp...",?
627,"""Mr. Chairman, I have to tell you, this is the...",NOT BIASED\n\nThe sentence is a direct quote f...,"{\n ""id"": ""chatcmpl-8Utv81laujOGno4URomfWv8ep...","[{'role': 'system', 'content': 'You are an exp...",?
631,"Until recently, there seemed to be no broader ...",NOT BIASED\n\nThe sentence provides an observa...,"{\n ""id"": ""chatcmpl-8UtvJpwZ2yAdBT8fOAsF0vfU9...","[{'role': 'system', 'content': 'You are an exp...",?
661,Donald Trump announced at the time that the En...,NOT BIASED\n\nThe sentence provides factual in...,"{\n ""id"": ""chatcmpl-8Utwpuu028xmIs1W7qyaVpQCS...","[{'role': 'system', 'content': 'You are an exp...",?
920,Republican senators returned to Washington on ...,"BIASED\n\nThe use of the word ""seethed"" convey...","{\n ""id"": ""chatcmpl-8Uu8f73VtVqFxlHkPtQdNpADQ...","[{'role': 'system', 'content': 'You are an exp...",?
989,Trump pledged to ban the devices soon after a ...,BIASED\n\nThe sentence contains a factual stat...,"{\n ""id"": ""chatcmpl-8UuBlStNB1Au9bcPcQU2PdFKs...","[{'role': 'system', 'content': 'You are an exp...",?
991,“The FBI has for quite some time now assessed ...,BIASED\n\nThe sentence contains a quote from F...,"{\n ""id"": ""chatcmpl-8UuBvb15IammoQIt23id0kEL2...","[{'role': 'system', 'content': 'You are an exp...",?


In [31]:
def update_label(row):
    """
    Recover the label of a '?' row from the start of its response.

    A row labelled '?' whose response starts with "BIASED" or "NOT BIASED"
    gets that verdict; every other row keeps its current label.
    """
    for verdict in ("BIASED", "NOT BIASED"):
        if row['response'].startswith(verdict) and row['label'] == '?':
            return verdict
    return row['label']


# Re-label row by row, then show what is still '?'.
two_shot['label'] = two_shot.apply(update_label, axis=1)
two_shot[two_shot['label'] == '?']

Unnamed: 0,text,response,raw_data,query,label
1825,And then that child will be taken away.”,"The sentence provided, ""And then that child wi...","{\n ""id"": ""chatcmpl-8UuhCKKi1JNTlMLGDetjWvrSg...","[{'role': 'system', 'content': 'You are an exp...",?
2991,are guaranteed to vote for them.,The provided sentence fragment 'are guaranteed...,"{\n ""id"": ""chatcmpl-8UvYFRpAyjyqwTP5iTPImdrMc...","[{'role': 'system', 'content': 'You are an exp...",?


In [32]:
# TODO maybe rerun
# Rows 1825 and 2991 were still '?' after update_label (see the output of the
# previous cell); they are set to NOT BIASED by hand.
two_shot_manual_not_biased = [1825, 2991]
two_shot.loc[two_shot_manual_not_biased, 'label'] = 'NOT BIASED'

In [33]:
# Rename the label column and encode it numerically: BIASED -> 1,
# NOT BIASED -> 0. Any '?' value is left untouched.
two_shot = two_shot.rename(columns={"label": "2_shot_label"})
two_shot['2_shot_label'] = two_shot['2_shot_label'].replace('BIASED', 1)
two_shot['2_shot_label'] = two_shot['2_shot_label'].replace('NOT BIASED', 0)

# Rows still labelled '?' are excluded from the scored frames but kept in the
# "with errors" frame.
two_shot_parsed = two_shot[two_shot['2_shot_label'] != '?'][['text', '2_shot_label']]
df_merge = df_babe.merge(two_shot_parsed, on='text')
df_merge_all_runs = df_merge_all_runs.merge(two_shot_parsed, on='text')
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(two_shot[['text', '2_shot_label']], on='text')

# Cast explicitly, as the 4-shot and 8-shot sections do: the metrics need
# integer labels, and replace() may leave the column with object dtype.
ground_truth = df_merge['label'].astype(int)
two_shot_label = df_merge['2_shot_label'].astype(int)

In [34]:
# Report the 2-shot scores.
for caption, metric in (
    ("F1-Score with GPT 4 turbo with (2 shot): ", f1_score),
    ("Precision with GPT 4 turbo with (2 shot): ", precision_score),
    ("Recall with GPT 4 turbo with (2 shot): ", recall_score),
    ("Accuracy with GPT 4 turbo with (2 shot): ", accuracy_score),
):
    print(caption, metric(ground_truth, two_shot_label))

F1-Score with GPT 4 turbo with (2 shot):  0.7970648933730796
Precision with GPT 4 turbo with (2 shot):  0.7333333333333333
Recall with GPT 4 turbo with (2 shot):  0.8729281767955801
Accuracy with GPT 4 turbo with (2 shot):  0.7799054961452375


# 4-shot

In [35]:
# Show the 4-shot rows whose label is '?'.
four_shot[four_shot['label'] == '?']

Unnamed: 0,text,response,raw_data,query,label
128,may,The word 'may' in isolation does not provide e...,"{\n ""id"": ""chatcmpl-8VhyiR5D4PSw4h8HnmZdxqCcF...","[{'role': 'system', 'content': 'You are an exp...",?
558,The bevy of women who’ve spoken out in recent ...,NOT BIASED\n\nThe sentence does not appear to ...,"{\n ""id"": ""chatcmpl-8ViTXDmucklbBnKdKr2nAz3sl...","[{'role': 'system', 'content': 'You are an exp...",?
1128,A Republican group opposed to President Donald...,NOT BIASED\n\nThe sentence reports on the acti...,"{\n ""id"": ""chatcmpl-8Viu2Ft4JbrEmAA4JRsSHpXN2...","[{'role': 'system', 'content': 'You are an exp...",?
1224,You're more likely to die from being buried al...,NOT BIASED\n\nThe sentence presents a comparis...,"{\n ""id"": ""chatcmpl-8Vj2kLGQ2VNVTzXyFklhaDGNg...","[{'role': 'system', 'content': 'You are an exp...",?
1440,"The documents, which were published in full on...",NOT BIASED\n\nThe sentence appears to be stati...,"{\n ""id"": ""chatcmpl-8VjBJ74isCXiUaThT8he2udtx...","[{'role': 'system', 'content': 'You are an exp...",?
1530,President Donald Trump on Tuesday praised U.S....,NOT BIASED\n\nThe sentence reports on statemen...,"{\n ""id"": ""chatcmpl-8VjDsNrFysPSCqwyFtTNFyO7a...","[{'role': 'system', 'content': 'You are an exp...",?
1825,And then that child will be taken away.”,"The sentence provided, ""And then that child wi...","{\n ""id"": ""chatcmpl-8VjSIbY003PQnMNLbZRCN9FW1...","[{'role': 'system', 'content': 'You are an exp...",?
1847,"The former Reagan White House speechwriter, no...",NOT BIASED\n\nThe sentence provided does not c...,"{\n ""id"": ""chatcmpl-8VjSwSUDQVV7KzSf6G0h9xjY2...","[{'role': 'system', 'content': 'You are an exp...",?
1924,Thirteen relatives of victims spoke during the...,NOT BIASED\n\nThe sentence reports a statement...,"{\n ""id"": ""chatcmpl-8VjWRbWNc1ESq497PzBBKL02z...","[{'role': 'system', 'content': 'You are an exp...",?
2117,"Charen writes, “It isn’t that Abbott didn’t an...",NOT BIASED\n\n(Note: The sentence provided is ...,"{\n ""id"": ""chatcmpl-8VjesqVxU0gAeCWQjio5DULfD...","[{'role': 'system', 'content': 'You are an exp...",?


In [36]:
# preprocessing
def update_label(row):
    """
    Recover the label of a '?' row from the start of its response.

    A row labelled '?' whose response starts with "BIASED" or "NOT BIASED"
    gets that verdict; every other row keeps its current label.
    """
    for verdict in ("BIASED", "NOT BIASED"):
        if row['response'].startswith(verdict) and row['label'] == '?':
            return verdict
    return row['label']


# Re-label row by row, then show what is still '?'.
four_shot['label'] = four_shot.apply(update_label, axis=1)
four_shot[four_shot['label'] == '?']

Unnamed: 0,text,response,raw_data,query,label
128,may,The word 'may' in isolation does not provide e...,"{\n ""id"": ""chatcmpl-8VhyiR5D4PSw4h8HnmZdxqCcF...","[{'role': 'system', 'content': 'You are an exp...",?
1825,And then that child will be taken away.”,"The sentence provided, ""And then that child wi...","{\n ""id"": ""chatcmpl-8VjSIbY003PQnMNLbZRCN9FW1...","[{'role': 'system', 'content': 'You are an exp...",?
2991,are guaranteed to vote for them.,The sentence provided is incomplete and lacks ...,"{\n ""id"": ""chatcmpl-8Vk9o95rMu1Pnd51fiOZg5YR0...","[{'role': 'system', 'content': 'You are an exp...",?


In [None]:
#four_shot.loc[[128, 1825, 2991], 'label'] = 'NOT BIASED'

In [39]:
# Rename the label column and encode it numerically: BIASED -> 1,
# NOT BIASED -> 0. Any '?' value is left untouched.
four_shot = four_shot.rename(columns={"label": "4_shot_label"})
for answer, code in (('BIASED', 1), ('NOT BIASED', 0)):
    four_shot['4_shot_label'] = four_shot['4_shot_label'].replace(answer, code)

# Rows still labelled '?' are excluded from the scored frames but kept in the
# "with errors" frame.
four_shot_parsed = four_shot.loc[
    four_shot['4_shot_label'] != '?', ['text', '4_shot_label']
]
df_merge = df_babe.merge(four_shot_parsed, on='text')
df_merge_all_runs = df_merge_all_runs.merge(four_shot_parsed, on='text')
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(
    four_shot[['text', '4_shot_label']], on='text')


# The metrics are computed on integer labels.
ground_truth = df_merge['label'].astype(int)
four_shot_label = df_merge['4_shot_label'].astype(int)

In [40]:
# Report the 4-shot scores.
for caption, metric in (
    ("F1-Score with GPT 4 turbo with (4 shot): ", f1_score),
    ("Precision with GPT 4 turbo with (4 shot): ", precision_score),
    ("Recall with GPT 4 turbo with (4 shot): ", recall_score),
    ("Accuracy with GPT 4 turbo with (4 shot): ", accuracy_score),
):
    print(caption, metric(ground_truth, four_shot_label))

F1-Score with GPT 4 turbo with (4 shot):  0.7924170616113744
Precision with GPT 4 turbo with (4 shot):  0.7501121579183491
Recall with GPT 4 turbo with (4 shot):  0.8397790055248618
Accuracy with GPT 4 turbo with (4 shot):  0.7819810851169736


# 8-shot

In [41]:
# Show the 8-shot rows whose label is '?'.
eight_shot[eight_shot['label'] == '?']

Unnamed: 0,text,response,raw_data,query,label
747,It’s that devaluing that allows the perpetrati...,"NOT BIASED\n\nThe sentence ""It’s that devaluin...","{\n ""id"": ""chatcmpl-8UxBMm7nehaqVxiVHmvSg2cGJ...","[{'role': 'system', 'content': 'You are an exp...",?
1612,Institutionalized racism.,"NOT BIASED\n\nThe phrase ""Institutionalized ra...","{\n ""id"": ""chatcmpl-8UxUBUUFmYnK8NohlqvbbHUoC...","[{'role': 'system', 'content': 'You are an exp...",?
1825,And then that child will be taken away.”,The sentence provided is incomplete and lacks ...,"{\n ""id"": ""chatcmpl-8UxZ3yu8Bh9Ainto3TVLvC0xf...","[{'role': 'system', 'content': 'You are an exp...",?
2680,Wells Fargo & Co WFC.N Chief Executive Charles...,NOT BIASED\n\nThe sentence reports on a statem...,"{\n ""id"": ""chatcmpl-8UxrrSGPzaKVJknLHq01s6azc...","[{'role': 'system', 'content': 'You are an exp...",?
2694,"For decades, Democrats have been accused of as...",The sentence provided is incomplete and does n...,"{\n ""id"": ""chatcmpl-8UxsAUmSlBxwCb8CMlL9SJCUX...","[{'role': 'system', 'content': 'You are an exp...",?
2954,Quite the opposite.,"NOT BIASED\n\nThe phrase ""Quite the opposite"" ...","{\n ""id"": ""chatcmpl-8UxyT4Iba4LN0HHNzLZfK8U2K...","[{'role': 'system', 'content': 'You are an exp...",?
2955,For the first time since the enactment of the ...,NOT BIASED\n\nThis sentence appears to be a st...,"{\n ""id"": ""chatcmpl-8UxyXnxw96J8KbZBxViX7XWSx...","[{'role': 'system', 'content': 'You are an exp...",?
2991,are guaranteed to vote for them.,The provided sentence fragment 'are guaranteed...,"{\n ""id"": ""chatcmpl-8UxzVcpncz5JDCYzdw79q4JfP...","[{'role': 'system', 'content': 'You are an exp...",?
3018,"Everything will be Ukraine...""","NOT BIASED\n\nThe sentence ""Everything will be...","{\n ""id"": ""chatcmpl-8Uy06rKbkOid5q7AX3D3tZncI...","[{'role': 'system', 'content': 'You are an exp...",?
3060,The hearing comes a day after the Democrat-led...,NOT BIASED\n\nThe sentence appears to be a str...,"{\n ""id"": ""chatcmpl-8Uy18vZaRDuxDwXkXVQ79gPuz...","[{'role': 'system', 'content': 'You are an exp...",?


In [42]:
# preprocessing
def update_label(row):
    """
    Recover the label of a '?' row from the start of its response.

    A row labelled '?' whose response starts with "BIASED" or "NOT BIASED"
    gets that verdict; every other row keeps its current label.
    """
    for verdict in ("BIASED", "NOT BIASED"):
        if row['response'].startswith(verdict) and row['label'] == '?':
            return verdict
    return row['label']


# Re-label row by row, then show what is still '?'.
eight_shot['label'] = eight_shot.apply(update_label, axis=1)
eight_shot[eight_shot['label'] == '?']

Unnamed: 0,text,response,raw_data,query,label
1825,And then that child will be taken away.”,The sentence provided is incomplete and lacks ...,"{\n ""id"": ""chatcmpl-8UxZ3yu8Bh9Ainto3TVLvC0xf...","[{'role': 'system', 'content': 'You are an exp...",?
2694,"For decades, Democrats have been accused of as...",The sentence provided is incomplete and does n...,"{\n ""id"": ""chatcmpl-8UxsAUmSlBxwCb8CMlL9SJCUX...","[{'role': 'system', 'content': 'You are an exp...",?
2991,are guaranteed to vote for them.,The provided sentence fragment 'are guaranteed...,"{\n ""id"": ""chatcmpl-8UxzVcpncz5JDCYzdw79q4JfP...","[{'role': 'system', 'content': 'You are an exp...",?


In [None]:
#eight_shot.loc[[ 1825, 2991], 'label'] = 'NOT BIASED'
#eight_shot.loc[[ 2694], 'label'] = 'BIASED'


In [44]:
# Rename the label column and encode it numerically: BIASED -> 1,
# NOT BIASED -> 0. Any '?' value is left untouched.
eight_shot = eight_shot.rename(columns={"label": "8_shot_label"})
for answer, code in (('BIASED', 1), ('NOT BIASED', 0)):
    eight_shot['8_shot_label'] = eight_shot['8_shot_label'].replace(answer,
                                                                    code)

# Rows still labelled '?' are excluded from the scored frames but kept in the
# "with errors" frame.
eight_shot_parsed = eight_shot.loc[
    eight_shot['8_shot_label'] != '?', ['text', '8_shot_label']
]
df_merge = df_babe.merge(eight_shot_parsed, on='text')
df_merge_all_runs = df_merge_all_runs.merge(eight_shot_parsed, on='text')
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(
    eight_shot[['text', '8_shot_label']], on='text')


# The metrics are computed on integer labels.
ground_truth = df_merge['label'].astype(int)
eight_shot_label = df_merge['8_shot_label'].astype(int)

In [45]:
# Report the 8-shot scores.
for caption, metric in (
    ("F1-Score with GPT 4 turbo with (8 shot): ", f1_score),
    ("Precision with GPT 4 turbo with (8 shot): ", precision_score),
    ("Recall with GPT 4 turbo with (8 shot): ", recall_score),
    ("Accuracy with GPT 4 turbo with (8 shot): ", accuracy_score),
):
    print(caption, metric(ground_truth, eight_shot_label))

F1-Score with GPT 4 turbo with (8 shot):  0.7899241868427489
Precision with GPT 4 turbo with (8 shot):  0.7697807435653002
Recall with GPT 4 turbo with (8 shot):  0.8111501757910597
Accuracy with GPT 4 turbo with (8 shot):  0.7862120457939273


# 2-shot CoT

In [46]:
# Show the 2-shot CoT rows whose label is '?'.
two_shot_cot[two_shot_cot['label'] == '?']

Unnamed: 0,text,response,raw_data,query,label
4,Republican state legislators from Texas to Ari...,The sentence contains several elements that in...,"{\n ""id"": ""chatcmpl-8WPwBCXqkgQnMn6TBiWQxtfVM...","[{'role': 'system', 'content': 'You are an exp...",?
12,"Still, gun-control advocates believe momentum ...",The sentence is not overtly biased because it ...,"{\n ""id"": ""chatcmpl-8WPxDF3hCnQ7B52lI04aMnxhM...","[{'role': 'system', 'content': 'You are an exp...",?
25,Trump acknowledged there might be a resurgence...,The sentence presents a statement made by Pres...,"{\n ""id"": ""chatcmpl-8WPytplLZRSrjs73LmpDFAYBn...","[{'role': 'system', 'content': 'You are an exp...",?
29,They’d be standing aside and waving them on in.,The sentence 'They’d be standing aside and wav...,"{\n ""id"": ""chatcmpl-8WPzc5s2VpqjaDdecsmv5LWJH...","[{'role': 'system', 'content': 'You are an exp...",?
37,The Trump administration gave the Border Patro...,The sentence provides a description of an acti...,"{\n ""id"": ""chatcmpl-8WQ0dwAFLyc5m5gULNmwI3hRV...","[{'role': 'system', 'content': 'You are an exp...",?
...,...,...,...,...,...
3984,"Posting on Twitter, the former stripper turned...",The sentence provides a statement made by an i...,"{\n ""id"": ""chatcmpl-8WXc75dBD885GAAakxpNsBWWN...","[{'role': 'system', 'content': 'You are an exp...",?
3995,The Lancet joined China in condemning Trump’s ...,The sentence attributes specific opinions and ...,"{\n ""id"": ""chatcmpl-8WXdNPsNcP9g0zzucAgqGi2Ed...","[{'role': 'system', 'content': 'You are an exp...",?
3999,"But at its core, there's a basic truth that's ...","The sentence makes an assertion about a ""basic...","{\n ""id"": ""chatcmpl-8WXdwWPFyTnXoZH1wD8YsNeny...","[{'role': 'system', 'content': 'You are an exp...",?
4008,About one quarter of the world’s population li...,The sentence presents a factual statement abou...,"{\n ""id"": ""chatcmpl-8WXen91Y2fnY7JIUoEIU6KJ0F...","[{'role': 'system', 'content': 'You are an exp...",?


In [47]:
# preprocessing
# preprocessing
def update_label(row):
    """
    Recover the label of a '?' row from its chain-of-thought response.

    A row labelled '?' is re-labelled when its response starts with one of the
    known answer prefixes, or when find_first_occurrence reports one of the
    "The answer is ..." statements. Every other row keeps its current label.
    """
    # (response prefix, resulting label), checked in this order.
    prefix_verdicts = (
        ('BIASED', 'BIASED'),
        ('NOT BIASED', 'NOT BIASED'),
        ('Classification: NOT BIASED', 'NOT BIASED'),
        ('Classification: BIASED', 'BIASED'),
        ('The sentence is NOT BIASED', 'NOT BIASED'),
        ('The sentence is not biased', 'NOT BIASED'),
        ('Classify the sentence above as BIASED or NOT BIASED.\n\nOutput: NOT BIASED',
         'NOT BIASED'),
        ('Classify the sentence above as BIASED or NOT BIASED.\n\nOutput: BIASED',
         'BIASED'),
    )
    for prefix, verdict in prefix_verdicts:
        if row['response'].startswith(prefix) and row['label'] == '?':
            return verdict

    # No prefix matched: fall back to the first "The answer is ..." statement.
    statement_verdicts = (
        ("The answer is BIASED", 'BIASED'),
        ("The answer is NOT BIASED", 'NOT BIASED'),
    )
    for statement, verdict in statement_verdicts:
        if (find_first_occurrence(row['response']) == statement
                and row['label'] == '?'):
            return verdict

    return row['label']


# Re-label row by row, then show what is still '?'.
two_shot_cot['label'] = two_shot_cot.apply(update_label, axis=1)
two_shot_cot[two_shot_cot['label'] == '?']

Unnamed: 0,text,response,raw_data,query,label
4,Republican state legislators from Texas to Ari...,The sentence contains several elements that in...,"{\n ""id"": ""chatcmpl-8WPwBCXqkgQnMn6TBiWQxtfVM...","[{'role': 'system', 'content': 'You are an exp...",?
12,"Still, gun-control advocates believe momentum ...",The sentence is not overtly biased because it ...,"{\n ""id"": ""chatcmpl-8WPxDF3hCnQ7B52lI04aMnxhM...","[{'role': 'system', 'content': 'You are an exp...",?
29,They’d be standing aside and waving them on in.,The sentence 'They’d be standing aside and wav...,"{\n ""id"": ""chatcmpl-8WPzc5s2VpqjaDdecsmv5LWJH...","[{'role': 'system', 'content': 'You are an exp...",?
37,The Trump administration gave the Border Patro...,The sentence provides a description of an acti...,"{\n ""id"": ""chatcmpl-8WQ0dwAFLyc5m5gULNmwI3hRV...","[{'role': 'system', 'content': 'You are an exp...",?
75,The ridicule is key here.,"The sentence ""The ridicule is key here"" is som...","{\n ""id"": ""chatcmpl-8WQ5piHCCRkZFD1Z8kRmb0l5R...","[{'role': 'system', 'content': 'You are an exp...",?
...,...,...,...,...,...
3892,Logging and mining operations have accelerated...,The sentence describes a situation where speci...,"{\n ""id"": ""chatcmpl-8WXSHVuwkDFXLG7tMC7fFn2cp...","[{'role': 'system', 'content': 'You are an exp...",?
3906,It is hard to argue that the highest income co...,The sentence presents a perspective that impli...,"{\n ""id"": ""chatcmpl-8WXU08HK3PjDZd9CUgRFBZFTS...","[{'role': 'system', 'content': 'You are an exp...",?
3963,Hungary’s parliament has passed a declaration ...,The sentence presents a reason given by Hungar...,"{\n ""id"": ""chatcmpl-8WXZp7YJK2SF2gHJ93QaoF4Pi...","[{'role': 'system', 'content': 'You are an exp...",?
3984,"Posting on Twitter, the former stripper turned...",The sentence provides a statement made by an i...,"{\n ""id"": ""chatcmpl-8WXc75dBD885GAAakxpNsBWWN...","[{'role': 'system', 'content': 'You are an exp...",?


In [48]:
# Hand-assigned labels, addressed by row index, for 2-shot CoT rows
# (presumably those the automatic parsing above left as '?' - verify).
# Rows set to BIASED:
two_shot_cot.loc[[4, 3963, 103, 3841, 3787, 3730, 3627, 2903, 911,
                  2863, 2751, 2711, 2433, 2292, 1964, 1704, 1481,
                  1262, 1082, ], 'label'] = 'BIASED'
# Rows set to NOT BIASED:
two_shot_cot.loc[[3999, 3906, 12, 37, 173, 3845, 3682, 3679, 267, 263,
                  246, 185, 285, 291, 3501, 3533, 3601, 412, 438, 474, 3198,
                  3297, 3327, 3390, 525, 534, 545, 3037, 3047, 3113, 3179, 627,
                  663, 2993, 3008, 3022, 799,743, 735, 860, 861, 863, 2828, 2844,
                  2854, 898, 1004, 2765, 2774, 2815, 2694, 2685, 2670, 1068, 2581,
                  2344, 2116, 2074, 2054, 2023, 2008, 1917, 1897, 1847, 1647, 1490,
                  1472, 1462, 1289, 1243, 1232, 1229, 1224, 1070], 'label'] = 'NOT BIASED'

# not assignable (left as '?'; the assignment is disabled)
# NOTE(review): index 2863 is listed here and also in the BIASED list above.
#two_shot_cot.loc[[3984, 3892, 29, 75, 166, 3827, 3754, 194, 292, 350, 381, 3453, 489, 501, 3270,
#                  511, 533, 3127, 602, 625, 642, 3013, 3018, 2991, 2983, 2968, 2954, 781, 716, 838,
#                  880, 2818, 2863, 907, 914, 2808, 2650, 2630, 2582, 2500, 2378, 2337, 2317, 2169,
#                  2138, 2126, 2117, 2028, 2003, 1934, 1914, 1903, 1825, 1810, 1720, 1616, 1612,
#                  1339, 1316, ], 'label'] = '?'

# 716...
# we use what GPT leans towards 2292 ,1825

# Display one full response for manual inspection.
two_shot_cot.loc[1229]['response']

'The sentence cites a specific report from Open Doors, a non-profit organization that tracks persecution of Christians worldwide. The statement includes a quantifiable claim ("over 260 million Christians facing \'high levels of persecution\'") and attributes the source of the information. It does not appear to express a subjective opinion or promote a particular viewpoint beyond reporting the findings of the Open Doors report. However, without additional context about the methodology and potential biases of the Open Doors organization, it\'s difficult to fully assess the neutrality of the claim. If Open Doors is a reputable and unbiased source, then the sentence would likely be considered NOT BIASED as it is reporting on data provided by a third party. If, however, Open Doors has a history of biased reporting or uses methods that are not widely accepted, the sentence could be seen as indirectly biased by uncritically relaying potentially skewed information. Without further context, the

In [49]:
# Encode the 2-shot CoT verdicts numerically (BIASED -> 1, NOT BIASED -> 0);
# any other value, such as '?', is left as it is.
two_shot_cot = two_shot_cot.rename(columns={"label": "2_shot_cot_label"})
two_shot_cot['2_shot_cot_label'] = two_shot_cot['2_shot_cot_label'].replace(
    {'BIASED': 1, 'NOT BIASED': 0})


# The two scored frames only receive rows whose verdict is not '?'; the
# "with errors" frame receives every row. All joins match on the sentence text.
df_merge = df_babe.merge(
    two_shot_cot.loc[two_shot_cot['2_shot_cot_label'] != '?',
                     ['text', '2_shot_cot_label']],
    on='text',
)
df_merge_all_runs = df_merge_all_runs.merge(
    two_shot_cot.loc[two_shot_cot['2_shot_cot_label'] != '?',
                     ['text', '2_shot_cot_label']],
    on='text',
)
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(
    two_shot_cot.loc[:, ['text', '2_shot_cot_label']],
    on='text',
)


# Integer vectors handed to the sklearn metric functions.
ground_truth, two_shot_cot_label = (
    df_merge[column].astype(int) for column in ('label', '2_shot_cot_label')
)

In [50]:
# Print each metric for the 2-shot CoT run, ground truth first as sklearn
# expects (y_true, y_pred).
for caption, metric in (
    ("F1-Score with GPT 4 turbo with (2 shot CoT): ", f1_score),
    ("Precision with GPT 4 turbo with (2 shot CoT): ", precision_score),
    ("Recall with GPT 4 turbo with (2 shot CoT): ", recall_score),
    ("Accuracy with GPT 4 turbo with (2 shot CoT): ", accuracy_score),
):
    print(caption, metric(ground_truth, two_shot_cot_label))

F1-Score with GPT 4 turbo with (2 shot CoT):  0.8027593254982115
Precision with GPT 4 turbo with (2 shot CoT):  0.8056410256410257
Recall with GPT 4 turbo with (2 shot CoT):  0.7998981670061099
Accuracy with GPT 4 turbo with (2 shot CoT):  0.8051980822609135


# 4-shot CoT

In [51]:
# Show the 4-shot CoT rows whose label is still the '?' placeholder.
four_shot_cot[four_shot_cot['label'] == '?']

Unnamed: 0,text,response,raw_data,query,label
12,"Still, gun-control advocates believe momentum ...","The sentence ""Still, gun-control advocates bel...","{\n ""id"": ""chatcmpl-8Wmw7NqCtCArhal8E3Zwq8mLe...","[{'role': 'system', 'content': 'You are an exp...",?
26,Just look at some of the arguments that Elizab...,The sentence presents a summary of Elizabeth S...,"{\n ""id"": ""chatcmpl-8Wmy01JXkcpn3QvGEgrKkJ4zs...","[{'role': 'system', 'content': 'You are an exp...",?
41,U.S. President Donald Trump pledged on Wednesd...,The sentence reports on a pledge made by Presi...,"{\n ""id"": ""chatcmpl-8Wn08j2qWazkbrMW7T8o4hwTw...","[{'role': 'system', 'content': 'You are an exp...",?
56,The legislation ends criminal penalties for se...,The sentence describes specific provisions of ...,"{\n ""id"": ""chatcmpl-8Wn22L9DzTr28NT5ySQhJXoby...","[{'role': 'system', 'content': 'You are an exp...",?
57,"Ever since, Sanders has portrayed himself as a...","The sentence ""Ever since, Sanders has portraye...","{\n ""id"": ""chatcmpl-8Wn2AYA5f46PogaH2sGsBNJaV...","[{'role': 'system', 'content': 'You are an exp...",?
...,...,...,...,...,...
3980,According to Lauren Copeland (a political scie...,The sentence appears to be a straightforward r...,"{\n ""id"": ""chatcmpl-8WuP2iEAkLrtYZjsxO0TjVkNo...","[{'role': 'system', 'content': 'You are an exp...",?
3999,"But at its core, there's a basic truth that's ...","The sentence asserts a ""basic truth"" regarding...","{\n ""id"": ""chatcmpl-8WuQwyuW0zyA1Bx9z0IiJAbaW...","[{'role': 'system', 'content': 'You are an exp...",?
4013,"The outline, which puts numbers to several pol...",The sentence provides details about a policy o...,"{\n ""id"": ""chatcmpl-8WuSRxxnlCbpIpS3dYxdXK0pf...","[{'role': 'system', 'content': 'You are an exp...",?
4014,"Larry Kudlow, one of President Donald Trump’s ...",The sentence reports on a statement made by La...,"{\n ""id"": ""chatcmpl-8WuSaE0Vq3xI80WuZToZ6U5iT...","[{'role': 'system', 'content': 'You are an exp...",?


In [209]:
# preprocessing
def update_label(row):
    """
    Resolves an undecided ('?') label from the text of the model response.

    Rows whose label is anything other than '?' are returned unchanged
    without inspecting the response. For undecided rows the response is
    first matched against a set of known opening phrases; if none match,
    the result of ``find_first_occurrence`` (defined elsewhere in this
    notebook) is compared against the two "The answer is ..." phrases.

    Args:
        row: A mapping-like row (e.g. a DataFrame row) with the keys
            'response' and 'label'.

    Returns:
        'BIASED' or 'NOT BIASED' if a verdict could be derived, otherwise
        the row's existing label.
    """
    current_label = row['label']
    if current_label != '?':
        # Already decided: nothing to parse.
        return current_label

    response = row['response']

    # No response can start with an entry from both tuples, so the order of
    # the two checks below does not matter.
    biased_openers = (
        'BIASED',
        'Classification: BIASED',
        'Classify the sentence above as BIASED or NOT BIASED.\n\nOutput: BIASED',
    )
    not_biased_openers = (
        'NOT BIASED',
        'Classification: NOT BIASED',
        'The sentence is NOT BIASED',
        'The sentence is not biased',
        'Classify the sentence above as BIASED or NOT BIASED.\n\nOutput: NOT BIASED',
    )
    if response.startswith(biased_openers):
        return 'BIASED'
    if response.startswith(not_biased_openers):
        return 'NOT BIASED'

    # Fall back to the helper once and compare its result to both phrases.
    verdict = find_first_occurrence(response)
    if verdict == "The answer is BIASED":
        return 'BIASED'
    if verdict == "The answer is NOT BIASED":
        return 'NOT BIASED'
    return current_label


# Recompute the label row by row, then list the rows that remain '?'.
four_shot_cot['label'] = four_shot_cot.apply(update_label, axis='columns')
four_shot_cot[four_shot_cot['label'] == '?']

Unnamed: 0,text,response,raw_data,query,label


In [210]:
# Manual relabelling: overwrite 'label' for the rows listed by index.
# presumably decided by reading each response by hand — TODO confirm
four_shot_cot.loc[[3327, 964, 3057, 3106, 3163, 2714, 2486, 2500, 2159, 2078,
                   2021, 1826, 1386, 1194, ], 'label'] = 'BIASED'
four_shot_cot.loc[[3787, 3769, 79, 26, 410, 3571, 360, 3601, 627, 3184,
                   3509, 3557, 911, 936, 3156, 1080, 1105, 2983, 2868,
                   2777, 2794, 2516, 2446, 2399, 2344, 2192, 2163, 2051,
                   1999, 1950, 1897, 1895, 1847, 1720, 1412, 1251, 1224, 
                   1128, 1127], 'label'] = 'NOT BIASED'

# Not assignable (or context dependent): these rows are marked '?'.
# This assignment runs last, so it wins for any index also listed above.
# NOTE(review): index 2516 is in the 'NOT BIASED' list and in this list, so it
# ends up as '?' — confirm that is intended.
four_shot_cot.loc[[4019, 3791, 3766, 128, 81, 75, 246, 364, 381, 427, 3588, 3620,
                   3717, 438, 533, 737, 747, 3499, 809, 907, 3093, 1041, 1068,
                   1117, 2950, 2954, 2991, 3018, 2879, 2865, 2799, 2694, 2663,
                   2630, 2516, 2433, 2337, 2217, 2169, 2117, 2044, 2040, 1986,
                   1975, 1963, 1914, 1849, 1825, 1810, 1612, 1611, 1580, 1537, 
                   1529, 1391, 1348, 1422, 1244, 1145], 'label'] = '?'

# 716...2879
# we use what GPT leans towards X as label

# Display the full response text of row 1127 for inspection.
four_shot_cot.loc[1127]['response']

'The sentence uses descriptive language that could be interpreted as biased. The term "black-clad leftists" carries a connotation that may be intended to evoke a specific, possibly negative, image of the individuals involved. The mention of "bullhorns, flashing lights, vuvuzelas and a guillotine" adds a dramatic flair and could be seen as emphasizing the disruptive nature of the gathering. However, the sentence does not explicitly express a judgment or opinion about the individuals or their actions; it merely describes the scene. The use of "black-clad" and the listing of items like "guillotine" could be seen as an attempt to create a particular impression, but without additional context or explicit commentary, it is not as overtly biased as the previous examples. The answer leans towards NOT BIASED, but it could be argued that the choice of descriptors may subtly influence the reader\'s perception.'

In [212]:
# Encode the 4-shot CoT verdicts numerically (BIASED -> 1, NOT BIASED -> 0);
# any other value, such as '?', is left as it is.
four_shot_cot = four_shot_cot.rename(columns={"label": "4_shot_cot_label"})
four_shot_cot['4_shot_cot_label'] = four_shot_cot['4_shot_cot_label'].replace('BIASED', 1)
four_shot_cot['4_shot_cot_label'] = four_shot_cot['4_shot_cot_label'].replace('NOT BIASED',
                                                                0)

# The right-hand frame must carry the 'text' column because it is the merge
# key (a misspelled column name here raises KeyError before the merge runs).
# The two scored frames only receive rows whose verdict is not '?'; the
# "with errors" frame receives every row.
df_merge = df_babe.merge(four_shot_cot[four_shot_cot['4_shot_cot_label'] != '?'][['text', '4_shot_cot_label']], on='text')
df_merge_all_runs = df_merge_all_runs.merge(four_shot_cot[four_shot_cot['4_shot_cot_label'] != '?'][['text', '4_shot_cot_label']], on='text')
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(four_shot_cot[['text', '4_shot_cot_label']], on='text')

# Integer vectors handed to the sklearn metric functions.
ground_truth = df_merge['label'].astype(int)
four_shot_cot_label = df_merge['4_shot_cot_label'].astype(int)

In [213]:
# Print each metric for the 4-shot CoT run, ground truth first as sklearn
# expects (y_true, y_pred).
for caption, metric in (
    ("F1-Score with GPT 4 turbo with (4 shot CoT): ", f1_score),
    ("Precision with GPT 4 turbo with (4 shot CoT): ", precision_score),
    ("Recall with GPT 4 turbo with (4 shot CoT): ", recall_score),
    ("Accuracy with GPT 4 turbo with (4 shot CoT): ", accuracy_score),
):
    print(caption, metric(ground_truth, four_shot_cot_label))

F1-Score with GPT 4 turbo with (4 shot CoT):  0.8016401845207587
Precision with GPT 4 turbo with (4 shot CoT):  0.8074341765616934
Recall with GPT 4 turbo with (4 shot CoT):  0.7959287531806616
Accuracy with GPT 4 turbo with (4 shot CoT):  0.8046441191317516


# 8-shot CoT

In [141]:
# Show the 8-shot CoT rows whose label is still the '?' placeholder.
eight_shot_cot[eight_shot_cot['label'] == '?']

Unnamed: 0,text,response,raw_data,query,label
1,"""You think I'm joking,"" he continued. ""","The sentence '""You think I'm joking,"" he conti...","{\n ""id"": ""chatcmpl-8Z21oJBLq1FkrDNEbEmHJgbbb...","[{'role': 'system', 'content': 'You are an exp...",?
3,Americans are carrying $1.57 trillion in outst...,The sentence provides a comparison of debt fig...,"{\n ""id"": ""chatcmpl-8Z2214gPmpUxkmnObDJk25BQj...","[{'role': 'system', 'content': 'You are an exp...",?
19,"But with the can of worms now open, Democrats ...","The sentence uses metaphorical language (""can ...","{\n ""id"": ""chatcmpl-8Z23djljOzi2T5cVUgpsieNmH...","[{'role': 'system', 'content': 'You are an exp...",?
22,"irearms kill almost 1,300 American youngsters ...",The sentence presents statistical information ...,"{\n ""id"": ""chatcmpl-8Z241XC3SqrRv8IWHcWH504zU...","[{'role': 'system', 'content': 'You are an exp...",?
26,Just look at some of the arguments that Elizab...,The sentence presents a summary of Elizabeth S...,"{\n ""id"": ""chatcmpl-8Z24RvnbG2bLuxxdr74JUtJZA...","[{'role': 'system', 'content': 'You are an exp...",?
...,...,...,...,...,...
3993,Military ships and aircraft have been deployed...,The sentence provides a factual report on the ...,"{\n ""id"": ""chatcmpl-8Z7z1jEXsoYND48LzKs9guCeD...","[{'role': 'system', 'content': 'You are an exp...",?
4004,"Minhaj, who used to appear on “The Daily Show”...",The sentence provides a factual account of Min...,"{\n ""id"": ""chatcmpl-8Z807OJ4mUtvgyLNkOUHwsNdX...","[{'role': 'system', 'content': 'You are an exp...",?
4007,"In June, an explosive early morning fire rocke...",The sentence describes an event using vivid la...,"{\n ""id"": ""chatcmpl-8Z80Qzx6m0E39XepjROpzyYVo...","[{'role': 'system', 'content': 'You are an exp...",?
4019,Sen. Tom Cotton (R-AR) says it “makes absolute...,The sentence presents a viewpoint from Senator...,"{\n ""id"": ""chatcmpl-8Z81h14sqMYCt89hESmMTwXrs...","[{'role': 'system', 'content': 'You are an exp...",?


In [142]:
# preprocessing
def update_label(row):
    """
    Resolves an undecided ('?') label from the text of the model response.

    Rows whose label is anything other than '?' are returned unchanged
    without inspecting the response. For undecided rows the response is
    first matched against a set of known opening phrases; if none match,
    the result of ``find_first_occurrence`` (defined elsewhere in this
    notebook) is compared against the two "The answer is ..." phrases.

    Args:
        row: A mapping-like row (e.g. a DataFrame row) with the keys
            'response' and 'label'.

    Returns:
        'BIASED' or 'NOT BIASED' if a verdict could be derived, otherwise
        the row's existing label.
    """
    current_label = row['label']
    if current_label != '?':
        # Already decided: nothing to parse.
        return current_label

    response = row['response']

    # No response can start with an entry from both tuples, so the order of
    # the two checks below does not matter.
    biased_openers = (
        'BIASED',
        'Classification: BIASED',
        'Classify the sentence above as BIASED or NOT BIASED.\n\nOutput: BIASED',
    )
    not_biased_openers = (
        'NOT BIASED',
        'Classification: NOT BIASED',
        'The sentence is NOT BIASED',
        'The sentence is not biased',
        'Classify the sentence above as BIASED or NOT BIASED.\n\nOutput: NOT BIASED',
    )
    if response.startswith(biased_openers):
        return 'BIASED'
    if response.startswith(not_biased_openers):
        return 'NOT BIASED'

    # Fall back to the helper once and compare its result to both phrases.
    verdict = find_first_occurrence(response)
    if verdict == "The answer is BIASED":
        return 'BIASED'
    if verdict == "The answer is NOT BIASED":
        return 'NOT BIASED'
    return current_label


# Recompute the label row by row, then list the rows that remain '?'.
eight_shot_cot['label'] = eight_shot_cot.apply(update_label, axis='columns')
eight_shot_cot[eight_shot_cot['label'] == '?']

Unnamed: 0,text,response,raw_data,query,label
1,"""You think I'm joking,"" he continued. ""","The sentence '""You think I'm joking,"" he conti...","{\n ""id"": ""chatcmpl-8Z21oJBLq1FkrDNEbEmHJgbbb...","[{'role': 'system', 'content': 'You are an exp...",?
37,The Trump administration gave the Border Patro...,The sentence presents a factual statement rega...,"{\n ""id"": ""chatcmpl-8Z25ULR8aglDE1QvXgXwOHLbw...","[{'role': 'system', 'content': 'You are an exp...",?
75,The ridicule is key here.,"The sentence ""The ridicule is key here"" is too...","{\n ""id"": ""chatcmpl-8Z29Ojm3oMYVBvE7jjM6W0Gcq...","[{'role': 'system', 'content': 'You are an exp...",?
79,The antifa movement — a network of loosely org...,The sentence provides a definition of the anti...,"{\n ""id"": ""chatcmpl-8Z29jars1yB4XDPIO4mJwY7jg...","[{'role': 'system', 'content': 'You are an exp...",?
194,Court filings show the NRA is in shambles — an...,The sentence presents information that could b...,"{\n ""id"": ""chatcmpl-8Z2LYOIryw1diw5oyBmvcBbt5...","[{'role': 'system', 'content': 'You are an exp...",?
...,...,...,...,...,...
3717,She’s not the only one pushing the talking poi...,The sentence in isolation is ambiguous and lac...,"{\n ""id"": ""chatcmpl-8Z7ZtwhWuclRmpxcaO529XqVV...","[{'role': 'system', 'content': 'You are an exp...",?
3752,President Donald Trump bragged on Sunday that ...,The sentence reports a statement made by Presi...,"{\n ""id"": ""chatcmpl-8Z7d9Y8l4P5qXNywrwFVDanBV...","[{'role': 'system', 'content': 'You are an exp...",?
3778,"In a press conference on Wednesday, Trump trie...",The sentence could be interpreted as biased du...,"{\n ""id"": ""chatcmpl-8Z7feLQb2dafufVIPHPNcoVet...","[{'role': 'system', 'content': 'You are an exp...",?
3786,"At the time, Moynihan was primarily concerned ...",The sentence presents a historical perspective...,"{\n ""id"": ""chatcmpl-8Z7gQiMBvdi7LyzbmUAbeSDkB...","[{'role': 'system', 'content': 'You are an exp...",?


In [143]:
# Manual relabelling: overwrite 'label' for the rows listed by index.
# presumably decided by reading each response by hand — TODO confirm
eight_shot_cot.loc[[3752, 3778, 256, 3497, 3620, 338, 501,
                    2664, 2440, 2211, 1454, 1430, 1127,
                    928, 880, 834, ], 'label'] = 'BIASED'
eight_shot_cot.loc[[1, 37, 79, 298, 3627, 3633, 364, 3113,
                    3184, 544, 2905, 2882, 2865, 2732, 2709,
                    2665, 2663, 2572, 2365, 2313, 2272, 2082,
                    2044, 2019, 1963, 1847, 1831, 1827, 1612, 
                    1512, 1369, 1280, 1251, 858, ], 'label'] = 'NOT BIASED'

# Not assignable (or context dependent): these rows are marked '?'.
# This assignment runs last, so it would win for any index also listed above.
eight_shot_cot.loc[[194, 3717, 3786, 3837, 75, 211, 332,
                    3580, 371, 404, 454, 3270, 3424, 3439,
                    533, 610, 735, 2954, 2983, 2991, 3018,
                    3106, 2926, 2779, 2777, 2630, 2582,
                    2529, 2159, 2117, 1825, 1810, 1736,
                    1412, 1399, 1241, 1100, 1080, 1068,
                    793], 'label'] = '?'

# we use what GPT leans towards X as label

# Display the full response text of row 793 for inspection.
eight_shot_cot.loc[793]['response']

'The sentence presents a series of events that occurred during the George Floyd protests, focusing on the negative aspects such as "riots," "looting," "buildings being set on fire," "businesses destroyed," "the White House going on lockdown," and "people being beaten and killed." While these events did occur, the use of the word "devolved" suggests a negative progression from peaceful protests to chaos, which could be seen as a biased interpretation of the events. The sentence does not provide context or mention any peaceful protests, which could lead to a one-sided view of the situation. However, the sentence could also be interpreted as a factual recounting of specific incidents that occurred without additional commentary or subjective language. The determination of bias in this case may depend on the broader context in which the sentence is used and whether it is part of a balanced account of the protests. If the sentence is part of a narrative that consistently portrays the protest

In [146]:
# Encode the 8-shot CoT verdicts numerically (BIASED -> 1, NOT BIASED -> 0);
# any other value, such as '?', is left as it is.
eight_shot_cot = eight_shot_cot.rename(columns={"label": "8_shot_cot_label"})
eight_shot_cot['8_shot_cot_label'] = eight_shot_cot['8_shot_cot_label'].replace(
    {'BIASED': 1, 'NOT BIASED': 0})

# The two scored frames only receive rows whose verdict is not '?'; the
# "with errors" frame receives every row. All joins match on the sentence text.
df_merge = df_babe.merge(
    eight_shot_cot.loc[eight_shot_cot['8_shot_cot_label'] != '?',
                       ['text', '8_shot_cot_label']],
    on='text',
)
df_merge_all_runs = df_merge_all_runs.merge(
    eight_shot_cot.loc[eight_shot_cot['8_shot_cot_label'] != '?',
                       ['text', '8_shot_cot_label']],
    on='text',
)
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(
    eight_shot_cot.loc[:, ['text', '8_shot_cot_label']],
    on='text',
)

# Integer vectors handed to the sklearn metric functions.
ground_truth, eight_shot_cot_label = (
    df_merge[column].astype(int) for column in ('label', '8_shot_cot_label')
)

In [147]:
# Print each metric for the 8-shot CoT run, ground truth first as sklearn
# expects (y_true, y_pred).
for caption, metric in (
    ("F1-Score with GPT 4 turbo with (8 shot CoT): ", f1_score),
    ("Precision with GPT 4 turbo with (8 shot CoT): ", precision_score),
    ("Recall with GPT 4 turbo with (8 shot CoT): ", recall_score),
    ("Accuracy with GPT 4 turbo with (8 shot CoT): ", accuracy_score),
):
    print(caption, metric(ground_truth, eight_shot_cot_label))

F1-Score with GPT 4 turbo with (8 shot CoT):  0.8003088008234688
Precision with GPT 4 turbo with (8 shot CoT):  0.8128593831677993
Recall with GPT 4 turbo with (8 shot CoT):  0.7881398884946782
Accuracy with GPT 4 turbo with (8 shot CoT):  0.805074101984426


# Comparison and plots

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(ax, df, true_labels_column, predicted_labels_column,
                          title=None
                          ):
    """
    Draws the confusion matrix of two label columns as a heatmap.

    Args:
        ax: The matplotlib axes to draw on.
        df: The frame holding both label columns.
        true_labels_column: Name of the column with the reference labels.
        predicted_labels_column: Name of the column with the predictions.
        title: Optional panel caption; when empty or omitted the name of
            the prediction column is used.
    """
    y_pred = df[f'{predicted_labels_column}']
    y_true = df[f'{true_labels_column}']

    heatmap_options = dict(
        annot=True,  # write the count into each cell
        fmt='d',  # counts are integers
        cmap='Blues',
        xticklabels=True,
        yticklabels=True,
        ax=ax,
    )
    sns.heatmap(confusion_matrix(y_true, y_pred), **heatmap_options)

    caption = title or predicted_labels_column
    ax.set_title(f'Confusion Matrix - {caption}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')


# One figure with a 2x3 grid of panels under a shared title.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 12))
fig.suptitle('Confusion Matrices')

# (prediction column, panel caption), listed in row-major panel order.
# NOTE(review): df_merge was last rebuilt from df_babe plus the 8-shot CoT
# column; confirm it actually contains the prediction columns named here.
confusion_panels = (
    ('0_shot_label', '0_shot'),
    ('0_shot_with_system_label', '0_shot_with_system'),
    ('0_shot_cot_label', '0_shot_cot'),
    ('2_shot_label', '2_shot'),
    ('4_shot_label', '4_shot'),
    ('8_shot_label', '8_shot'),
)
for panel_ax, (prediction_column, panel_caption) in zip(axes.flat,
                                                        confusion_panels):
    plot_confusion_matrix(panel_ax, df_merge, 'label', prediction_column,
                          panel_caption)

# Leave the top 4% of the figure free so the shared title does not overlap.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

: 

# Krippendorff Alpha in model

In [None]:
import krippendorff

# Prediction columns treated as the "raters" in the agreement analysis.
runs = [
    '0_shot_label',
    '0_shot_with_system_label',
    '0_shot_cot_label',
    '2_shot_label',
    '4_shot_label',
    '8_shot_label',
]


def compute_krippendorff_alpha(df, predicted_columns):
    """
    Computes Krippendorff's alpha (nominal) between prediction columns.

    Each column in ``predicted_columns`` is passed to ``krippendorff.alpha``
    as one entry of ``reliability_data``, i.e. one rater's ratings over the
    rows of ``df``.

    Args:
        df: The frame holding the prediction columns.
        predicted_columns: Names of the columns to compare.

    Returns:
        The alpha value, or 0 when the selected columns contain at most one
        distinct (non-missing) value overall, including the empty case.
    """
    ratings_by_run = {run: df[run] for run in predicted_columns}

    # Only the degenerate case is short-circuited: with a single distinct
    # value there is no variability to measure.
    # NOTE(review): recent krippendorff releases appear to reject a
    # single-value domain — confirm against the installed version.
    # Runs that agree on every row but use several labels are NOT caught
    # here; they are passed on so that perfect agreement can be scored.
    distinct_values = set()
    for ratings in ratings_by_run.values():
        distinct_values.update(ratings.dropna().unique())
    if len(distinct_values) <= 1:
        return 0

    return krippendorff.alpha(reliability_data=list(ratings_by_run.values()),
                              level_of_measurement='nominal')


: 

In [None]:
# Agreement across every configured run at once.
alpha_value = compute_krippendorff_alpha(df=df_merge, predicted_columns=runs)
print(f"Krippendorff's Alpha (all runs): {alpha_value}")

: 

In [None]:
import itertools

def compute_krippendorff_alpha_for_k_runs(df, runs, k=None):
    """
    Finds the k-sized subset of runs with the highest Krippendorff's alpha.

    Every combination of ``k`` entries from ``runs`` is scored with
    ``compute_krippendorff_alpha`` and printed; the best one is printed at
    the end.

    Args:
        df: The frame holding the prediction columns.
        runs: Names of the prediction columns to choose from.
        k: Size of each combination; defaults to ``len(runs)``.

    Returns:
        A tuple ``(best_alpha, best_combination)``. When there is no
        combination to score (e.g. ``k > len(runs)``), ``(0, None)``.
    """
    if k is None:
        k = len(runs)

    best_combination = None
    # Starts undefined rather than at 0: alpha can be negative, and the
    # highest-scoring combination must be reported even in that case.
    best_alpha = None

    for combination in itertools.combinations(runs, k):
        alpha_value = compute_krippendorff_alpha(df, list(combination))

        print(f"Combination: {combination}, Alpha: {alpha_value}")

        # Strict comparison keeps the first combination among ties.
        if best_alpha is None or alpha_value > best_alpha:
            best_alpha = alpha_value
            best_combination = combination

    if best_alpha is None:
        # Nothing was scored.
        best_alpha = 0

    print(f"\nBest Combination: {best_combination}, Best Alpha: {best_alpha}")
    return best_alpha, best_combination

: 

In [None]:
# Score every pair of runs and report the pair with the highest agreement.
compute_krippendorff_alpha_for_k_runs(df_merge, runs, k=2)

: 