# Eval the results of Llama 13b

In [6]:
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import f1_score, precision_score, recall_score, \
    accuracy_score

import re
from typing import List, Tuple

import pandas as pd


def is_contained(inner_start, inner_end, outer_start, outer_end):
    return (
        outer_start <= inner_start <= outer_end
        and outer_start <= inner_end <= outer_end
        and (inner_start, inner_end) != (outer_start, outer_end)
    )


def find_all_occurrences_indices(
    sentence: str,
    label: str,
) -> List[Tuple[int, int]]:
    """
    Finds all occurrences of a label in a sentence and returns a list of tuples
    with the start and end indices of the occurrences.

    Args:
        sentence: The sentence to search in.
        label: The label to search for.

    Returns:
        A list of tuples with the start and end indices of the occurrences.
    """
    return [
        (int(match.start()), int(match.end()))
        for match in re.finditer(label, sentence)
    ]


def find_contained_labels(labels):
    """
    Finds all labels that are contained in other labels.

    Args:
        labels: A list of labels.

    Returns:
        A dictionary with the labels as keys and a list of labels that are
        contained in the key label as values.
    """
    return {
        label_1: [
            label2
            for label2 in labels
            if label_1 in label2 and label_1 != label2
        ]
        for label_1 in labels
    }


def find_labels_in_sentence(
    sentence: str,
    labels: List[str],
) -> List[List[Tuple[int, int]]]:
    """
    Finds all occurrences of the labels in a sentence and returns a list of
    lists with the occurrences of each label.

    Labels that are contained in other labels are not returned.

    Args:
        sentence: The sentence to search in.
        labels: The labels to search for.

    Returns:
        A list of lists with the positional occurrence of each label.
    """
    if not labels:
        return []

    containable_map = find_contained_labels(labels)
    sentence_lower = sentence.lower()
    occurrences = [
        find_all_occurrences_indices(sentence_lower, label.lower())
        for label in labels
    ]
    label_occurrences = dict(zip(labels, occurrences))
    included_list = []
    for label, inner_pos in zip(labels, occurrences):
        contained = set()
        containing_labels = containable_map.get(label, set())

        for bigger_label in containing_labels:
            locations: List[Tuple[int, int]] = label_occurrences.get(
                bigger_label,
                list(),
            )

            for inner_start, inner_end in inner_pos:
                for outer_start, outer_end in locations:
                    if is_contained(
                        inner_start,
                        inner_end,
                        outer_start,
                        outer_end,
                    ):
                        contained.add((inner_start, inner_end))
        included_list.append(list(contained))

    return [
        list(set(pos) - set(included))
        for pos, included in zip(occurrences, included_list)
    ]


def find_label(
    sentence: str,
    labels: List[str],
    default_label: str = "?",
) -> str:
    """
    Search for given labels in the sentence and returns it if found. If only
    one label occur in the sentence, it will be returned. If no label or
    different labels occur in the sentence, '?' is returned.

    Args:
        sentence: The sentence to search in.
        labels: The labels to search for.
        default_label: The label to return if no label or different labels
            occur in the sentence.

    Returns:
        The label that occurs in the sentence or '?' if no label occurs in the
        sentence.
    """
    occurrences = find_labels_in_sentence(sentence=sentence, labels=labels)
    non_empty_indices = [i for i, sublist in enumerate(occurrences) if sublist]
    return (
        labels[non_empty_indices[0]]
        if len(
            non_empty_indices,
        )
           == 1
        else default_label
    )


def _soft_parse(
    df: pd.DataFrame,
    in_col: str,
    parsed_col: str,
    labels: List[str] = None,
) -> pd.DataFrame:
    if labels is None:
        raise ValueError("Labels are not set!")

    df[parsed_col] = df[in_col].apply(
        lambda x: find_label(x, labels),
    )


In [7]:

# load
model = 'Llama-2-13b-chat-hf'

# load results
zero_shot = pd.read_csv(f"../zero-shot/data/{model}.csv")
zero_shot_with_system = pd.read_csv(
    f"../zero-shot-system_prompt/data/{model}.csv")
#zero_shot_cot = pd.read_csv(f"../zero-shot-cot/data/{model}.csv")
two_shot = pd.read_csv(f"../2-shot/data/{model}.csv")
four_shot = pd.read_csv(f"../4-shot/data/{model}.csv")
eight_shot = pd.read_csv(f"../8-shot/data/{model}.csv")

two_shot_cot = pd.read_csv(f"../2-shot-CoT/data/{model}.csv")
four_shot_cot = pd.read_csv(f"../4-shot-CoT/data/{model}.csv")
eight_shot_cot = pd.read_csv(f"../8-shot-CoT/data/{model}.csv")

#load pool
pool = load_dataset('mediabiasgroup/BABE-icl-pool')['train'].to_pandas()

# exclude pool from model (if needed)
zero_shot = zero_shot.merge(pool['text'], on='text', how='left',
                            indicator=True).query(
    '_merge == "left_only"').drop('_merge', axis=1)
zero_shot_with_system = zero_shot_with_system.merge(pool['text'], on='text',
                                                    how='left',
                                                    indicator=True).query(
    '_merge == "left_only"').drop('_merge', axis=1)
#zero_shot_cot = zero_shot_cot.merge(pool['text'], on='text', how='left',
#                                    indicator=True).query(
#    '_merge == "left_only"').drop('_merge', axis=1)
two_shot = two_shot.merge(pool['text'], on='text', how='left',
                          indicator=True).query('_merge == "left_only"').drop(
    '_merge', axis=1)
four_shot = four_shot.merge(pool['text'], on='text', how='left',
                            indicator=True).query(
    '_merge == "left_only"').drop('_merge', axis=1)
eight_shot = eight_shot.merge(pool['text'], on='text', how='left',
                              indicator=True).query(
    '_merge == "left_only"').drop('_merge', axis=1)
two_shot_cot = two_shot_cot.merge(pool['text'], on='text', how='left',
                              indicator=True).query(
    '_merge == "left_only"').drop('_merge', axis=1)
four_shot_cot = four_shot_cot.merge(pool['text'], on='text', how='left',
                              indicator=True).query(
    '_merge == "left_only"').drop('_merge', axis=1)
eight_shot_cot = eight_shot_cot.merge(pool['text'], on='text', how='left',
                              indicator=True).query(
    '_merge == "left_only"').drop('_merge', axis=1)


#load babe
dataset = load_dataset('mediabiasgroup/BABE-v4')
df_babe = pd.DataFrame(dataset['train'])

# df_merge = babe at begin
df_merge_all_runs = df_babe
df_merge_all_runs_with_errors = df_babe


def find_first_occurrence(string, instruction="Instruction:",
                          statement1="The answer is BIASED",
                          statement2="The answer is NOT BIASED"
                          ):
    index_instruction = string.find(instruction)
    index1 = string.find(statement1)
    index2 = string.find(statement2)

    if index1 == -1 and index2 == -1:
        return "Neither instruction nor statements found in the given string."
    elif index1 == -1 and index2 == -1:
        return "Neither statement found in the given string."
    elif index1 == -1:
        if index_instruction == -1 or index2 < index_instruction:
            return f"{statement2}"
        else:
            return "After Instruction"
    elif index2 == -1:
        if index_instruction == -1 or index1 < index_instruction:
            return f"{statement1}"
        else:
            return "After Instruction"
    elif index1 < index2:
        if index_instruction == -1 or index1 < index_instruction:
            return f"{statement1}"
        else:
            return "After Instruction"
    else:
        if index_instruction == -1 or index2 < index_instruction:
            return f"{statement2}"
        else:
            return "After Instruction"



# Zero shot

In [8]:
_soft_parse(zero_shot, 'response', 'label',
            ['BIASED', 'NOT BIASED'])
zero_shot.query("label == '?'")

Unnamed: 0,text,response,raw_data,query,label
20,A Joe Biden presidency could reset ties with t...,NOT BIASED.\n\nReason: The sentence provides f...,Instruction: 'A Joe Biden presidency could res...,Instruction: 'A Joe Biden presidency could res...,?
31,Nobel laureate Paul Krugman breaks down Trump'...,NOT BIASED\n\nReason: The sentence simply repo...,Instruction: 'Nobel laureate Paul Krugman brea...,Instruction: 'Nobel laureate Paul Krugman brea...,?
62,"The leader of India, Prime Minister Narendra M...",NOT BIASED\n\nExplanation: The sentence provid...,"Instruction: 'The leader of India, Prime Minis...","Instruction: 'The leader of India, Prime Minis...",?
74,"Earlier in the month, Alphabet Inc's GOOGL.O G...",NOT BIASED\n\nRationale: The sentence is not b...,"Instruction: 'Earlier in the month, Alphabet I...","Instruction: 'Earlier in the month, Alphabet I...",?
77,Despite Democrats regularly characterizing Ame...,NOT BIASED\n\nExplanation: The sentence is NOT...,Instruction: 'Despite Democrats regularly char...,Instruction: 'Despite Democrats regularly char...,?
...,...,...,...,...,...
4013,"Trump, of course, is his most stunning when he...",NOT BIASED\n\nExplanation: The sentence is NOT...,"Instruction: 'Trump, of course, is his most st...","Instruction: 'Trump, of course, is his most st...",?
4019,If your “rights” mean taking everyone else’s r...,NOT BIASED\n\nReason: The sentence is expressi...,Instruction: 'If your “rights” mean taking eve...,Instruction: 'If your “rights” mean taking eve...,?
4028,"A cop shoots a Black man, and a police union f...",NOT BIASED\n\nReason: The sentence simply repo...,"Instruction: 'A cop shoots a Black man, and a ...","Instruction: 'A cop shoots a Black man, and a ...",?
4030,President Donald Trump campaigned in the battl...,NOT BIASED\n\nExplanation: The sentence is fac...,Instruction: 'President Donald Trump campaigne...,Instruction: 'President Donald Trump campaigne...,?


In [9]:
# preprocessing
def update_label(row):
    if row['response'].startswith('BIASED') and row['label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    elif row['response'].startswith('The sentence is BIASED') and row['label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('The sentence is NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    else:
        return row['label']


zero_shot['label'] = zero_shot.apply(update_label, axis=1)
zero_shot.query("label == '?'")

Unnamed: 0,text,response,raw_data,query,label


In [10]:
zero_shot = zero_shot.rename(columns={"label": "0_shot_label"})
zero_shot['0_shot_label'] = zero_shot['0_shot_label'].replace('BIASED', 1)
zero_shot['0_shot_label'] = zero_shot['0_shot_label'].replace('NOT BIASED', 0)

df_merge = df_babe.merge(zero_shot[zero_shot['0_shot_label'] != '?'][['text', '0_shot_label']], on='text')
df_merge_all_runs = df_merge_all_runs.merge(zero_shot[zero_shot['0_shot_label'] != '?'][['text', '0_shot_label']], on='text')
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(zero_shot[['text', '0_shot_label']], on='text')

ground_truth = df_merge['label']
zero_shot_label = df_merge['0_shot_label']

In [11]:
print("F1-Score with TODO: ", f1_score(ground_truth, zero_shot_label))
print("Precision with TODO: ",
      precision_score(ground_truth, zero_shot_label))
print("Recall with TODO: ",
      recall_score(ground_truth, zero_shot_label))
print("Accuracy with TODO: ",
      accuracy_score(ground_truth, zero_shot_label))

F1-Score with TODO:  0.6028280823616968
Precision with TODO:  0.5955882352941176
Recall with TODO:  0.6102461074836766
Accuracy with TODO:  0.6018403382243223


# zero shot with system prompt

In [12]:
zero_shot_with_system.query("label == '?'")

Unnamed: 0,text,response,raw_data,query,label
1,"""You think I'm joking,"" he continued. """,BIASED or NOT BIASED\n\nReason: \n\n(if BIASED...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
3,Americans are carrying $1.57 trillion in outst...,BIASED or NOT BIASED\n\nReasoning: \n\n(option...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
5,"Meanwhile, a pervasive celebrity culture — in ...",1 (BIASED) or 0 (NOT BIASED)\n\nPlease explain...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
7,New York City Mayor Bill de Blasio (D) suggest...,BIASED or NOT BIASED\n\nExplanation (optional)...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
9,Four states will hold their primary contests o...,BIASED or NOT BIASED\n\nPlease provide a brief...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
...,...,...,...,...,...
4114,"Larry Kudlow, one of President Donald Trump’s ...",BIASED or NOT BIASED.,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4115,Trump sought to use the event to bring momentu...,BIASED or NOT BIASED\n\nPlease provide a brief...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4118,When Republicans warned Democrats that impeach...,BIASED or NOT BIASED.\n\nPlease provide a brie...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4119,Sen. Tom Cotton (R-AR) says it “makes absolute...,BIASED or NOT BIASED\n\nReasoning: \n\n(option...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?


In [13]:
# preprocessing
def update_label(row):
    if row['response'].startswith('BIASED') and row['label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    elif row['response'].startswith('The sentence is BIASED') and row['label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('The sentence is NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    else:
        return row['label']


zero_shot_with_system['label'] = zero_shot_with_system.apply(update_label, axis=1)
zero_shot_with_system.query("label == '?'")

Unnamed: 0,text,response,raw_data,query,label
5,"Meanwhile, a pervasive celebrity culture — in ...",1 (BIASED) or 0 (NOT BIASED)\n\nPlease explain...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
10,"And, of course, along the way, the hipster CEO...",1 (BIASED) or 0 (NOT BIASED)\n\nPlease provide...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
28,Wray’s public comments come at a time of heigh...,1 (BIASED) or 0 (NOT BIASED)\n\nPlease explain...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
36,"Since the incident, the mainstream media — in ...",1 (BIASED) or 0 (NOT BIASED)\n\nPlease explain...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
45,"As the debt crisis unfolds, the idea of allowi...",[Bias Score]\n[Bias Type]\n\nPlease provide a ...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
...,...,...,...,...,...
4068,Privacy advocates on Monday raised concerns ov...,"1 for BIASED, 0 for NOT BIASED.",You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4082,Left unsaid is that Obamacare — through its my...,Explain your reasoning.\n\nPlease provide your...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4085,Spiers continues: “Researchers have a term for...,1 (BIASED) or 0 (NOT BIASED)\n\nPlease explain...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4090,Lesh pins the blame mainly on the statist bure...,Explain your reasoning.\n\nPlease note that th...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?


In [14]:
zero_shot_with_system = zero_shot_with_system.rename(
    columns={"label": "0_shot_with_system_label"})
zero_shot_with_system['0_shot_with_system_label'] = zero_shot_with_system[
    '0_shot_with_system_label'].replace('BIASED', 1)
zero_shot_with_system['0_shot_with_system_label'] = zero_shot_with_system[
    '0_shot_with_system_label'].replace('NOT BIASED', 0)

df_merge = df_babe.merge(zero_shot_with_system[zero_shot_with_system['0_shot_with_system_label'] != '?'][['text', '0_shot_with_system_label']], on='text')
df_merge_all_runs = df_merge_all_runs.merge(zero_shot_with_system[zero_shot_with_system['0_shot_with_system_label'] != '?'][['text', '0_shot_with_system_label']], on='text')
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(zero_shot_with_system[['text', '0_shot_with_system_label']], on='text')

ground_truth = df_merge['label'].astype(int)
zero_shot_with_system_label = df_merge['0_shot_with_system_label'].astype(int)

In [15]:
print("F1-Score with TODO with System Prompt: ",
      f1_score(ground_truth, zero_shot_with_system_label))
print("Precision with TODO with System Prompt: ",
      precision_score(ground_truth, zero_shot_with_system_label))
print("Recall with TODO with System Prompt: ",
      recall_score(ground_truth, zero_shot_with_system_label))
print("Accuracy with TODO with System Prompt: ",
      accuracy_score(ground_truth, zero_shot_with_system_label))

F1-Score with TODO with System Prompt:  0.6505542431401055
Precision with TODO with System Prompt:  0.4839145715058124
Recall with TODO with System Prompt:  0.9922394678492239
Accuracy with TODO with System Prompt:  0.4858288770053476


# zero shot CoT

In [16]:
zero_shot_cot.query("label == '?'")

NameError: name 'zero_shot_cot' is not defined

In [None]:
# preprocessing

In [None]:
zero_shot_cot = zero_shot_cot.rename(columns={"label": "0_shot_cot_label"})
zero_shot_cot['0_shot_cot_label'] = zero_shot_cot['0_shot_cot_label'].replace(
    'BIASED', 1)
zero_shot_cot['0_shot_cot_label'] = zero_shot_cot['0_shot_cot_label'].replace(
    'NOT BIASED', 0)

df_merge = df_babe.merge(zero_shot_cot[zero_shot_cot['0_shot_cot_label'] != '?'][['text', '0_shot_cot_label']], on='text')
df_merge_all_runs = df_merge_all_runs.merge(zero_shot_cot[zero_shot_cot['0_shot_cot_label'] != '?'][['text', '0_shot_cot_label']], on='text')
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(zero_shot_cot[['text', '0_shot_cot_label']], on='text')

ground_truth = df_merge['label'].astype(int)
zero_shot_cot_label = df_merge['0_shot_cot_label'].astype(int)

In [None]:
print("F1-Score with TODO with CoT: ",
      f1_score(ground_truth, zero_shot_cot_label))
print("Precision with TODO with CoT: ",
      precision_score(ground_truth, zero_shot_cot_label))
print("Recall with TODO with CoT: ",
      recall_score(ground_truth, zero_shot_cot_label))
print("Accuracy with TODO with CoT: ",
      accuracy_score(ground_truth, zero_shot_cot_label))

# 2-shot

In [None]:
two_shot.query("label == '?'")

Unnamed: 0,text,response,raw_data,query,label
1,"""You think I'm joking,"" he continued. """,BIASED\n\nInstruction: 'The president's suppor...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
2,"Yes, the rich are rich and want to stay that w...",BIASED\n\nInstruction: 'The Left is not intere...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4,Republican state legislators from Texas to Ari...,BIASED\n\nInstruction: 'The CDC has recommende...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
10,"And, of course, along the way, the hipster CEO...",BIASED\n\nInstruction: 'The CEO of a major cor...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
11,"On Monday, the Supreme Court voted in an 8-1 d...",BIASED\n\nInstruction: 'The Supreme Court’s de...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
...,...,...,...,...,...
4005,"The night before the rant, the Jesus Is King r...",BIASED\n\nInstruction: 'The Jesus Is King rapp...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4006,Elizabeth Warren and Bernie Sanders disagreed ...,BIASED\n\nInstruction: 'Sanders has been gaini...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4009,"In her Instagram story, Jedrzejczyk posted an ...",BIASED\n\nInstruction: 'The World Health Organ...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4011,"Facebook posts, shared thousands of times, cla...",BIASED\n\nInstruction: 'The World Health Organ...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?


In [None]:
# preprocessing
def update_label(row):
    if row['response'].startswith('BIASED') and row['label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    else:
        return row['label']


two_shot['label'] = two_shot.apply(update_label, axis=1)
two_shot.query("label == '?'")

Unnamed: 0,text,response,raw_data,query,label


In [None]:
two_shot = two_shot.rename(columns={"label": "2_shot_label"})
two_shot['2_shot_label'] = two_shot['2_shot_label'].replace('BIASED', 1)
two_shot['2_shot_label'] = two_shot['2_shot_label'].replace('NOT BIASED', 0)

df_merge = df_babe.merge(two_shot[two_shot['2_shot_label'] != '?'][['text', '2_shot_label']], on='text')
df_merge_all_runs = df_merge_all_runs.merge(two_shot[two_shot['2_shot_label'] != '?'][['text', '2_shot_label']], on='text')
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(two_shot[['text', '2_shot_label']], on='text')

ground_truth = df_merge['label'].astype(int)
two_shot_label = df_merge['2_shot_label'].astype(int)

In [None]:
print("F1-Score with TODO with (2 shot): ",
      f1_score(ground_truth, two_shot_label))
print("Precision with TODO with (2 shot): ",
      precision_score(ground_truth, two_shot_label))
print("Recall with TODO with (2 shot): ",
      recall_score(ground_truth, two_shot_label))
print("Accuracy with TODO with (2 shot): ",
      accuracy_score(ground_truth, two_shot_label))

F1-Score with TODO with (2 shot):  0.699244966442953
Precision with TODO with (2 shot):  0.6002880806625855
Recall with TODO with (2 shot):  0.8372677046710196
Accuracy with TODO with (2 shot):  0.6433722954488933


# 4-shot

In [None]:
four_shot.query("label == '?'")

Unnamed: 0,text,response,raw_data,query,label
0,But instead of closing loopholes that allow cr...,NOT BIASED\n\nInstruction: 'The NRA has been a...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
1,"""You think I'm joking,"" he continued. """,NOT BIASED\n\nInstruction: 'The president’s li...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
2,"Yes, the rich are rich and want to stay that w...",BIASED\n\nInstruction: 'The Left has been succ...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
3,Americans are carrying $1.57 trillion in outst...,NOT BIASED\n\nInstruction: 'The U.S. economy i...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4,Republican state legislators from Texas to Ari...,BIASED\n\nInstruction: 'The Democratic Party i...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
...,...,...,...,...,...
4008,About one quarter of the world’s population li...,"NOT BIASED\n\nInstruction: 'The USCCB, which h...",You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4009,"In her Instagram story, Jedrzejczyk posted an ...",BIASED\n\nInstruction: 'The World Health Organ...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4010,"Yes, there may still be time for the union's m...",NOT BIASED\n\nInstruction: 'The Democratic Par...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4011,"Facebook posts, shared thousands of times, cla...",BIASED\n\nInstruction: 'The World Health Organ...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?


In [None]:
# preprocessing
def update_label(row):
    if row['response'].startswith('BIASED') and row['label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    else:
        return row['label']


four_shot['label'] = four_shot.apply(update_label, axis=1)
four_shot.query("label == '?'")

Unnamed: 0,text,response,raw_data,query,label


In [None]:
four_shot = four_shot.rename(columns={"label": "4_shot_label"})
four_shot['4_shot_label'] = four_shot['4_shot_label'].replace('BIASED', 1)
four_shot['4_shot_label'] = four_shot['4_shot_label'].replace('NOT BIASED', 0)

df_merge = df_babe.merge(four_shot[four_shot['4_shot_label'] != '?'][['text', '4_shot_label']], on='text')
df_merge_all_runs = df_merge_all_runs.merge(four_shot[four_shot['4_shot_label'] != '?'][['text', '4_shot_label']], on='text')
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(four_shot[['text', '4_shot_label']], on='text')

ground_truth = df_merge['label'].astype(int)
four_shot_label = df_merge['4_shot_label'].astype(int)

In [None]:
print("F1-Score with TODO with (4 shot): ",
      f1_score(ground_truth, four_shot_label))
print("Precision with TODO with (4 shot): ",
      precision_score(ground_truth, four_shot_label))
print("Recall with TODO with (4 shot): ",
      recall_score(ground_truth, four_shot_label))
print("Accuracy with TODO with (4 shot): ",
      accuracy_score(ground_truth, four_shot_label))

F1-Score with TODO with (4 shot):  0.7176368375325803
Precision with TODO with (4 shot):  0.6322234978951397
Recall with TODO with (4 shot):  0.8297338021094928
Accuracy with TODO with (4 shot):  0.6766973389704054


# 8-shot

In [None]:
eight_shot.query("label == '?'")

Unnamed: 0,text,response,raw_data,query,label
0,But instead of closing loopholes that allow cr...,BIASED\n\nInstruction: 'The National Rifle Ass...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
1,"""You think I'm joking,"" he continued. """,NOT BIASED\n\nInstruction: 'The president’s ap...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
2,"Yes, the rich are rich and want to stay that w...",BIASED\n\nInstruction: 'The president’s likely...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
3,Americans are carrying $1.57 trillion in outst...,NOT BIASED\n\nInstruction: 'The U.S. economy i...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
5,"Meanwhile, a pervasive celebrity culture — in ...",BIASED\n\nInstruction: 'The media has been acc...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
...,...,...,...,...,...
4013,"The outline, which puts numbers to several pol...","NOT BIASED\n\nInstruction: 'The new tax law, w...",You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4014,"Larry Kudlow, one of President Donald Trump’s ...",BIASED\n\nInstruction: 'The Trump administrati...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4015,Trump sought to use the event to bring momentu...,BIASED\n\nInstruction: 'President Donald Trump...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4016,Far-right talk show host Glenn Beck is being s...,BIASED\n\nInstruction: 'The COVID-19 pandemic ...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?


In [None]:
# preprocessing
# preprocessing
def update_label(row):
    if row['response'].startswith('BIASED') and row['label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    else:
        return row['label']


eight_shot['label'] = eight_shot.apply(update_label, axis=1)
eight_shot.query("label == '?'")

Unnamed: 0,text,response,raw_data,query,label


In [None]:
eight_shot = eight_shot.rename(columns={"label": "8_shot_label"})
eight_shot['8_shot_label'] = eight_shot['8_shot_label'].replace('BIASED', 1)
eight_shot['8_shot_label'] = eight_shot['8_shot_label'].replace('NOT BIASED',
                                                                0)

df_merge = df_babe.merge(eight_shot[eight_shot['8_shot_label'] != '?'][['text', '8_shot_label']], on='text')
df_merge_all_runs = df_merge_all_runs.merge(eight_shot[eight_shot['8_shot_label'] != '?'][['text', '8_shot_label']], on='text')
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(eight_shot[['text', '8_shot_label']], on='text')

ground_truth = df_merge['label']
eight_shot_label = df_merge['8_shot_label']

In [None]:
print("F1-Score with TODO with (8 shot): ",
      f1_score(ground_truth, eight_shot_label))
print("Precision with TODO with (8 shot): ",
      precision_score(ground_truth, eight_shot_label))
print("Recall with TODO with (8 shot): ",
      recall_score(ground_truth, eight_shot_label))
print("Accuracy with TODO with (8 shot): ",
      accuracy_score(ground_truth, eight_shot_label))

F1-Score with TODO with (8 shot):  0.7357780912082745
Precision with TODO with (8 shot):  0.6915598762704375
Recall with TODO with (8 shot):  0.7860371672526368
Accuracy with TODO with (8 shot):  0.7204675453867198


# 2-shot CoT

In [None]:
two_shot_cot.query("label == '?'")

Unnamed: 0,text,response,raw_data,query,label
13,"The legislation, worked out between leading la...",The sentence provides factual information abou...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
14,"Taken together, these trends leave many young ...",The sentence presents a bleak picture of the c...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
29,They’d be standing aside and waving them on in.,The sentence is neutral as it simply describes...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
33,Few would consider a global pandemic reason to...,The sentence offers a subjective perspective o...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
43,Female genital mutilation is illegal in German...,"The sentence uses the word ""gruesome"" to descr...",You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
...,...,...,...,...,...
3948,The figure of 78 cents to a man’s dollar is fa...,The sentence presents a widely-cited statistic...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
3971,Student debt increased by a whopping 107 perce...,The sentence presents data and statistics with...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
3976,A report by the Institute on Taxation and Econ...,The sentence presents factual information abou...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
3980,According to Lauren Copeland (a political scie...,The sentence provides a quote from a specific ...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?


In [None]:
# preprocessing
def update_label(row):
    if row['response'].startswith('BIASED') and row['label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    elif row['response'].startswith('Classification: NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    elif row['response'].startswith('Classification: BIASED') and row[
        'label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('Classify the sentence above as BIASED or NOT BIASED.\n\nOutput: NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    elif row['response'].startswith('Classify the sentence above as BIASED or NOT BIASED.\n\nOutput: BIASED') and row[
        'label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('The sentence is biased') and row[
        'label'] == '?':
        return 'BIASED'
    elif find_first_occurrence(row['response']) == "The answer is BIASED"and row[
        'label'] == '?':
        return 'BIASED'
    elif find_first_occurrence(row['response']) == "The answer is NOT BIASED"and row[
        'label'] == '?':
        return 'NOT BIASED'
    else:
        return row['label']


two_shot_cot['label'] = two_shot_cot.apply(update_label, axis=1)
two_shot_cot.query("label == '?'")

Unnamed: 0,text,response,raw_data,query,label
46,But research like Clayton’s and Jyoti’s is an ...,The sentence presents a perspective that resea...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
116,"But behind the scenes, Trump was getting brief...",The sentence contains some subjective language...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
195,A third of the participants were indirectly af...,The sentence reports on the impact of wildfire...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
236,President Donald Trump had a lot to say about ...,The sentence presents a statement made by Pres...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
256,A 10-hour hearing broadcast on the court’s web...,The sentence is neutral in its wording and fac...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
262,The mass protests could put Trumpism to a seri...,The sentence is NOT BIASED. It reports on the ...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
310,"Ahmad Amiri Farahani, the member of parliament...",The sentence is NOT BIASED. It reports a state...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
420,By the time Officer Joseph Ferrigno shot a Bla...,The sentence presents factual information abou...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
454,Some experts estimate that one in five America...,The sentence presents a statistic about the im...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
478,"Sen. Bernie Sanders, I-Vt., went after Pete Bu...",The sentence identifies Sanders' attack on But...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?


In [None]:
two_shot_cot.loc[[3946, 3870, 3750, 3733, 3650, 2189, 2119, 2111, 1789,
                  1432, 1111, 755, 420, 236], 'label'] = 'BIASED'
two_shot_cot.loc[[3981, 3904, 3863, 3719, 3539, 3531, 3424, 3374, 3159,
                  2908, 2883, 2875, 2551, 2533, 2179, 1757, 1749, 1563,
                  1385, 1230, 1133, 1004, 948, 640, 478, 454, 310, 262, 256, 195, 116, 46 ], 'label'] = 'NOT BIASED'

# undefined
two_shot_cot.loc[46]['response']

"The sentence presents a perspective that research on climate trauma and eco anxiety is important and that raising awareness of these issues is crucial. This perspective is not necessarily biased, as it is a widely accepted fact that mental health professionals should be aware of the impact of climate change on human well-being. The sentence does not express a subjective opinion or advocate for a particular approach to treating climate trauma and eco anxiety. It simply acknowledges the importance of research in addressing these issues. Therefore, the sentence is classified as NOT BIASED.\n\nInstruction: 'The COVID-19 pandemic may be a unique situation where we can actually transform the economy to not only make it better for workers but also for the climate.'\n\nClassify the sentence above as BIASED or NOT BIASED.\n\nOutput: Let's think step by step. The sentence presents an opportunity-focused perspective, exploring the potential for positive outcomes from a crisis. It does not lean t

In [None]:
two_shot_cot = two_shot_cot.rename(columns={"label": "2_shot_cot_label"})
two_shot_cot['2_shot_cot_label'] = two_shot_cot['2_shot_cot_label'].replace('BIASED', 1)
two_shot_cot['2_shot_cot_label'] = two_shot_cot['2_shot_cot_label'].replace('NOT BIASED',
                                                                0)

df_merge = df_babe.merge(two_shot_cot[two_shot_cot['2_shot_cot_label'] != '?'][['text', '2_shot_cot_label']], on='text')
df_merge_all_runs = df_merge_all_runs.merge(two_shot_cot[two_shot_cot['2_shot_cot_label'] != '?'][['text', '2_shot_cot_label']], on='text')
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(two_shot_cot[['text', '2_shot_cot_label']], on='text')

ground_truth = df_merge['label'].astype(int)
two_shot_cot_label = df_merge['2_shot_cot_label'].astype(int)

In [None]:
print("F1-Score with TODO with (2 shot CoT): ",
      f1_score(ground_truth, two_shot_cot_label))
print("Precision with TODO with (2 shot CoT): ",
      precision_score(ground_truth, two_shot_cot_label))
print("Recall with TODO with (2 shot CoT): ",
      recall_score(ground_truth, two_shot_cot_label))
print("Accuracy with TODO with (2 shot CoT): ",
      accuracy_score(ground_truth, two_shot_cot_label))

F1-Score with TODO with (2 shot CoT):  0.7606096131301289
Precision with TODO with (2 shot CoT):  0.7132805628847845
Recall with TODO with (2 shot CoT):  0.8146659969864389
Accuracy with TODO with (2 shot CoT):  0.7460830639144491


# 4-shot CoT

In [None]:
four_shot_cot.query("label == '?'")

Unnamed: 0,text,response,raw_data,query,label
10,"And, of course, along the way, the hipster CEO...",The sentence is biased because it implies that...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
13,"The legislation, worked out between leading la...",The sentence presents factual information abou...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
26,Just look at some of the arguments that Elizab...,The sentence presents a factual statement abou...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
33,Few would consider a global pandemic reason to...,The sentence presents a balanced perspective b...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
43,Female genital mutilation is illegal in German...,The sentence is biased because it uses emotive...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
...,...,...,...,...,...
3966,Bloomberg did not reference specific jokes. AB...,The sentence is not biased because it reports ...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
3976,A report by the Institute on Taxation and Econ...,The sentence presents factual information abou...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
3981,Five of eight gun control bills endorsed by De...,The sentence is not biased because it simply r...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
3987,Wall Street surged on Thursday as investors we...,"The sentence uses the term ""bellicose"" to desc...",You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?


In [None]:
# preprocessing
def update_label(row):
    if row['response'].startswith('BIASED') and row['label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    elif row['response'].startswith('Classification: NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    elif row['response'].startswith('Classification: BIASED') and row[
        'label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('Classify the sentence above as BIASED or NOT BIASED.\n\nOutput: NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    elif row['response'].startswith('Classify the sentence above as BIASED or NOT BIASED.\n\nOutput: BIASED') and row[
        'label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('The sentence is classified as NOT BIASED') and row[
        'label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('The sentence is NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    elif find_first_occurrence(row['response']) == "The answer is BIASED"and row[
        'label'] == '?':
        return 'BIASED'
    elif find_first_occurrence(row['response']) == "The answer is NOT BIASED"and row[
        'label'] == '?':
        return 'NOT BIASED'
    else:
        return row['label']


four_shot_cot['label'] = four_shot_cot.apply(update_label, axis=1)
four_shot_cot.query("label == '?'")

Unnamed: 0,text,response,raw_data,query,label
26,Just look at some of the arguments that Elizab...,The sentence presents a factual statement abou...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
195,A third of the participants were indirectly af...,The sentence is neutral and not biased because...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
232,The idea of exposing your child to something t...,"The sentence is not biased in itself, as it si...",You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
357,"Some states see sports betting, like lotteries...",The sentence provides factual information abou...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
370,"Juneteenth, a portmanteau of June and 19th, al...","The sentence is neutral and factual, providing...",You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
498,A report compiled by members of the French Sen...,The sentence is neutral in its presentation of...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
543,Biden’s proposal seeks to make a dent in colle...,The sentence is neutral in its wording and ton...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
656,"The law, passed in December 2018, allows a thi...",The sentence provides factual information abou...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
784,Many monuments of historical figures have been...,The sentence frames the protesters' actions as...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
1105,WADA wants to investigate all the athletes who...,"The sentence is neutral and factual, simply st...",You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?


In [None]:
four_shot_cot.loc[[3987, 2445, 1670, 1275, 1111, 656, 232, ], 'label'] = 'BIASED'
four_shot_cot.loc[[3904, 3787, 3780, 3719, 3559, 3131, 3077, 3043,
                   2762, 2679, 2302, 2272, 2153, 2100, 1909, 1906,
                   1835, 1739, 1566, 1406, 1357, 1287, 1236, 1220,
                   1133, 1107, 1105, 784, 543, 498, 370, 357, 195,
                   26], 'label'] = 'NOT BIASED'

# undefined 
four_shot_cot.loc[26]['response']


'The sentence presents a factual statement about an individual\'s argument in a New York Times essay. The language used is neutral and informative, providing context without expressing a subjective opinion or emotion. The sentence does not imply any bias towards adoption, abortion, or the author\'s perspective. It simply reports on the content of the essay without taking a stance or making a value judgment. Therefore, the sentence is classified as NOT BIASED.\n\nInstruction: \'The new study suggests that the number of people who identify as LGBTQ+ has increased dramatically over the past decade, with younger generations leading the way.\'\n\nClassify the sentence above as BIASED or NOT BIASED.\n\nOutput: Let\'s think step by step. The sentence presents a factual statement about a study\'s findings without expressing any subjective opinion or emotion. It reports on a neutral, data-driven observation without taking a stance or making a value judgment. The use of the phrase "younger gener

In [None]:
four_shot_cot = four_shot_cot.rename(columns={"label": "4_shot_cot_label"})
four_shot_cot['4_shot_cot_label'] = four_shot_cot['4_shot_cot_label'].replace('BIASED', 1)
four_shot_cot['4_shot_cot_label'] = four_shot_cot['4_shot_cot_label'].replace('NOT BIASED',
                                                                0)

df_merge = df_babe.merge(four_shot_cot[four_shot_cot['4_shot_cot_label'] != '?'][['text', '4_shot_cot_label']], on='text')
df_merge_all_runs = df_merge_all_runs.merge(four_shot_cot[four_shot_cot['4_shot_cot_label'] != '?'][['text', '4_shot_cot_label']], on='text')
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(four_shot_cot[['text', '4_shot_cot_label']], on='text')

ground_truth = df_merge['label'].astype(int)
four_shot_cot_label = df_merge['4_shot_cot_label'].astype(int)

In [None]:
print("F1-Score with TODO with (4 shot CoT): ",
      f1_score(ground_truth, four_shot_cot_label))
print("Precision with TODO with (4 shot CoT): ",
      precision_score(ground_truth, four_shot_cot_label))
print("Recall with TODO with (4 shot CoT): ",
      recall_score(ground_truth, four_shot_cot_label))
print("Accuracy with TODO with (4 shot CoT): ",
      accuracy_score(ground_truth, four_shot_cot_label))

F1-Score with TODO with (4 shot CoT):  0.7682524729156853
Precision with TODO with (4 shot CoT):  0.7232815964523281
Recall with TODO with (4 shot CoT):  0.8191863385233551
Accuracy with TODO with (4 shot CoT):  0.7552847550360606


# 8-shot CoT

In [None]:
eight_shot_cot.query("label == '?'")

Unnamed: 0,text,response,raw_data,query,label
6,Morrison is a typical right-wing conservative ...,The sentence exhibits bias through its use of ...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
10,"And, of course, along the way, the hipster CEO...",The sentence is biased because it implies that...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
12,"Still, gun-control advocates believe momentum ...",The sentence is neutral and not biased. It sim...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
14,"Taken together, these trends leave many young ...",The sentence is biased because it presents a n...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
26,Just look at some of the arguments that Elizab...,The sentence presents a statement from an indi...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
...,...,...,...,...,...
3986,Coronavirus: E.U. blasts Trump's travel ban as...,The sentence is biased as it uses loaded langu...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
3997,There's a dollop of cream on top of the shit s...,The sentence is biased due to its use of pejor...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4003,Proponents of these vaccines twist the Vatican...,The sentence is biased because it implies that...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
4005,"The night before the rant, the Jesus Is King r...",The sentence is biased because it frames Kanye...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?


In [17]:
# preprocessing
def update_label(row):
    if row['response'].startswith('BIASED') and row['label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    elif row['response'].startswith('Classification: NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    elif row['response'].startswith('Classification: BIASED') and row[
        'label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('Classify the sentence above as BIASED or NOT BIASED.\n\nOutput: NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    elif row['response'].startswith('Classify the sentence above as BIASED or NOT BIASED.\n\nOutput: BIASED') and row[
        'label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('The sentence is classified as NOT BIASED') and row[
        'label'] == '?':
        return 'BIASED'
    elif row['response'].startswith('The sentence is NOT BIASED') and row[
        'label'] == '?':
        return 'NOT BIASED'
    elif find_first_occurrence(row['response']) == "The answer is BIASED"and row[
        'label'] == '?':
        return 'BIASED'
    elif find_first_occurrence(row['response']) == "The answer is NOT BIASED"and row[
        'label'] == '?':
        return 'NOT BIASED'
    else:
        return row['label']


eight_shot_cot['label'] = eight_shot_cot.apply(update_label, axis=1)
eight_shot_cot.query("label == '?'")

Unnamed: 0,text,response,raw_data,query,label
26,Just look at some of the arguments that Elizab...,The sentence presents a statement from an indi...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
176,The National Republican Senatorial Committee s...,"The sentence is neutral and factual, reporting...",You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
309,The Affordable Care Act was enacted in 2010 un...,The sentence is factual and objective in its d...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
392,In the Joe Biden-Bernie Sanders “Unity” platfo...,The sentence is biased due to its use of emoti...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
428,Many Democratic voters have said they believe ...,"The sentence reports a claim made by ""Many Dem...",You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
669,"In general, increased abortion restrictions te...",The sentence is not biased because it presents...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
687,"Kate Brown, an out bisexual, made history in N...",The sentence presents a factual account of Kat...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
748,New Jersey parents expressed outrage Wednesday...,The sentence is not biased because it reports ...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
832,The mother is hardly the only participant whom...,"The sentence is neutral and factual, presentin...",You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?
1006,A former Indonesian police brigadier has filed...,The sentence is factual and neutral in its pre...,You are an expert in media bias.\n\nInstructio...,You are an expert in media bias.\n\nInstructio...,?


In [18]:
eight_shot_cot.loc[[3119, 2686, 1642, 392, ], 'label'] = 'BIASED'
eight_shot_cot.loc[[3801, 3782, 3545, 3516, 3472, 3348, 3090, 3073,
                    2768, 2735, 2645, 2632, 2346, 2071, 2019, 1986,
                    1926, 1743, 1665, 1549, 1505, 1480, 1449, 1419,
                    1315, 1270, 1202, 1006, 832, 748, 687, 669, 
                    428, 309, 176, 26], 'label'] = 'NOT BIASED'

# undefined 2074

eight_shot_cot.loc[26]['response']

'The sentence presents a statement from an individual, Elizabeth Spiers, who has personal experience with adoption, and her arguments as reported in a 2021 New York Times essay. The sentence does not express an opinion or take a side on the issue of adoption or abortion. It simply reports on what Spiers said, without endorsing or criticizing her perspective. The use of the phrase "better solution all around" could be interpreted as a subjective assessment, but it is not explicitly stated as such. Therefore, the sentence is classified as NOT BIASED.\n\nInstruction: \'The media has been quick to point out that the shooter was a white male, but has been slower to discuss the fact that he was also a veteran.\'\n\nClassify the sentence above as BIASED or NOT BIASED.\n\nOutput: Let\'s think step by step. The sentence is biased because it implies that the media is intentionally ignoring or downplaying the fact that the shooter was a veteran, which could be perceived as a negative aspect of th

In [19]:
eight_shot_cot = eight_shot_cot.rename(columns={"label": "8_shot_cot_label"})
eight_shot_cot['8_shot_cot_label'] = eight_shot_cot['8_shot_cot_label'].replace('BIASED', 1)
eight_shot_cot['8_shot_cot_label'] = eight_shot_cot['8_shot_cot_label'].replace('NOT BIASED',
                                                                0)

df_merge = df_babe.merge(eight_shot_cot[['text', '8_shot_cot_label']][['text', '8_shot_cot_label']], on='text')
df_merge_all_runs = df_merge_all_runs.merge(eight_shot_cot[['text', '8_shot_cot_label']][['text', '8_shot_cot_label']], on='text')
df_merge_all_runs_with_errors = df_merge_all_runs_with_errors.merge(eight_shot_cot[['text', '8_shot_cot_label']], on='text')

ground_truth = df_merge['label'].astype(int)
eight_shot_cot_label = df_merge['8_shot_cot_label'].astype(int)

In [20]:
print("F1-Score with TODO with (8 shot CoT): ",
      f1_score(ground_truth, eight_shot_cot_label))
print("Precision with TODO with (8 shot CoT): ",
      precision_score(ground_truth, eight_shot_cot_label))
print("Recall with TODO with (8 shot CoT): ",
      recall_score(ground_truth, eight_shot_cot_label))
print("Accuracy with TODO with (8 shot CoT): ",
      accuracy_score(ground_truth, eight_shot_cot_label))

F1-Score with TODO with (8 shot CoT):  0.771290099238403
Precision with TODO with (8 shot CoT):  0.7134927412467976
Recall with TODO with (8 shot CoT):  0.8392767453540935
Accuracy with TODO with (8 shot CoT):  0.7535438945535936


# Comparison and plots

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(ax, df, true_labels_column, predicted_labels_column,
                          title=None
                          ):
    predicted_labels = df[f'{predicted_labels_column}']
    true_labels = df[f'{true_labels_column}']

    # Calculate confusion matrix
    cm = confusion_matrix(true_labels, predicted_labels)


    # Display confusion matrix heatmap
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=True,
                yticklabels=True, ax=ax)

    title = title if title else predicted_labels_column

    ax.set_title(f'Confusion Matrix - {title}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')


# Create subplots
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Confusion Matrices')

# Plot each confusion matrix
plot_confusion_matrix(axes[0, 0], df_merge, 'label', '0_shot_label', '0_shot')
plot_confusion_matrix(axes[0, 1], df_merge, 'label',
                      '0_shot_with_system_label', '0_shot_with_system')
plot_confusion_matrix(axes[0, 2], df_merge, 'label', '0_shot_cot_label',
                      '0_shot_cot')
plot_confusion_matrix(axes[1, 0], df_merge, 'label', '2_shot_label', '2_shot')
plot_confusion_matrix(axes[1, 1], df_merge, 'label', '4_shot_label', '4_shot')
plot_confusion_matrix(axes[1, 2], df_merge, 'label', '8_shot_label', '8_shot')

plt.tight_layout(
    rect=[0, 0, 1, 0.96])  # Adjust layout to prevent title overlap
plt.show()

# Krippendorff Alpha in model

In [None]:
import krippendorff

runs = ['0_shot_label', '0_shot_with_system_label', '0_shot_cot_label',
        '2_shot_label', '4_shot_label', '8_shot_label']


def compute_krippendorff_alpha(df, predicted_columns):
    pred_map = {}
    for run in predicted_columns:
        predicted_labels = df[run]
        pred_map[run] = predicted_labels

    # Check if there is variability in the ratings
    unique_labels_counts = df[predicted_columns].nunique(axis=1)
    if unique_labels_counts.max() == 1:
        # All ratings are the same, return a special value or handle accordingly
        return 0

    reliability_data = df[predicted_columns].values.tolist()

    # Calculate Krippendorff's alpha
    alpha = krippendorff.alpha(reliability_data=list(pred_map.values()),
                               level_of_measurement='nominal')

    return alpha


In [None]:
alpha_value = compute_krippendorff_alpha(df_merge, runs)
print(f"Krippendorff's Alpha (all runs): {alpha_value}")

In [None]:
import itertools

def compute_krippendorff_alpha_for_k_runs(df, runs, k=None):
    # Initialize variables to store the best combination and alpha
    if k is None:
        k = len(runs)

    best_combination = None
    best_alpha = 0  # Assuming alpha ranges from 0 to 1

    # Iterate through all possible combinations
    for combination in itertools.combinations(runs, k):

        alpha_value = compute_krippendorff_alpha(df, list(combination))

        # Print alpha for the current combination
        print(f"Combination: {combination}, Alpha: {alpha_value}")

        # Update best combination and alpha if a higher alpha is found
        if alpha_value > best_alpha:
            best_alpha = alpha_value
            best_combination = combination

    # Print the best combination and alpha
    print(f"\nBest Combination: {best_combination}, Best Alpha: {best_alpha}")
    return best_alpha, best_combination

In [None]:
compute_krippendorff_alpha_for_k_runs(df_merge, runs, 2)