## Import Modules & Data


In [None]:
!pip install haystack-ai

In [2]:
import pandas as pd
from tqdm import tqdm
import time
import logging
from google.colab import userdata
from nltk.tokenize import sent_tokenize
import numpy as np
from collections import defaultdict

from haystack import Pipeline
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders import PromptBuilder
from haystack.components.generators import HuggingFaceAPIGenerator
from haystack.utils import Secret

In [None]:
# Load the manually labelled articles and normalise the label column in a
# single chain:
#   * the first data row (index 0) is discarded and the index renumbered,
#   * 'relevance' becomes 'manual_label', lower-cased,
#   * the label 'discarded' is folded into 'not relevant'.
df = (
    pd.read_csv('manually_labelled.csv', header=0)
    .drop(index=0)
    .reset_index(drop=True)
    .rename(columns={'relevance': 'manual_label'})
    .assign(
        manual_label=lambda frame: frame['manual_label']
        .str.lower()
        .replace('discarded', 'not relevant')
    )
)

# Keep only the articles a human judged relevant.
df = df[df['manual_label'].eq('relevant')]
print('N of relevant articles that have bias definitions based on human evaluation:', len(df))

In [None]:
def _word_count(text):
    """Return the number of whitespace-separated tokens in `text`.

    Non-string values (e.g. a missing body read as NaN) count as 0 words.
    """
    if not isinstance(text, str):
        return 0
    return len(text.split())


# Article length in words, stored alongside each article body.
df['n_words'] = df['body_text'].map(_word_count)


# General Configurations



In [5]:
# Template used for definition extraction. It exposes two template variables:
#   {{task_mode}}    - the mode-specific instruction (a value of `mode_instructions`)
#   {{article_text}} - the article body to extract definitions from
extraction_prompt = """
In this task, your goal is to extract the media bias related definitions from the provided article text. You must identify a definition, whether it is explicit or implicit. Focus on introductory chapters, methodology, or sections discussing theoretical frameworks, as these areas are most likely to contain definitions. If explicit definitions are present, extract them directly. If explicit definitions are not found, you are required to infer the bias-related definition from the context. Ensure the definition is short, precise, consistent, and non-circular (avoid circular reasoning; the term being defined should not be used in the definition).
\nTask: {{task_mode}}
\nArticle Text: {{article_text}}
\nDefinitions (Ensure a definition is extracted):
"""

# One instruction per prompting strategy. Long texts are written as adjacent
# string literals, which Python joins into a single string.
mode_instructions = {
    'zero_shot': (
        "Rely solely on your own understanding of media bias to extract definitions from the text."
    ),
    'contextual_casual': (
        "Consider the broader context of media bias when making your determination. "
        "Often, the same news is presented in different ways. "
        "Sometimes wordings differ, other times only a certain part of the information is presented. "
        "Depending on wording and selection, news can carry more than just factual information, "
        "for example, opinions or ideologies. "
        "This is called Media Bias"
    ),
    'contextual_academic': (
        "Consider the broader context of media bias when making your determination. "
        "In the academic field, media bias is defined as  "
        "'slanted news coverage, can strongly impact the public perception of the reported topics'"
    ),
    'cot': (
        "Analyze the content step-by-step. "
        "First, search for explicit definitions in the relevant sections. "
        "If none are found, infer implicit definitions from the context. "
        "List all relevant definitions clearly."
    ),
    'role': (
        "You are an expert in media bias studies. "
        "Use your knowledge and expertise to identify and extract explicit definitions "
        "of critical terms such as 'bias'."
    ),
}

# Strategies to run, in the order they are declared above.
modes = list(mode_instructions)


# Initialize & run

In [None]:
pip install anthropic-haystack
from haystack_integrations.components.generators.anthropic import AnthropicGenerator

In [None]:
# Build the two-step extraction pipeline: prompt rendering -> Claude generation.
p_claude = Pipeline()

# Step 1: renders `extraction_prompt`. The component is registered under the
# name "relevance_prompt"; inputs passed to `p_claude.run` must be keyed by
# that exact name.
prompt_builder = PromptBuilder(template=extraction_prompt)
p_claude.add_component("relevance_prompt", prompt_builder)

# Step 2: the Anthropic generator, authenticated with the key stored in the
# Colab secrets under 'claude_api_key'.
claude_api_key = userdata.get('claude_api_key')
claude_generator = AnthropicGenerator(
    api_key=Secret.from_token(claude_api_key),
    model="claude-3-sonnet-20240229",
)
p_claude.add_component("claude", claude_generator)

# Feed the rendered prompt into the generator.
p_claude.connect("relevance_prompt", "claude")

In [8]:
def extract_definitions_chatgpt(mode, body_text, delay_seconds=3):
  """Extract media-bias definitions from one article with the Claude pipeline.

  Despite the historical name, the request is served by `p_claude`.

  Args:
    mode: Key of `mode_instructions` selecting the prompting strategy.
    body_text: Article text inserted into the prompt.
    delay_seconds: Pause after a successful call to throttle consecutive
      requests. Defaults to 3, the value previously hard-coded.

  Returns:
    A one-element list holding the first reply of the generator.

  Raises:
    KeyError: If `mode` is not a key of `mode_instructions`.
    Exception: Whatever the pipeline raises is propagated to the caller.
  """
  instruction = mode_instructions[mode]
  # Run inputs are addressed by the name the component was registered under
  # in the pipeline ("relevance_prompt"); any other key makes `run` fail.
  prediction = p_claude.run(
      {"relevance_prompt": {"task_mode": instruction, "article_text": body_text}}
  )
  result = [prediction['claude']['replies'][0]]
  time.sleep(delay_seconds)

  return result

In [None]:
# Rows before this position are skipped.
# NOTE(review): presumably they were processed in an earlier run - confirm.
START_ROW = 50
rows_to_process = df.iloc[START_ROW:]

# Per-mode lists of predictions plus the paper ids they belong to ('id_check').
# Original author's timing note: about 40 sec per row.
definitions_results = defaultdict(list)

log_data = {
    'errors': [],
    'info': []
}

# `total` is derived from the slice so the progress bar is right for any input size.
for index, row in tqdm(rows_to_process.iterrows(), total=len(rows_to_process)):
    definitions_results['id_check'].append(row['paperid'])

    for mode in modes:
        try:
            prediction = extract_definitions_chatgpt(mode, row['body_text'])
        except Exception as e:
            # Best effort: record the failure and continue with the next mode/row
            # so one failed request does not abort the whole run.
            print(f'errors started, {e}')
            prediction = ['error']
            log_data['errors'].append(f"Error at index {index}, mode {mode}: {e}")

        definitions_results[f'{mode}'].append(prediction)

# Only report success when no call failed.
if log_data['errors']:
    log_data['info'].append(f"Experiment completed with {len(log_data['errors'])} error(s)")
else:
    log_data['info'].append('Experiment completed successfully')

In [None]:
# Attach the extracted definitions to `df`, one column per mode.
# The result lists only cover the rows that were actually processed, so they
# are aligned through the recorded paper ids instead of being assigned
# positionally (which raises a length-mismatch error when rows were skipped).
# Rows without a result get None.
# NOTE(review): assumes `paperid` values are unique - confirm.
processed_ids = definitions_results['id_check']

for mode in modes:
    result_by_id = dict(zip(processed_ids, definitions_results[mode]))
    df[f'{mode}_definition'] = [result_by_id.get(paper_id) for paper_id in df['paperid']]

In [17]:
# Write the articles together with their extracted definitions to disk;
# the DataFrame index is written as the first column.
df.to_csv(path_or_buf='extracted_definitions.csv', index=True)