# Classification of relevant papers
- Goal: Use various LLMs to iterate over the title and abstract of academic papers and determine whether each paper is relevant to media bias or not (relevant vs. not relevant).

# 1. Import Modules & Data
Haystack is an open source framework for building production-ready LLM applications, retrieval-augmented generative pipelines and state-of-the-art search systems that work intelligently over large document collections. It lets you quickly try out the latest AI models while being flexible and easy to use.

https://haystack.deepset.ai/



In [None]:
!pip install haystack-ai

In [2]:
import pandas as pd
from tqdm import tqdm
import time
import logging
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from google.colab import userdata
from collections import defaultdict

from haystack import Pipeline
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders import PromptBuilder
from haystack.components.generators import HuggingFaceAPIGenerator
from haystack.utils import Secret

In [None]:
# Data loading: read the manually labelled papers and normalise the label column
df = (
    pd.read_csv('/content/drive/MyDrive/Thesis/manually_labelled.csv', header=0)
    # Drop the row labelled 0 (the first data row after the header) and renumber.
    # NOTE(review): presumably that row is not a real paper -- confirm against the CSV.
    .drop(index=0)
    .reset_index(drop=True)
    .rename(columns={'relevance': 'manual_label'})
)
# Lower-case the labels, then fold 'discarded' into 'not relevant'
df['manual_label'] = df['manual_label'].str.lower().replace('discarded', 'not relevant')
print(len(df))
df.head(3)

# 2. Prompting Techniques

In [4]:
# Load the previously saved "diverse" few-shot examples (semicolon-separated file)
diverse_examples_df = pd.read_csv('diverse_examples.csv', delimiter = ';')

# One "Title/Abstract/Label" line per example, joined with newlines, for use in prompts
diverse_texts = "\n".join(
    f"Title: {title}, Abstract: {abstract}, Label: {label}"
    for title, abstract, label in zip(
        diverse_examples_df['title'],
        diverse_examples_df['abstract'],
        diverse_examples_df['manual_label'],
    )
)

diverse_examples_df

Unnamed: 0,paperId,title,abstract,manual_label,cluster
0,c84a169e6df175c4662012d3ba7dbf8fa1b5abc9,Fake news’ is the invention of a liar: How fal...,Alarmed by the oversimplifications related to ...,relevant,0
1,667b43c8adad628c60c20810197cdeec6679714e,Deciphering the biology of Mycobacterium tuber...,Countless millions of people have died from tu...,not relevant,1
2,70c93e5e38a8176590f69c0491fd63ab2a9e67c4,Confirmation Bias: A Ubiquitous Phenomenon in ...,"Confirmation bias, as the term is typically us...",relevant,2
3,8293d31675a21a99bcb7785ccfe670f95fce1557,Psychological research and global climate change.,Human behaviour is integral not only to causin...,not relevant,3


In [5]:
# Prompt template handed to PromptBuilder; {{query}}, {{title}} and {{abstract}}
# are the placeholders filled in on every pipeline run.
# Fixes: "\Instance" / "\End" were invalid escape sequences (typos for "\n") that
# put literal backslashes into the prompt, and the closing instruction was wrapped
# in leftover "= '...'" assignment syntax.
prompt_template = """
    In this task, your goal is to determine whether a given article is relevant to the field of media bias based on the article title and abstract.
    \nTask: {{query}}
    \nInstance: {{title}}, {{abstract}}
    \nEnd: Now, with these instructions in mind and the given text, please reply with one of the following options: 1) relevant; 2) not relevant; 3) can't say (use this option as least as possible).
    \nAnswer:
    """

# Mode instructions (include this as different query, depending on the mode)
# Maps each prompting mode to the text that is passed as {{query}} in the prompt template.
# The two *_diverse modes embed `diverse_texts` (the saved diverse examples) at
# definition time via f-strings, so `diverse_texts` must exist before this cell runs.
# NOTE(review): both *_diverse strings contain a stray "'" right before "\nRely" --
# looks like a leftover quote; confirm whether it is intended in the prompt.
mode_instructions = {
    'zero_shot': 'You have no prior examples to guide you. Rely solely on your understanding of media bias.',
    # contextual_casual_similar and contextual_academic_similar are created in another flow
    'contextual_casual_diverse': f"""Consider the broader context of media bias when making your determination. Often, the same news is presented in different ways. Sometimes wordings differ, other times only a certain part of the information is presented. Depending on wording and selection, news can carry more than just factual information, for example, opinions or ideologies. This is called Media Bias.'\nRely on the following examples:{diverse_texts}""",
    'contextual_academic_diverse': f"""Consider the broader context of media bias when making your determination. In the academic field, media bias is defined as 'slanted news coverage, can strongly impact the public perception of the reported topics'.'\nRely on the following examples:{diverse_texts}""",
    'cot': 'Think step-by-step through the content to determine its relevance to media bias. First, analyze the title for keywords related to media bias, then, review the abstract. Use a structured approach to decide relevance.',
    'role': 'You are an expert in media bias studies. Use your knowledge and expertise to determine the relevance of a paper to the field of media bias studies.',
    'emotional': 'Are you sure that’s your final answer? Believe in your abilities and strive for excellence. Your hard work will yield remarkable results!'
}

# Full mode list including the two *_similar modes (built per paper by contextual_query_similar):
#modes = ['zero_shot', 'contextual_casual_similar', 'contextual_academic_similar', 'contextual_casual_diverse', 'contextual_academic_diverse', 'cot', 'role', 'emotional']
# Modes actually iterated by the experiment loops below (the *_similar modes are left out)
modes = ['zero_shot', 'contextual_casual_diverse', 'contextual_academic_diverse', 'cot', 'role', 'emotional']


In [6]:
def contextual_query_similar(title, abstract, df, embeddings, exclude_index, k):
    """Build the two "similar examples" instructions for one paper.

    The paper's title and abstract are joined into a query, the nearest
    labelled examples are fetched with ``find_similar_examples`` and rendered
    as "Title/Abstract/Label" lines, and those lines are appended to the
    casual and the academic definition of media bias.

    Args:
        title: Title of the paper being classified.
        abstract: Abstract of the paper being classified.
        df: DataFrame the examples are taken from; its rows must provide
            'title', 'abstract' and 'manual_label'.
        embeddings: Embeddings passed through to ``find_similar_examples``.
        exclude_index: Index passed through to ``find_similar_examples`` so the
            paper itself can be left out of its own examples.
        k: Number of examples requested.

    Returns:
        Tuple ``(contextual_casual_prompt, contextual_academic_prompt)``.
    """
    neighbours = find_similar_examples(
        "{} {}".format(title, abstract), k, df, embeddings, exclude_index
    )

    example_lines = []
    for _, example in neighbours.iterrows():
        example_lines.append(
            f"Title: {example['title']}, Abstract: {example['abstract']}, Label: {example['manual_label']}"
        )
    examples_block = "\n".join(example_lines)

    casual = f"""
    Consider the broader context of media bias when making your determination. Often, the same news is presented in different ways. Sometimes wordings differ, other times only a certain part of the information is presented. Depending on wording and selection, news can carry more than just factual information, for example, opinions or ideologies. This is called Media Bias.',
    \nRely on the following examples:{examples_block}
    """
    academic = f"""
    Consider the broader context of media bias when making your determination. In the academic field, media bias is defined as 'slanted news coverage, can strongly impact the public perception of the reported topics',
    \nRely on the following examples:{examples_block}
    """

    return casual, academic


## 2.1 Diversity & Similarity Principles for In-Context Active Learning

In [None]:
pip install -U sentence-transformers

In [8]:
from sentence_transformers import SentenceTransformer, util
import torch

In [None]:
# Load the pre-trained embedding model through the sentence-transformers wrapper.
# NOTE(review): the checkpoint loaded is 'BAAI/llm-embedder'; the original comment
# called it a "Sentence-BERT model" -- confirm which model is intended.
model = SentenceTransformer('BAAI/llm-embedder')

In [None]:
# Compute embeddings for titles and abstracts
# (original timing note: about 10 mins for the manually labelled papers)

def compute_embeddings(df, text_column):
    """Encode every value of ``df[text_column]`` with the module-level ``model``.

    Args:
        df: DataFrame holding the texts.
        text_column: Name of the column whose values are encoded.

    Returns:
        Whatever ``model.encode`` returns for ``convert_to_tensor=True``
        (a progress bar is shown while encoding).
    """
    return model.encode(
        df[text_column].tolist(),
        convert_to_tensor=True,
        show_progress_bar=True,
    )

# Join title and abstract (separated by one space) into a single text per paper,
# then embed that text; `embeddings` is reused by the similarity and clustering cells.
# NOTE(review): presumably a missing title or abstract makes the combined value NaN
# for that row -- confirm the data has no missing values before encoding.
df['combined_text'] = df['title'] + ' ' + df['abstract']
embeddings = compute_embeddings(df, 'combined_text')

In [11]:
# Function to find & use the most similar examples

def find_similar_examples(query, k, df, embeddings, exclude_index):
    """Return up to ``k`` rows of ``df`` most similar to ``query``.

    The query is embedded with the module-level ``model`` and compared to
    ``embeddings`` by cosine similarity. Rows are returned in descending
    similarity order.

    Args:
        query: Text to find neighbours for.
        k: Number of rows wanted.
        df: DataFrame whose row positions correspond to ``embeddings``.
        embeddings: Pre-computed embeddings of the rows of ``df``.
        exclude_index: Row position to leave out (typically the query paper
            itself). NOTE(review): it is compared against positional indices,
            so callers passing an index label rely on ``df`` having a default
            0..n-1 index -- confirm.

    Returns:
        ``df.iloc[...]`` slice with at most ``k`` rows; fewer when ``df`` does
        not hold enough other rows.
    """
    query_embedding = model.encode(query, convert_to_tensor=True, show_progress_bar=False)
    cos_similarities = util.pytorch_cos_sim(query_embedding, embeddings)

    # Request one extra neighbour so that k remain after dropping exclude_index,
    # but never more than there are candidates: torch.topk raises otherwise.
    n_candidates = min(k + 1, cos_similarities.shape[-1])
    ranked_indices = torch.topk(cos_similarities, n_candidates).indices[0].tolist()

    # Drop the excluded row if present, then keep the k best of what is left.
    top_k_indices = [i for i in ranked_indices if i != exclude_index][:k]

    return df.iloc[top_k_indices]

#### Diverse examples
- Already saved, no need to run again

In [None]:
# Function to find the optimal number of clusters using the elbow method

def find_optimal_k(embeddings, max_k=10):
    """Plot the KMeans inertia for every cluster count from 1 to ``max_k``.

    The function only draws the elbow plot; it returns nothing, so the
    cluster count is read off the figure by eye.

    Args:
        embeddings: Tensor of embeddings; moved to CPU and converted to NumPy
            before each fit.
        max_k: Largest number of clusters to try (inclusive).
    """
    cluster_counts = range(1, max_k + 1)
    # inertia_ is the sum of squared distances of samples to their cluster centre
    sse = [
        KMeans(n_clusters=k, random_state=0).fit(embeddings.cpu().numpy()).inertia_
        for k in cluster_counts
    ]

    plt.figure(figsize=(10, 5))
    plt.plot(cluster_counts, sse, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Sum of squared distances')
    plt.title('Elbow Method for Optimal k')
    plt.show()

# Find the optimal k
find_optimal_k(embeddings)

In [None]:
# Cluster the paper embeddings into 3 groups and store each paper's cluster id.
# The tensor is moved to CPU and converted to NumPy first, exactly as
# find_optimal_k does: scikit-learn cannot consume a tensor that lives on the GPU.
kmeans = KMeans(n_clusters=3, random_state=17).fit(embeddings.cpu().numpy())
df['cluster'] = kmeans.labels_
# Label distribution inside each cluster
df.groupby('cluster')['manual_label'].value_counts()

In [None]:
# Draw one paper per cluster to serve as the "diverse" few-shot examples.
# A fixed random_state makes the draw repeatable across notebook runs
# (the unseeded version picked different examples on every execution).
diverse_examples = df.groupby('cluster').apply(lambda x: x.sample(1, random_state=17)).reset_index(drop=True)
diverse_examples

In [None]:
# Save the examples sampled in the previous cell. The original wrote
# `diverse_examples_df` (the frame loaded from this very file) instead of the new
# sample, and used ',' although the loading cell reads the file with delimiter=';'.
diverse_examples.to_csv('diverse_examples.csv', sep=';', index=False)
diverse_examples

# 3. Running the experiments

## Chat GPT 3.5
- Status: works, outputs are clean
- Limits: 500 RPM, 10,000 RPD (i.e. roughly 400 rows per day max)

In [None]:
# Initialize components
# The OpenAI key is read from the Colab user secrets and wrapped in a haystack Secret.
openai_api_key = userdata.get('personal_OPENAI_key')
# Prompt builder that fills query/title/abstract into prompt_template
prompt_builder = PromptBuilder(template=prompt_template)
chatgpt_generator = OpenAIGenerator(api_key=Secret.from_token(openai_api_key), model="gpt-3.5-turbo")

# Create pipeline
# Two components: "relevance_prompt" (prompt builder) connected to "chatgpt" (generator)
p_chatgpt = Pipeline()
p_chatgpt.add_component("relevance_prompt", prompt_builder)
p_chatgpt.add_component("chatgpt", chatgpt_generator)
p_chatgpt.connect("relevance_prompt", "chatgpt")

In [None]:
def relevancy_classification_chatgpt(title, abstract, mode, df, exclude_index, k=3):
    """Classify one paper with the ChatGPT pipeline, three times in a row.

    Args:
        title: Paper title inserted into the prompt.
        abstract: Paper abstract inserted into the prompt.
        mode: Prompting mode. The two ``*_similar`` modes build their
            instruction from similar examples; any other mode is looked up in
            ``mode_instructions``.
        df: DataFrame the similar examples are taken from.
        exclude_index: Index of the paper itself, passed on so it can be left
            out of its own examples.
        k: Number of similar examples for the ``*_similar`` modes.

    Returns:
        List with the first reply of each of the three runs.

    Raises:
        KeyError: If ``mode`` is neither a ``*_similar`` mode nor a key of
            ``mode_instructions``.
    """
    similar_modes = ('contextual_casual_similar', 'contextual_academic_similar')
    if mode not in similar_modes:
        query = mode_instructions[mode]
    else:
        casual_prompt, academic_prompt = contextual_query_similar(title, abstract, df, embeddings, exclude_index, k)
        query = casual_prompt if mode == 'contextual_casual_similar' else academic_prompt

    replies = []
    run_count = 3
    while len(replies) < run_count:
        output = p_chatgpt.run({"relevance_prompt": {"query": query, "title": title, "abstract": abstract}})
        replies.append(output['chatgpt']['replies'][0])
        # short pause between consecutive API calls
        time.sleep(0.5)

    return replies

In [None]:
# Per-column result lists: 'id_check' plus '<mode>_run1..3' for every mode
results_chatgpt = defaultdict(list) # 25 sec/iteration

log_data_chatgpt = {'errors': [], 'info': []}

for index, row in tqdm(df.iloc[0:2165].iterrows(), total=2165):
    # Keep the paper id next to the predictions so row alignment can be checked later
    results_chatgpt['id_check'].append(row['paperid'])

    for mode in modes:
        try:
            predictions = relevancy_classification_chatgpt(row['title'], row['abstract'], mode, df, exclude_index=index)
        except Exception as e:
            # Any failure is recorded as three 'error' placeholders so all lists stay aligned
            print(f'errors started, {e}')
            predictions = ['error'] * 3
            log_data_chatgpt['errors'].append(f"Haystack Error at index {index}, mode {mode}: {e}")

        for run_number in (1, 2, 3):
            results_chatgpt[f'{mode}_run{run_number}'].append(predictions[run_number - 1])

# Appended once the loop has finished, whether or not individual calls failed
log_data_chatgpt['info'].append('Experiment completed successfully')

In [None]:
# Copy the collected predictions into df as 'chatgpt_<mode>_run1..3' columns
# (results_chatgpt['id_check'] is not written to df here)
for mode in modes:
    for run_tag in ('run1', 'run2', 'run3'):
        df[f'chatgpt_{mode}_{run_tag}'] = results_chatgpt[f'{mode}_{run_tag}']

df

In [None]:
# Write df (now including the chatgpt_* prediction columns) to CSV without the index
df.to_csv('chatgpt_relevancy.csv', index=False)

## Mistral-7B

In [None]:
# Credentials and endpoint URL are read from the Colab user secrets
hf_api_token = userdata.get('HF_TOKEN') # read token is needed here
mistral7_endpoint_url = userdata.get('mistral7_url')
# Separate prompt builder instance for the Mistral pipeline (same template)
prompt_builder = PromptBuilder(template=prompt_template)

# Generator that calls the Mistral inference endpoint; replies are capped at 50 new tokens
mistral_generator = HuggingFaceAPIGenerator(
    api_type="inference_endpoints",
    api_params={"url": mistral7_endpoint_url},
    token=Secret.from_token(hf_api_token),
    generation_kwargs={"max_new_tokens": 50}
)

In [None]:
# Mistral pipeline: "relevance_prompt" (prompt builder) connected to "mistral" (generator)
p_mistral = Pipeline()
p_mistral.add_component("relevance_prompt", prompt_builder)
p_mistral.add_component("mistral", mistral_generator)
p_mistral.connect("relevance_prompt", "mistral")

In [None]:
def relevancy_classification_mistral(title, abstract, mode, df, exclude_index, k=3):
    """Classify one paper with the Mistral pipeline, three times in a row.

    Args:
        title: Paper title inserted into the prompt.
        abstract: Paper abstract inserted into the prompt.
        mode: Prompting mode. The two ``*_similar`` modes build their
            instruction from similar examples; any other mode is looked up in
            ``mode_instructions``.
        df: DataFrame the similar examples are taken from.
        exclude_index: Index of the paper itself, passed on so it can be left
            out of its own examples.
        k: Number of similar examples for the ``*_similar`` modes.

    Returns:
        List with the first reply of each of the three runs.

    Raises:
        KeyError: If ``mode`` is neither a ``*_similar`` mode nor a key of
            ``mode_instructions``.
    """
    similar_modes = ('contextual_casual_similar', 'contextual_academic_similar')
    if mode not in similar_modes:
        query = mode_instructions[mode]
    else:
        casual_prompt, academic_prompt = contextual_query_similar(title, abstract, df, embeddings, exclude_index, k)
        query = casual_prompt if mode == 'contextual_casual_similar' else academic_prompt

    replies = []
    for _attempt in (1, 2, 3):
        output = p_mistral.run({"relevance_prompt": {"query": query, "title": title, "abstract": abstract}})
        replies.append(output['mistral']['replies'][0])

    return replies

In [None]:
# Per-column result lists: '<mode>_run1..3' for every mode
results_mistral = defaultdict(list) # 80 sec/iteration for 3 runs

log_data_mistral = {'errors': [], 'info': []}

# Iterate over rows
for index, row in tqdm(df.iloc[0:2165].iterrows(), total=2165):
    for mode in modes:
        try:
            predictions = relevancy_classification_mistral(row['title'], row['abstract'], mode, df, exclude_index=index)
        except Exception as e:
            # Any failure is recorded as three 'error' placeholders so all lists stay aligned
            print(f'errors started, {e}')
            predictions = ['error'] * 3
            log_data_mistral['errors'].append(f"Haystack Error at index {index}, mode {mode}: {e}")

        for run_number in (1, 2, 3):
            results_mistral[f'{mode}_run{run_number}'].append(predictions[run_number - 1])

# Appended once the loop has finished, whether or not individual calls failed
log_data_mistral['info'].append('Experiment completed successfully')

In [None]:
# Copy the Mistral predictions into df as '<mode>_run1..3' columns.
# NOTE(review): these names carry no model prefix and are the same names the
# Openchat cell of this notebook writes, so running both in one session
# overwrites these columns in df -- confirm this is intended.
for mode in modes:
    for run_tag in ('run1', 'run2', 'run3'):
        column = f'{mode}_{run_tag}'
        df[column] = results_mistral[column]
df


In [None]:
# Write df (including the Mistral run columns assigned in the previous cell) to CSV without the index
df.to_csv('mistral_relevancy.csv', index=False)

## Openchat
- openchat/openchat-3.6-8b-20240522

In [None]:
# Credentials and endpoint URL are read from the Colab user secrets
hf_api_token = userdata.get('HF_TOKEN')
openchat_endpoint_url = userdata.get('openchat_url')
# Separate prompt builder instance for the Openchat pipeline (same template)
prompt_builder = PromptBuilder(template=prompt_template)

# Generator that calls the Openchat inference endpoint; replies are capped at 50 new tokens
openchat_generator = HuggingFaceAPIGenerator(
    api_type="inference_endpoints",
    api_params={"url": openchat_endpoint_url},
    token=Secret.from_token(hf_api_token),
    generation_kwargs={"max_new_tokens": 50}
)

In [None]:
# Openchat pipeline: "relevance_prompt" (prompt builder) connected to "openchat" (generator)
p_openchat = Pipeline()
p_openchat.add_component("relevance_prompt", prompt_builder)
p_openchat.add_component("openchat", openchat_generator)
p_openchat.connect("relevance_prompt", "openchat")

In [None]:
def relevancy_classification_openchat(title, abstract, mode, df, exclude_index, k=3):
    """Classify one paper with the Openchat pipeline, three times in a row.

    Args:
        title: Paper title inserted into the prompt.
        abstract: Paper abstract inserted into the prompt.
        mode: Prompting mode. The two ``*_similar`` modes build their
            instruction from similar examples; any other mode is looked up in
            ``mode_instructions``.
        df: DataFrame the similar examples are taken from.
        exclude_index: Index of the paper itself, passed on so it can be left
            out of its own examples.
        k: Number of similar examples for the ``*_similar`` modes.

    Returns:
        List with the first reply of each of the three runs.

    Raises:
        KeyError: If ``mode`` is neither a ``*_similar`` mode nor a key of
            ``mode_instructions``.
    """
    similar_modes = ('contextual_casual_similar', 'contextual_academic_similar')
    if mode not in similar_modes:
        query = mode_instructions[mode]
    else:
        casual_prompt, academic_prompt = contextual_query_similar(title, abstract, df, embeddings, exclude_index, k)
        query = casual_prompt if mode == 'contextual_casual_similar' else academic_prompt

    replies = []
    for _attempt in (1, 2, 3):
        output = p_openchat.run({"relevance_prompt": {"query": query, "title": title, "abstract": abstract}})
        replies.append(output['openchat']['replies'][0])

    return replies

In [None]:
# Per-column result lists: '<mode>_run1..3' for every mode
results_openchat = defaultdict(list) #avg is 18 sec/iteration for one run

log_data_openchat = {'errors': [], 'info': []}

for index, row in tqdm(df.iloc[0:2165].iterrows(), total=2165):
    for mode in modes:
        try:
            predictions = relevancy_classification_openchat(row['title'], row['abstract'], mode, df, exclude_index=index)
        except Exception as e:
            # Failures are only logged (not printed) and stored as three 'error' placeholders
            predictions = ['error'] * 3
            log_data_openchat['errors'].append(f"Haystack Error at index {index}, mode {mode}: {e}")

        for run_number in (1, 2, 3):
            results_openchat[f'{mode}_run{run_number}'].append(predictions[run_number - 1])

# Appended once the loop has finished, whether or not individual calls failed
log_data_openchat['info'].append('Experiment completed successfully')

In [None]:
# Copy the Openchat predictions into df as '<mode>_run1..3' columns.
# NOTE(review): these names carry no model prefix and are the same names the
# Mistral cell of this notebook writes, so any Mistral columns already in df
# are overwritten here -- confirm this is intended.
for mode in modes:
    for run_tag in ('run1', 'run2', 'run3'):
        column = f'{mode}_{run_tag}'
        df[column] = results_openchat[column]
df


In [None]:
# Write df (including the Openchat run columns assigned in the previous cell) to CSV without the index
df.to_csv('openchat_relevancy.csv', index=False)

## Anthropic: Claude

In [None]:
pip install anthropic-haystack

In [None]:
from haystack_integrations.components.generators.anthropic import AnthropicGenerator

In [None]:
# Initialize components
# The Anthropic key is read from the Colab user secrets and wrapped in a haystack Secret.
claude_api_key = userdata.get('claude_api_key')
# Separate prompt builder instance for the Claude pipeline (same template)
prompt_builder = PromptBuilder(template=prompt_template)
claude_generator = AnthropicGenerator(api_key=Secret.from_token(claude_api_key), model="claude-3-sonnet-20240229") 

# Create pipeline
# Two components: "relevance_prompt" (prompt builder) connected to "claude" (generator)
p_claude = Pipeline()
p_claude.add_component("relevance_prompt", prompt_builder)
p_claude.add_component("claude", claude_generator)
p_claude.connect("relevance_prompt", "claude")

In [None]:
def relevancy_classification_claude(title, abstract, mode, df, exclude_index, k=3):
    """Classify one paper with the Claude pipeline (single run).

    Unlike the other model helpers in this notebook, this one runs the
    pipeline once, returns a single string, and handles pipeline errors
    itself.

    Args:
        title: Paper title inserted into the prompt.
        abstract: Paper abstract inserted into the prompt.
        mode: Prompting mode. The two ``*_similar`` modes build their
            instruction from similar examples; any other mode is looked up in
            ``mode_instructions``, with an empty instruction for unknown modes.
        df: DataFrame the similar examples are taken from.
        exclude_index: Index of the paper itself, passed on so it can be left
            out of its own examples.
        k: Number of similar examples for the ``*_similar`` modes.

    Returns:
        The first reply of the run, or the string ``'error'`` if the pipeline
        call raised (the exception is printed, not re-raised).
    """
    if mode in ('contextual_casual_similar', 'contextual_academic_similar'):
        casual_prompt, academic_prompt = contextual_query_similar(title, abstract, df, embeddings, exclude_index, k)
        if mode == 'contextual_casual_similar':
            query = casual_prompt
        else:
            query = academic_prompt
    else:
        query = f"{mode_instructions.get(mode, '')}"

    pipeline_inputs = {"relevance_prompt": {"query": query, "title": title, "abstract": abstract}}
    try:
        return p_claude.run(pipeline_inputs)['claude']['replies'][0]
    except Exception as e:
        print(f'Error during Claude API call: {e}')
        return 'error'

In [None]:
# One result list per mode (single run per paper)
results_claude = defaultdict(list) # 30 sec/it, 50/MINUTE, 1M tokens/day for claude-3-sonnet-20240229
log_data_claude = {
    'errors': [],
    'info': [],
}

for index, row in tqdm(df.iloc[0:2165].iterrows(), total=2165):
    for mode in modes:
        try:
            prediction = relevancy_classification_claude(row['title'], row['abstract'], mode, df, exclude_index=index)
        except Exception as e:
            # Reached only for errors the helper does not catch itself;
            # pipeline errors come back from the helper as the string 'error'.
            print(f'Error at index {index}, mode {mode}: {e}')
            prediction = 'error'
            log_data_claude['errors'].append(f"Error at index {index}, mode {mode}: {e}")

        results_claude[mode].append(prediction)

# Appended once the loop has finished, whether or not individual calls failed
log_data_claude['info'].append('Experiment completed successfully')

In [None]:

# Copy the Claude predictions into df, one 'claude_<mode>' column per mode
for mode in modes:
    claude_column = f'claude_{mode}'
    df[claude_column] = results_claude[mode]

In [None]:
# Write df (including the claude_* columns assigned in the previous cell) to CSV without the index
df.to_csv('claude_relevancy.csv', index=False)

## Vicuna is a fine-tuned version of the original LLaMA
- vicuna-7b-v1-5-tne

In [8]:
# Credentials and endpoint URL are read from the Colab user secrets
hf_api_token = userdata.get('HF_TOKEN')
vicuna_endpoint_url = userdata.get('vicuna_url')
# Separate prompt builder instance for the Vicuna pipeline (same template)
prompt_builder = PromptBuilder(template=prompt_template)

# Generator that calls the Vicuna inference endpoint. Besides the 50-token cap it
# also sets temperature and top_p, which the other endpoint generators here do not.
vicuna_generator = HuggingFaceAPIGenerator(
    api_type="inference_endpoints",
    api_params={"url": vicuna_endpoint_url},
    token=Secret.from_token(hf_api_token),
    generation_kwargs={"max_new_tokens": 50, "temperature": 0.7, "top_p": 0.9}
)

In [None]:
# Vicuna pipeline: "relevance_prompt" (prompt builder) connected to "vicuna" (generator)
p_vicuna = Pipeline()
p_vicuna.add_component("relevance_prompt", prompt_builder)
p_vicuna.add_component("vicuna", vicuna_generator)
p_vicuna.connect("relevance_prompt", "vicuna")

In [10]:
def relevancy_classification_vicuna(title, abstract, mode, df, exclude_index, k=3):
    """Classify one paper with the Vicuna pipeline (single run).

    Args:
        title: Paper title inserted into the prompt.
        abstract: Paper abstract inserted into the prompt.
        mode: Prompting mode. The two ``*_similar`` modes build their
            instruction from similar examples; any other mode is looked up in
            ``mode_instructions``.
        df: DataFrame the similar examples are taken from.
        exclude_index: Index of the paper itself, passed on so it can be left
            out of its own examples.
        k: Number of similar examples for the ``*_similar`` modes.

    Returns:
        One-element list holding the first reply of the run.

    Raises:
        KeyError: If ``mode`` is neither a ``*_similar`` mode nor a key of
            ``mode_instructions``.
    """
    similar_modes = ('contextual_casual_similar', 'contextual_academic_similar')
    if mode not in similar_modes:
        query = mode_instructions[mode]
    else:
        casual_prompt, academic_prompt = contextual_query_similar(title, abstract, df, embeddings, exclude_index, k)
        query = casual_prompt if mode == 'contextual_casual_similar' else academic_prompt

    output = p_vicuna.run({"relevance_prompt": {"query": query, "title": title, "abstract": abstract}})
    # Returned as a list so callers can index it like the multi-run helpers
    return [output['vicuna']['replies'][0]]

In [None]:
# One result list per mode (single run per paper)
results_vicuna = defaultdict(list) #avg is 5sec/iteration

log_data_vicuna = {'errors': [], 'info': []}

for index, row in tqdm(df.iloc[0:2165].iterrows(), total=2165):
    for mode in modes:
        try:
            predictions = relevancy_classification_vicuna(row['title'], row['abstract'], mode, df, exclude_index=index)
        except Exception as e:
            # A failure is stored as a single 'error' placeholder so the lists stay aligned
            predictions = ['error']
            print(f'Error at index {index}, mode {mode}: {e}')
            log_data_vicuna['errors'].append(f"Haystack Error at index {index}, mode {mode}: {e}")

        first_reply = predictions[0]
        results_vicuna[mode].append(first_reply)

# Appended once the loop has finished, whether or not individual calls failed
log_data_vicuna['info'].append('Experiment completed successfully')

In [None]:
# Copy the Vicuna predictions into df; each column is named after its mode (no model prefix)
for mode in modes:
    vicuna_predictions = results_vicuna[mode]
    df[mode] = vicuna_predictions

In [21]:
# Write df (including the per-mode Vicuna columns assigned in the previous cell) to CSV without the index
df.to_csv('vicuna_relevancy.csv', index=False)