In [1]:
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
import getpass
import os
import subprocess

# Never paste the Hugging Face token into the notebook. It is read from the
# HF_TOKEN environment variable when set; otherwise it is requested through a
# prompt whose input is not echoed into the cell output.
HUGGINGFACE_TOKEN = (
    os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")
).strip()

# An empty token cannot authenticate, so fail early with a clear message
# instead of handing an empty argument to the CLI.
if not HUGGINGFACE_TOKEN:
    raise ValueError("No Hugging Face token provided (set HF_TOKEN or enter it at the prompt).")

# Login to Hugging Face CLI; check=True raises CalledProcessError if it fails.
# NOTE(review): the token is still passed as a command-line argument and is
# therefore visible to other processes on the machine while the command runs.
subprocess.run(["huggingface-cli", "login", "--token", HUGGINGFACE_TOKEN], check=True)

In [3]:
!pip install --upgrade transformers datasets --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m72.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25h

## Load Llama model from Hugging Face

In [None]:
# Hub id of the checkpoint; the tokenizer and the weights are both loaded from it.
llama_model_name = "meta-llama/Llama-2-7b-hf"

# Load options for the weights: float16 tensors, with device placement left to
# the library via device_map="auto".
llama_load_options = {
    "device_map": "auto",
    "torch_dtype": torch.float16,
}

tokenizer_llama = AutoTokenizer.from_pretrained(llama_model_name)
model_llama = AutoModelForCausalLM.from_pretrained(llama_model_name, **llama_load_options)

## Load UniLC Climate Fever Dataset

In [16]:
from datasets import load_dataset
# Climate claims dataset, loaded by its Hub id.
# Dataset card: https://huggingface.co/datasets/tdiggelm/climate_fever
climate_fever_repo = "tdiggelm/climate_fever"
ds = load_dataset(path=climate_fever_repo)

README.md:   0%|          | 0.00/8.09k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/869k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/1535 [00:00<?, ? examples/s]

In [17]:
def _is_support_or_refute(example):
    """Return True when the example's claim_label is 0 or 1 (the 'support' and 'refute' labels)."""
    return example["claim_label"] in (0, 1)


# Drop every claim carrying any other label, then show the resulting splits and sizes.
ds_filtered = ds.filter(_is_support_or_refute)
print(ds_filtered)

Filter:   0%|          | 0/1535 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['claim_id', 'claim', 'claim_label', 'evidences'],
        num_rows: 907
    })
})


## Sample Data

In [18]:
# Preview the first record of the first split in the filtered dataset.
split_name = [*ds_filtered.keys()][0]
print(f"Viewing samples from the '{split_name}' split:")
preview_split = ds_filtered[split_name]
print(preview_split[0:1])

Viewing samples from the 'test' split:
{'claim_id': ['0'], 'claim': ['Global warming is driving polar bears toward extinction'], 'claim_label': [0], 'evidences': [[{'evidence_id': 'Extinction risk from global warming:170', 'evidence_label': 2, 'article': 'Extinction risk from global warming', 'evidence': '"Recent Research Shows Human Activity Driving Earth Towards Global Extinction Event".', 'entropy': 0.6931471824645996, 'votes': ['SUPPORTS', 'NOT_ENOUGH_INFO', None, None, None]}, {'evidence_id': 'Global warming:14', 'evidence_label': 0, 'article': 'Global warming', 'evidence': 'Environmental impacts include the extinction or relocation of many species as their ecosystems change, most immediately the environments of coral reefs, mountains, and the Arctic.', 'entropy': 0.0, 'votes': ['SUPPORTS', 'SUPPORTS', None, None, None]}, {'evidence_id': 'Global warming:178', 'evidence_label': 2, 'article': 'Global warming', 'evidence': 'Rising temperatures push bees to their physiological limits,

## Climate Claim Classification with Llama and Comparison Against GPT-4o-mini

In [None]:
from openai import OpenAI
import openai
import torch
import re
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import accuracy_score, f1_score

def create_prompt(claim):
    """
    Build the one-word classification prompt for a climate claim.

    Parameters:
    - claim: text of the claim; it is interpolated into the prompt unchanged.

    Returns:
    - A prompt string asking for 'correct' or 'wrong' that ends with the cue
      "Answer:" so the reply can follow it directly.
    """
    instructions = (
        "Is the claim about climate correct or wrong based on your judgement? "
        "Please only answer 'correct' or 'wrong'. One word, no sentence.\n"
    )
    # The cue was previously misspelled "AnsweR:"; it is now spelled normally.
    return f"{instructions}Claim: {claim}\nAnswer:"


def extract_after_answer(response_text, default=0):
    """
    Extracts the classification result ('correct' or 'wrong') from the response text.

    The search is case-insensitive and looks for "answer:" followed, after
    optional whitespace, by the whole word 'correct' or 'wrong'. The first such
    occurrence decides the result.

    Parameters:
    - response_text: text to search; None or an empty string counts as "no answer".
    - default: value returned when no valid answer is found. It defaults to 0,
      so an unparseable reply is counted as "correct" exactly as before; pass
      default=None to be able to tell parse failures apart.

    Returns:
    - 1 if the answer is "wrong"
    - 0 if the answer is "correct"
    - `default` if no valid answer is found
    """
    if not response_text:
        return default

    match = re.search(r'answer:\s*(correct|wrong)\b', response_text, flags=re.IGNORECASE)
    if match is None:
        return default

    return 1 if match.group(1).lower() == "wrong" else 0

def classify_claim_llama(claim, model, tokenizer):
    """
    Uses Llama to classify a claim as 'correct' (0) or 'wrong' (1).

    Parameters:
    - claim: text of the claim to classify.
    - model: causal language model; its `.device` and `.generate(...)` are used.
    - tokenizer: tokenizer for `model`. When "chat" appears in its
      `name_or_path`, the prompt is wrapped with the tokenizer's chat template.

    Returns:
    - The value extract_after_answer yields for the generated continuation
      (1 for "wrong", 0 for "correct", and its fallback value otherwise).
    """
    prompt = create_prompt(claim)
    print(f"3) The prompt is {prompt}")

    is_chat_model = "chat" in tokenizer.name_or_path
    if is_chat_model:
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
        # add_generation_prompt=True appends the tokens that open the assistant
        # turn, so the model answers instead of continuing the user message.
        input_text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    else:
        input_text = prompt

    # Text rendered by the chat template already carries its special tokens;
    # adding them again while tokenizing would duplicate them.
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        add_special_tokens=not is_chat_model,
    ).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=10)

    # generate() returns the prompt tokens followed by the new ones. Only the
    # continuation is decoded, so neither the claim text nor the chat template
    # can be mistaken for the model's answer.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    print("4) Llama Response:", response)

    # The parser looks for an "answer:" cue, which the continuation alone lacks.
    return extract_after_answer("Answer: " + response)


# Set your GPT-4o-mini credentials and base URL.
# The API key is never written into the notebook: an OPENAI_API_KEY already
# present in the environment is kept (it used to be overwritten with an empty
# string), and only when it is missing is it requested through a hidden prompt.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ").strip()
if not os.environ["OPENAI_API_KEY"]:
    raise ValueError("No OpenAI API key provided (set OPENAI_API_KEY or enter it at the prompt).")

# Single definition of the endpoint, used for both the environment variable
# and the client below.
OPENAI_BASE_URL = "https://cmu.litellm.ai/v1"
os.environ["OPENAI_BASE_URL"] = OPENAI_BASE_URL

client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    base_url=OPENAI_BASE_URL,
)

# ---------------------
# Example Loop
# ---------------------

predictions_llama = []
ground_truths = []

# The GPT labels serve as the reference for scoring, so they are requested at
# temperature 0 to keep them as stable as possible between runs.
GPT_LABEL_TEMPERATURE = 0.0

# We'll assume ds_filtered["test"] is your dataset filtered to claims you want to evaluate
# e.g., only "SUPPORTS" or "REFUTES" claims. Adjust as needed.
for example in ds_filtered["test"]:
    claim = example["claim"]

    # 1) GPT-4o-mini "ground truth"
    prompt = create_prompt(claim)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=50,
        temperature=GPT_LABEL_TEMPERATURE
    )

    # message.content may be None; treat that as an empty reply instead of
    # failing on the string concatenation below.
    gpt_response_text = response.choices[0].message.content or ""
    print("1) GPT-4 Response: ", gpt_response_text)

    # 2) The reply is a bare word, so the "Answer: " cue the parser expects is prepended.
    gt = extract_after_answer("Answer: " + gpt_response_text)
    print(f"2) GPT-4 Ground Truth: {gt}")
    print("---------------------------")

    ground_truths.append(gt)

    # 3) and 4) Llama prediction for the same claim (prompt and response are
    # printed inside classify_claim_llama).
    pred_llama = classify_claim_llama(claim, model_llama, tokenizer_llama)
    predictions_llama.append(pred_llama)

    # Print both labels side by side for inspection
    print(f"\nClaim: {claim}")
    print("Ground Truth (GPT-4o-mini):", "WRONG" if gt == 1 else "CORRECT")
    print("Llama Prediction:", "WRONG" if pred_llama == 1 else "CORRECT")
    print("-" * 50)

# ---------------------
# Evaluation
# ---------------------

# Score the Llama predictions against the GPT-4o-mini labels collected above:
# plain accuracy plus the macro-averaged F1.
acc_llama = accuracy_score(y_true=ground_truths, y_pred=predictions_llama)
f1_llama = f1_score(y_true=ground_truths, y_pred=predictions_llama, average='macro')

for metric_label, metric_value in (
    ("Llama Accuracy vs GPT-4o-mini Ground Truth:", acc_llama),
    ("Llama F1 Score vs GPT-4o-mini Ground Truth:", f1_llama),
):
    print(metric_label, metric_value)