In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1"
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

MODEL = "meta-llama/Llama-2-13b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(MODEL,
                                            torch_dtype=torch.float16,
                                            trust_remote_code=True,
                                            device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:33<00:00, 11.20s/it]


In [24]:
from utils import load_jsonl

gt = load_jsonl("benchmark_v_0_5.jsonl")
pred = load_jsonl("llama2-13b-posthoc.jsonl")

In [3]:
template = \
"""<s>[INST] You are an assessor to give judgement on a reasoning problem.
Here is the text to be assessed:

<text>
{input_reasoning}
</text>

Does the above text mention or contain the following reference reasoning step:

<reference>
{reference}
</reference>

Answer (yes or no):[/INST]
"""

In [18]:
def llm_generate(input_txt):
    sequences = pipeline(
            input_txt,
            max_new_tokens=16,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            do_sample=False,
            return_full_text=False
    )
    return sequences[0]['generated_text']

In [15]:
import re
from nltk import sent_tokenize
def process_human_reasoning(human_reason):
    human_reason = re.sub(r'\n+', '\n', human_reason).strip()
    # items = sent_tokenize(human_reason)
    items = human_reason.split("\n")
    items = [re.sub(r'^\d+\.\s*', '', item).strip() for item in items]
    # items = [item for item in items if len(item) > 0]
    return items
print("\n".join(process_human_reasoning(gt[1]["human_reason"])))

The summary mentions five ambitious clubs are locked in a bid for two champions league places.
But in the article it never mentions the word 'bid', the truth in article is five ambitious clubs are locked in a scramble for two Champions League places.
As the summary has the word should not be contained, it is inconsistent with article.
Therefore, the answer is no, the summary is not consistent with the article.


In [19]:
sample1 = template.format(input_reasoning=pred[1]["reason"], reference=process_human_reasoning(gt[1]["human_reason"])[1])

print(llm_generate(sample1))




No, the text does not mention or contain the reference reasoning step you provided


In [30]:
import string

def swap_punctuation(input:str):
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    return input.translate(translator)

def answer_parse(input:str):
    words = swap_punctuation(input.lower()).split()
    try: 
        assert ("not" in words) or ("no" in words) or ("yes" in words)
    except:
        print(words)

    return 0 if ("no" in words) or ("not" in words) else 1

In [35]:
from tqdm import tqdm
scores = []
for idx, human in enumerate(tqdm(gt[1266:])):
    predicted = pred[idx]["reason"]
    # print(predicted)
    ground_truth = process_human_reasoning(human["human_reason"])
    points = []
    for item in ground_truth:
        answer = llm_generate(template.format(input_reasoning=predicted, reference=item))
        points.append(answer_parse(answer))
    # print(points)
    scores.append(sum(points)/len(points))

 95%|█████████▌| 1268/1330 [59:23<02:54,  2.81s/it] 


AssertionError: 