In [1]:
import re
import torch
import datasets
import torch
import os
import sys
import time
from typing import List
from transformers import LlamaTokenizer
from inference.safety_utils import get_safety_checker
from inference.model_utils import load_model, load_peft_model, load_llama_from_config


In [2]:
class Config():
    model_name: str='/root/Model/llama-2-7b'
    peft_model: str='/root/Model/llama-output/checkpoint-384'
    # peft_model: str=None
    quantization: bool=True
    max_new_tokens: int=6 #The maximum numbers of tokens to generate
    seed: int=42 #seed value for reproducibility
    do_sample: bool=True #Whether or not to use sampling ; use greedy decoding otherwise.
    min_length: int=None #The minimum length of the sequence to be generated, input prompt + min_new_tokens
    use_cache: bool=True  #[optional] Whether or not the model should use the past last key/values attentions Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.
    top_p: float=0.9 # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
    temperature: float=0.01 # [optional] The value used to modulate the next token probabilities.
    top_k: int=50 # [optional] The number of highest probability vocabulary tokens to keep for top-k-filtering.
    repetition_penalty: float=2.0 #The parameter for repetition penalty. 1.0 means no penalty.
    length_penalty: int=1 #[optional] Exponential penalty to the length that is used with beam-based generation. 


conf = Config()

In [3]:
# Set the seeds for reproducibility
torch.cuda.manual_seed(conf.seed)
torch.manual_seed(conf.seed)

model = load_model(conf.model_name, conf.quantization)
tokenizer = LlamaTokenizer.from_pretrained(conf.model_name)
tokenizer.add_special_tokens(
    {
        "pad_token": "<PAD>",
    }
)

if conf.peft_model:
    model = load_peft_model(model, conf.peft_model)

model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear8bitLt(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
            

In [4]:
def inference(user_prompt):
    batch = tokenizer(user_prompt, return_tensors="pt")
    batch = {k: v.to("cuda") for k, v in batch.items()}
    start = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **batch,
            max_new_tokens=conf.max_new_tokens,
            do_sample=conf.do_sample,
            top_p=conf.top_p,
            temperature=conf.temperature,
            min_length=conf.min_length,
            use_cache=conf.use_cache,
            top_k=conf.top_k,
            repetition_penalty=conf.repetition_penalty,
            length_penalty=conf.length_penalty,
        )
    e2e_inference_time = (time.perf_counter()-start)*1000
    # print(f"the inference time is {e2e_inference_time} ms")
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [5]:
pattern = re.compile(r'Answer:\nOption[ :-]*\(*([\w])\)*')

def get_result(response):
    # extract the answer
    res = pattern.findall(response.strip())
    if len(res) == 1:
        answer = res[0]  # 'A', 'B', ...
    else:
        answer = "FAILED"

    return answer.capitalize()

get_result('Answer:\nOption ((C).')

'B'

In [6]:
def load_test_dataset():
    dataset = datasets.load_dataset("metaeval/ScienceQA_text_only", split='test')
    choice_prefixes = [chr(ord('A') + i) for i in range(26)] # A-Z

    prompt = '''Context: {hint}\nQuestion: {question}\nOptions: {options}\n---\nAnswer:\n'''

    def format_options(options):
        return '\n'.join([f'({c}) {o}' for c, o in zip(choice_prefixes, options)])

    def apply_prompt_template(r):
        options = format_options(r['choices'])
        return {
            "text": prompt.format(
                hint=r["hint"],
                question=r["question"],
                options=options,
            ),
            "answer": f"({choice_prefixes[r['answer']]})"
        }

    return dataset.map(apply_prompt_template, remove_columns=list(dataset.features))
        
test_dataset = load_test_dataset()

In [7]:
id = 100
print(test_dataset['answer'][id])

response = inference(test_dataset['text'][id])
print(response)
get_result(response.split('---')[1])

(A)


Context: Read the description of a trait.
Michelle has wavy hair.
Question: What information supports the conclusion that Michelle inherited this trait?
Options: (A) Michelle's parents were born with wavy hair. They passed down this trait to Michelle.
(B) Michelle and her mother both have short hair.
---
Answer:
Option A


'A'

In [11]:
from tqdm import tqdm

responses, preds = [], []

for x in tqdm(test_dataset['text']):
    response = inference(x).split('---')[1]
    responses.append(response)
    preds.append(get_result(response))

100%|██████████| 2224/2224 [38:23<00:00,  1.04s/it]


In [12]:
from sklearn.metrics import accuracy_score

accuracy_score([x[1:2] for x in test_dataset['answer']], preds)

0.2972122302158273

In [39]:
new_preds = [get_result(x) for x in responses]
accuracy_score([x[1:2] for x in test_dataset['answer']], new_preds)

0.3035071942446043

In [40]:
for i, x in zip(range(100), new_preds[:100]):
    print(i,x)

0 C
1 B
2 B
3 A
4 C
5 A
6 A
7 D
8 B
9 B
10 Failed
11 A
12 E
13 Failed
14 B
15 B
16 E
17 C
18 Failed
19 Failed
20 A
21 C
22 Failed
23 B
24 A
25 Failed
26 A
27 C
28 A
29 B
30 B
31 Failed
32 B
33 B
34 B
35 A
36 Failed
37 Failed
38 C
39 C
40 A
41 B
42 A
43 C
44 C
45 C
46 Failed
47 B
48 B
49 Failed
50 Failed
51 Failed
52 B
53 C
54 Failed
55 B
56 C
57 A
58 Failed
59 B
60 C
61 Failed
62 B
63 Failed
64 B
65 B
66 C
67 Failed
68 A
69 Failed
70 B
71 B
72 A
73 C
74 E
75 Failed
76 C
77 A
78 B
79 B
80 A
81 B
82 Failed
83 B
84 A
85 A
86 Failed
87 C
88 A
89 Failed
90 D
91 C
92 B
93 D
94 A
95 Failed
96 B
97 Failed
98 C
99 Failed


In [37]:
for i, x in zip(range(100), responses[:100]):
    print(i,x.replace('\n', ''))

0 Answer:Option ((C)}.
1 Answer:Option B
2 Answer:Option B
3 Answer:Option A
4 Answer:Option ((C).
5 Answer:Option A
6 Answer:Option A
7 Answer:Option d
8 Answer:Option B
9 Answer:Option B
10 Answer:Streaked is between ser
11 Answer:Option A
12 Answer:Option ((E))
13 Answer:Option ()
14 Answer:Option B
15 Answer:Option B
16 Answer:Option ((Earth's
17 Answer:Option ((C).
18 Answer:## Answer
19 Answer:This question is phrased as
20 Answer:Option A
21 Answer:Option ((C).
22 Answer:Option ()
23 Answer:Option B
24 Answer:Option A
25 Answer:Option ()
26 Answer:Option A
27 Answer:Option c
28 Answer:Option A
29 Answer:Option B
30 Answer:Option - B
31 Answer:Most mamas have
32 Answer:Option b
33 Answer:Option B
34 Answer:Option B
35 Answer:Option A
36 Answer:Open SyLLABE -
37 Answer:Option ()
38 Answer:Option ((C
39 Answer:Option ((C))
40 Answer:Option A
41 Answer:Option b
42 Answer:Option ((a))
43 Answer:Option ((C).
44 Answer:Option:(C)
45 Answer:Option ((C).
46 Answer:Verify that motion and 