In [11]:
# !jupyter nbconvert --to python llama_test.ipynb

In [3]:
import re
import torch
import datasets
import torch
import os
import sys
import time
from typing import List
from transformers import LlamaTokenizer
from inference.safety_utils import get_safety_checker
from inference.model_utils import load_model, load_peft_model, load_llama_from_config


In [1]:
class Config:
    """Paths and generation settings shared by every cell in this notebook."""

    # ---- what to load ----
    model_name: str = '/root/Model/llama-2-7b'
    # Adapter checkpoint applied on top of the base model; a falsy value
    # (e.g. None) makes the loading cell skip the adapter.
    peft_model: str = '/root/Model/llama-output-01/checkpoint-850'
    quantization: bool = True

    # ---- generation ----
    max_new_tokens: int = 6  # maximum number of tokens to generate
    seed: int = 42  # seed value for reproducibility
    do_sample: bool = True  # sample when True; greedy decoding otherwise
    min_length: int = None  # minimum length of the generated sequence (prompt + min_new_tokens)
    use_cache: bool = True  # [optional] reuse past key/value attentions (if applicable to the model) to speed up decoding
    top_p: float = 0.9  # [optional] if < 1, keep only the smallest set of most probable tokens whose probabilities add up to top_p or higher
    temperature: float = 0.01  # [optional] modulates the next-token probabilities
    top_k: int = 50  # [optional] number of highest-probability vocabulary tokens kept for top-k filtering
    repetition_penalty: float = 2.0  # repetition penalty; 1.0 means no penalty
    length_penalty: int = 1  # [optional] exponential length penalty used with beam-based generation


# Single shared settings object read by the cells below.
conf = Config()

In [4]:
# Fix the torch seeds (CUDA first, then the default generator) so sampled
# generations can be reproduced.
torch.cuda.manual_seed(conf.seed)
torch.manual_seed(conf.seed)

# Base model and its tokenizer both come from the same checkpoint directory.
model = load_model(conf.model_name, conf.quantization)
tokenizer = LlamaTokenizer.from_pretrained(conf.model_name)

# Register a padding token on the tokenizer.
# NOTE(review): the model's embedding table is not resized here; that looks
# harmless while prompts are tokenized one at a time without padding, but
# confirm before adding batched/padded inference.
tokenizer.add_special_tokens({"pad_token": "<PAD>"})

# Wrap the base model with the fine-tuned adapter only when one is configured.
model = load_peft_model(model, conf.peft_model) if conf.peft_model else model

# Switch to evaluation mode; as the cell's last expression this also displays
# the module tree.
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear8bitLt(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
            

In [5]:
def inference(user_prompt):
    """Generate a completion for one prompt and return the decoded text.

    The prompt is tokenized, its tensors are moved to the "cuda" device, and
    ``model.generate`` is called under ``torch.no_grad()`` with the generation
    settings stored on ``conf``.

    Args:
        user_prompt: Prompt text to feed to the model.

    Returns:
        The first generated sequence decoded with special tokens skipped.
        In this notebook the decoded text contains the prompt followed by the
        newly generated tokens, which is why callers split it on '---'.
    """
    batch = tokenizer(user_prompt, return_tensors="pt")
    batch = {k: v.to("cuda") for k, v in batch.items()}
    with torch.no_grad():
        outputs = model.generate(
            **batch,
            max_new_tokens=conf.max_new_tokens,
            do_sample=conf.do_sample,
            top_p=conf.top_p,
            temperature=conf.temperature,
            min_length=conf.min_length,
            use_cache=conf.use_cache,
            top_k=conf.top_k,
            repetition_penalty=conf.repetition_penalty,
            length_penalty=conf.length_penalty,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [6]:
# Matches "Answer:\nOption", optional separators (space, colon, dash) and
# opening parentheses, then exactly one ASCII letter. The trailing \b rejects
# a letter that merely starts a longer word ("Option Because ..."), and the
# explicit letter class rejects digits/underscores, which \w would accept.
pattern = re.compile(r'Answer:\nOption[ :-]*\(*([A-Za-z])\b\)*')

def get_result(response):
    """Extract the chosen option letter from a model response.

    Args:
        response: Text expected to contain "Answer:" followed by a newline
            and something like "Option B", "Option: (b)" or "Option ((C).".

    Returns:
        The option letter in upper case ('A', 'B', ...) when the pattern
        matches exactly once; otherwise the string "Failed".
    """
    matches = pattern.findall(response.strip())
    if len(matches) != 1:
        # No match, or an ambiguous response with several matches.
        return "Failed"
    return matches[0].upper()

get_result('Answer:\nOption ((C).')

'C'

In [7]:
def load_test_dataset():
    """Load the ScienceQA text-only test split and render each row as a prompt.

    Returns:
        The mapped dataset with all original columns removed, leaving two
        fields per row: ``text`` (the filled-in prompt template) and
        ``answer`` (the gold option letter wrapped in parentheses).
    """
    raw_split = datasets.load_dataset("metaeval/ScienceQA_text_only", split='test')
    letters = [chr(code) for code in range(ord('A'), ord('A') + 26)]  # A-Z

    template = '''Context: {hint}\nQuestion: {question}\nOptions: {options}\n---\nAnswer:\n'''

    def render_row(row):
        # One "(X) choice" line per option, lettered in order.
        option_lines = []
        for letter, choice in zip(letters, row['choices']):
            option_lines.append(f'({letter}) {choice}')

        filled = template.format(
            hint=row["hint"],
            question=row["question"],
            options='\n'.join(option_lines),
        )
        # The gold answer is stored as an index into the choices.
        gold = f"({letters[row['answer']]})"
        return {"text": filled, "answer": gold}

    return raw_split.map(render_row, remove_columns=list(raw_split.features))


test_dataset = load_test_dataset()

In [8]:
# Spot-check a single test example: print the gold answer and the raw decoded
# output, then show the parsed option letter as the cell result.
sample_idx = 100  # not called `id`, which would shadow the built-in id()
sample = test_dataset[sample_idx]  # one row lookup instead of loading whole columns
print(sample['answer'])

response = inference(sample['text'])
print(response)
# The decoded output echoes the prompt; the part after the '---' separator
# starts at "Answer:".
get_result(response.split('---')[1])

(A)


Context: Read the description of a trait.
Michelle has wavy hair.
Question: What information supports the conclusion that Michelle inherited this trait?
Options: (A) Michelle's parents were born with wavy hair. They passed down this trait to Michelle.
(B) Michelle and her mother both have short hair.
---
Answer:
Option B


'B'

In [9]:
from tqdm import tqdm

# Run the model over every test prompt, keeping both the raw text after the
# '---' separator and the option letter parsed from it.
responses, preds = [], []

for prompt_text in tqdm(test_dataset['text']):
    completion = inference(prompt_text).split('---')[1]
    responses.append(completion)
    preds.append(get_result(completion))

100%|██████████| 2224/2224 [35:52<00:00,  1.03it/s]


In [10]:
from sklearn.metrics import accuracy_score

# Gold answers are stored as "(A)"; slice out the bare letter so they are
# comparable with the parsed predictions.
gold_letters = [gold[1:2] for gold in test_dataset['answer']]
print(accuracy_score(gold_letters, preds))

0.6344424460431655


In [12]:
# Re-parse the stored raw responses with the current get_result and score
# them again without re-running the model.
new_preds = list(map(get_result, responses))
gold_letters = [gold[1:2] for gold in test_dataset['answer']]
accuracy_score(gold_letters, new_preds)

0.6344424460431655

In [16]:
# Show every response that could not be parsed: a parsed answer is a single
# letter, so anything longer is the failure marker. Newlines are stripped so
# each response prints on one line.
for row, (parsed, raw) in enumerate(zip(new_preds, responses)):
    if len(parsed) <= 1:
        continue
    print(row, parsed, raw.replace('\n', ''))

92 Failed Answer:Observe trevors physical
189 Failed Answer:Future
262 Failed Answer:Option 
288 Failed Answer:Option 
417 Failed Answer:Option ***
440 Failed Answer:Metaphore
521 Failed Answer:Option 
567 Failed Answer:The liquid in this pressure cook
664 Failed Answer:Mass can be measured in
736 Failed Answer:Option
737 Failed Answer:The correct answer is Option B
860 Failed Answer:Option 
1033 Failed Answer:Carbon is a chemical comp
1197 Failed Answer:Option 
1318 Failed Answer:Option 
1418 Failed Answer:Option--
1688 Failed Answer:Option ***
1754 Failed Answer:Option
1776 Failed Answer:
1925 Failed Answer:Pouring out water from
1983 Failed Answer:Option 
2008 Failed Answer:The gravitational force exert
2113 Failed Answer:Lawyer
2156 Failed Answer:Cell walls are made of dead
2167 Failed Answer:Option 
