In [11]:
# !jupyter nbconvert --to python llama_test.ipynb

In [1]:
import re
import torch
import datasets
import torch
import os
import sys
import time
import pandas as pd
from typing import List
from tqdm import tqdm
from transformers import LlamaTokenizer
from inference.safety_utils import get_safety_checker
from inference.model_utils import load_model, load_peft_model, load_llama_from_config


In [2]:
class Config():
    model_name: str='/root/Model/llama_2_13B_chat'
    # peft_model: str='/root/Codes/QA-llama-recipes/llama2_lora_autotrain_2/checkpoint-463'
    peft_model: str=None
    quantization: bool=True
    max_new_tokens: int=6 #The maximum numbers of tokens to generate
    seed: int=42 #seed value for reproducibility
    do_sample: bool=True #Whether or not to use sampling ; use greedy decoding otherwise.
    min_length: int=None #The minimum length of the sequence to be generated, input prompt + min_new_tokens
    use_cache: bool=True  #[optional] Whether or not the model should use the past last key/values attentions Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.
    top_p: float=0.9 # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
    temperature: float=0.01 # [optional] The value used to modulate the next token probabilities.
    top_k: int=50 # [optional] The number of highest probability vocabulary tokens to keep for top-k-filtering.
    repetition_penalty: float=2.0 #The parameter for repetition penalty. 1.0 means no penalty.
    length_penalty: int=1 #[optional] Exponential penalty to the length that is used with beam-based generation. 


conf = Config()

In [3]:
# Set the seeds for reproducibility
torch.cuda.manual_seed(conf.seed)
torch.manual_seed(conf.seed)

model = load_model(conf.model_name, conf.quantization)
tokenizer = LlamaTokenizer.from_pretrained(conf.model_name)
tokenizer.add_special_tokens(
    {
        "pad_token": "<PAD>",
    }
)

if conf.peft_model:
    model = load_peft_model(model, conf.peft_model)

model.eval()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120, padding_idx=0)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear8bitLt(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear8bitLt(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear8bitLt(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear8bitLt(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear8bitLt(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSN

In [4]:
def load_test_dataset(file):
    test_df = pd.read_csv(file)
    dataset = datasets.Dataset.from_pandas(test_df)

    def func(r):
        return {
            "text": r['input'],
            "answer": r['output']
        }

    return dataset.map(func, remove_columns=list(dataset.features))

test_dataset = load_test_dataset('./data/autotrain/valid.csv')
# test_dataset = load_test_dataset('./data/autotrain/valid_no_context.csv')

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [5]:
print(test_dataset['text'][0])

<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>

### Instruction:
According to the context, select the most accurate answer for the quesiton.
### Question:
Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed "missing baryonic mass" discrepancy in galaxy clusters?
### Options:
(A) MOND is a theory that reduces the observed missing baryonic mass in galaxy clusters by postulating the existence of a new form of matter called "fuzzy dark matter.

In [6]:
def precision_at_k(r, k):
    """Precision at k"""
    assert k <= len(r)
    assert k != 0
    return sum(int(x) for x in r[:k]) / k

def MAP_at_3(predictions, true_items):
    """Score is mean average precision at 3"""
    U = len(predictions)
    map_at_3 = 0.0
    for u in range(U):
        user_preds = predictions[u]
        user_true = true_items[u]
        user_results = [1 if item == user_true else 0 for item in user_preds]
        for k in range(min(len(user_preds), 3)):
            map_at_3 += precision_at_k(user_results, k+1) * user_results[k]
    return map_at_3 / U


def get_ans(text):
    inputs = tokenizer(text, return_tensors='pt')
    logits = model(input_ids=inputs['input_ids'].cuda(), attention_mask=inputs['attention_mask'].cuda()).logits[0, -1]
    
    # Create a list of tuples having (logit, 'option') format
    options_list = zip([logits[tokenizer(x).input_ids[-1]] for x in 'ABCDE'], list('ABCDE'))

    del inputs
    del logits

    options_list = sorted(options_list, reverse=True)
    ans_list = []
    for i in range(3):
        ans_list.append(options_list[i][1])
    
    del options_list

    return ans_list


import gc

preds = []
count = 0

with torch.no_grad():
    for x in tqdm(test_dataset['text'], total=len(test_dataset)):
        if count % 10 == 0:
            gc.collect()
            torch.cuda.empty_cache()
        pred = ' '.join(get_ans(x))
        preds.append(pred)
        count += 1

100%|██████████| 200/200 [09:18<00:00,  2.79s/it]


In [7]:
print(MAP_at_3(preds, test_dataset['answer']))

0.6933333333333329


In [8]:
from sklearn.metrics import accuracy_score

accuracy_score([x[0] for x in preds], test_dataset['answer'])

0.645