In [1]:
from template_dataset import get_prompt_dataset, get_eval_dataloader, collate_fn
from datasets import load_dataset
from peft import LoraConfig, PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import torch


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues


In [2]:
# Single source of truth for the base checkpoint: the tokenizer and the model
# weights are both resolved from `model_id`, so they cannot drift apart when
# the checkpoint name is changed.
model_id = "EleutherAI/polyglot-ko-1.3b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_8bit=False,
    torch_dtype=torch.float16,  # load the weights in half precision
    low_cpu_mem_usage=True,
).eval()  # evaluation mode: this notebook only runs inference

In [3]:
# Attach the fine-tuned LoRA adapter to the base model. The adapter is read
# from a local checkpoint directory.
peft_model_id = "./checkpoints/polyglot-ko-1.3b-lora-nsmc/2023-03-21_03:46:37"
config = PeftConfig.from_pretrained(peft_model_id)

# Wrap the base model with the adapter weights, then move the result to the GPU.
peft_model = PeftModel.from_pretrained(model, peft_model_id)
peft_model = peft_model.to('cuda')
# `peft_model` is the handle used for generation in the next cell.



In [4]:
# Sanity check: classify one hand-written review before running the full
# evaluation. The prompt is tokenized and moved to the model's device once.
batch = tokenizer("""
다음 문장은 긍정일까요 부정일까요?
아 이 영화 진짜 골때리네 정말 무슨 생각으로 개봉한거야?
정답:
""".strip(), return_tensors="pt").to(model.device)
with torch.cuda.amp.autocast():
    # Tensors are passed by name (not **batch) so that only the inputs the
    # model needs are forwarded; the attention mask is supplied explicitly,
    # matching how validation_step calls generate().
    output_tokens = peft_model.generate(
        input_ids=batch['input_ids'],
        attention_mask=batch['attention_mask'],
        max_new_tokens=2,  # only a short label is expected after "정답:"
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

print(f"Generated: {tokenizer.decode(output_tokens[0], skip_special_tokens=True)}")

Generated: 다음 문장은 긍정일까요 부정일까요?
아 이 영화 진짜 골때리네 정말 무슨 생각으로 개봉한거야?
정답:부정


In [5]:
# Label vocabulary: class id <-> label text the model is expected to generate.
ids_to_labels = {0: "부정", 1: "긍정"}
labels_to_ids = dict(zip(ids_to_labels.values(), ids_to_labels.keys()))
# Passed to get_prompt_dataset and used as max_new_tokens in validation_step.
max_label_len = 2

# Prompt template pieces. `suffix` is what validation_step splits on to
# recover the generated label.
# NOTE(review): `prefix` and `columns` are not passed to any call in this
# cell - presumably template_dataset has matching defaults; confirm.
prefix = "다음 문장은 긍정일까요 부정일까요?\n"
suffix = "\n정답:"
columns = ["document", "label"]

# Build the prompt-formatted NSMC dataset and an evaluation loader over its
# test split (batch size 256).
data = get_prompt_dataset(
    load_dataset("nsmc"),
    tokenizer,
    max_label_len=max_label_len,
    ids_to_labels=ids_to_labels,
)
eval_dataloader = get_eval_dataloader(data['test'], 256)

Found cached dataset nsmc (/home/girinman/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/girinman/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-e490f3a13bf254f4.arrow
Loading cached processed dataset at /home/girinman/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-d6184956744f867d.arrow


In [6]:
def validation_step(model, tokenizer, batch, first):
    """Generate label text for one batch and pair it with the gold labels.

    Args:
        model: object exposing ``generate`` and ``device``.
        tokenizer: tokenizer used to decode the generated ids; also supplies
            the EOS and PAD token ids.
        batch: mapping with ``'input_ids'``, ``'attention_mask'`` and
            ``'decoded_labels'`` entries.
        first: when true, print gold/predicted pairs for the first five
            examples of the batch.

    Returns:
        dict with ``'generated'`` (list of decoded label strings) and
        ``'labels'`` (``batch['decoded_labels']`` unchanged).
    """
    device = model.device
    with torch.cuda.amp.autocast():
        generated_ids = model.generate(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            max_new_tokens=max_label_len,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Each decoded sequence is split on `suffix`; whatever follows the last
    # occurrence is taken as the predicted label text.
    generated_txt = [
        tokenizer.decode(seq.tolist(), skip_special_tokens=True).split(suffix)[-1].strip()
        for seq in generated_ids
    ]

    labels = batch['decoded_labels']

    if first:
        # Preview at most five gold/prediction pairs.
        for shown, (gold, gen_txt) in enumerate(zip(labels, generated_txt)):
            if shown >= 5:
                break
            print(f'gold: {ids_to_labels[gold]} pred: {gen_txt}')

    return {'generated': generated_txt, 'labels': labels}

def validation_epoch_end(outputs):
    """Aggregate per-batch generations into accuracy, F1 and error rate.

    Args:
        outputs: iterable of dicts shaped like the return value of
            ``validation_step``: ``'generated'`` holds decoded label strings
            and ``'labels'`` holds the gold label ids.

    Returns:
        dict with:
            ``'accuracy'``: accuracy of predicted ids against gold ids.
            ``'f1'``: binary F1 when every generation is a known label and the
                classes are exactly {0, 1}; otherwise macro F1 over the class
                ids (plus the invalid id when any generation was invalid).
            ``'error'``: fraction of generations that are not a known label.
    """
    # Sentinel class id assigned to generations that match no known label.
    invalid_id = -100

    labels = []
    preds = []
    for step_output in outputs:
        labels.extend(step_output['labels'])
        # dict.get replaces a bare `except:` so that unrelated errors
        # (e.g. KeyboardInterrupt) are no longer silently swallowed.
        preds.extend(
            labels_to_ids.get(txt, invalid_id) for txt in step_output['generated']
        )

    class_ids = list(ids_to_labels)
    has_invalid = invalid_id in preds
    is_binary = (not has_invalid) and set(class_ids) == {0, 1}
    if has_invalid:
        # Score invalid generations as their own class in the macro average.
        class_ids.append(invalid_id)

    acc = accuracy_score(labels, preds)
    if is_binary:
        f1 = f1_score(labels, preds)
    else:
        f1 = f1_score(y_true=labels, y_pred=preds, labels=class_ids, average="macro")

    metrics = {}
    metrics['accuracy'] = acc
    metrics['f1'] = f1
    # "Accuracy" against an all-invalid target equals the share of invalid predictions.
    metrics['error'] = accuracy_score([invalid_id] * len(preds), preds)

    return metrics

def evaluate(eval_dataloader, eval_model=None):
    """Run generation over every batch and return the aggregated metrics.

    Args:
        eval_dataloader: sized iterable of batches accepted by
            ``validation_step``.
        eval_model: model to evaluate. Defaults to the notebook-level
            ``peft_model`` (the adapter-wrapped model).

    Returns:
        The metrics dict produced by ``validation_epoch_end``.
    """
    if eval_model is None:
        # Evaluate the adapter-wrapped model explicitly. The previous code
        # passed the bare `model` handle; presumably that only included the
        # adapter because PEFT modifies the base model in place - confirm.
        eval_model = peft_model

    eval_results = [
        validation_step(eval_model, tokenizer, batch, step == 0)
        for step, batch in tqdm(
            enumerate(eval_dataloader),
            desc="Generating predictions",
            total=len(eval_dataloader),
        )
    ]
    return validation_epoch_end(eval_results)

In [9]:
# Run the full evaluation and report the aggregated metrics.
print("***Evaluation begins***")
metrics = evaluate(eval_dataloader)
print("***Evaluation results***")
for title, key in (("accuracy", "accuracy"), ("f1", "f1"), ("error rate", "error")):
    print(f"{title}: {metrics[key]}")

***Evaluation begins***


Generating predictions:   1%|          | 1/196 [00:00<03:08,  1.04it/s]

gold: 긍정 pred: 긍정
gold: 부정 pred: 긍정
gold: 부정 pred: 부정
gold: 부정 pred: 부정
gold: 부정 pred: 부정


Generating predictions: 100%|██████████| 196/196 [03:08<00:00,  1.04it/s]

***Evaluation results***
accuracy: 0.91062
f1: 0.9110876788094622
error rate: 0.0



