In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
)

from peft import LoraConfig, PeftModel

In [None]:
# --- Notebook configuration -------------------------------------------------

# Checkpoint identifier handed to both the model and tokenizer loaders below.
model_name = "NousResearch/Llama-2-7b-chat-hf"

# Location of the fine-tuned LoRA adapter that is applied on top of the base model.
new_model = "Llama-2-finetuned-model"

# Passed as `device_map` when loading the base model.
# NOTE(review): presumably the "" key maps the whole model onto device 0 - confirm.
device_map = {"": 0}

In [None]:
# Load the pre-trained base checkpoint in half precision (FP16).
# The LoRA weights are merged into this model in a separate, later step.
load_options = dict(
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
base_model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

In [None]:
# Fine-tuned model: attach the LoRA adapter stored at `new_model` to the base
# model, then fold the adapter into it so `model` is whatever
# merge_and_unload() returns rather than the PEFT wrapper.
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

In [None]:
# Load LLAMA tokenizer
# `trust_remote_code` is intentionally not passed: the tokenizer class comes
# from the transformers library itself, so there is no reason to opt in to
# executing code shipped inside the model repository.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# This notebook only generates text. A decoder-only model continues from the
# last position of each row, so padding must go on the LEFT; right padding
# (a training-time setting) would make batched prompts continue from pad tokens.
tokenizer.padding_side = "left"

In [None]:
# Silence library log output: set the transformers logging verbosity to
# CRITICAL so that lower-severity messages (warnings, info) are not shown.
logging.set_verbosity(logging.CRITICAL)

In [None]:
# Run text generation pipeline with fine-tuned model

# Raw user question only - no chat markup here. The instruction wrapper is
# applied exactly once below. (The apostrophes were previously stored as
# mis-decoded byte sequences; they are restored to plain ASCII.)
prompt = "How did you do in this class and how was it in general? I'm starting this class next week and I'm really nervous. People I know who have taken this class either did really well with mid eighties to high 90s or failed/dropped it. I barely passed chem in grade 11 because of the quad system and my teacher. I also found the material kinda hard. I don't even remember to be honest because it went by super quickly. If you did well in this class please leave some tips and tricks in the comments and let me know how you passed lol!"

# Llama-2 chat format: one [INST] ... [/INST] pair around the user turn.
# No literal "<s>" is written because the tokenizer already prepends the
# beginning-of-sequence token; spelling it out would duplicate it.
chat_prompt = f"[INST] {prompt} [/INST]"

# `max_new_tokens` budgets only the generated reply. `max_length` would also
# count the (long) prompt and leave very little room for the answer.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
result = pipe(chat_prompt)

# The pipeline returns one dict per generated sequence; show the first one.
print(result[0]['generated_text'])