In [None]:
!pip install transformers torch accelerate bitsandbytes

In [None]:
pip install --upgrade transformers

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig, pipeline

In [4]:
# Hugging Face Hub repository ID of the checkpoint used throughout this
# notebook; it is passed to the tokenizer, config and model `from_pretrained`
# calls, each of which also receives an access token.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [5]:
# Read the Hugging Face access token from Colab's per-user secret store
# instead of hardcoding it in the notebook. The secret must be saved under
# the name 'huggingface_token'.
# NOTE(review): this cell only works inside Google Colab; do not print or
# display `my_secret_token`, since cell outputs are saved with the notebook.
from google.colab import userdata
my_secret_token = userdata.get('huggingface_token')

In [6]:
# Quantization settings handed to the model loader via `quantization_config`.
# NOTE(review): bfloat16 compute presumably needs a GPU with bf16 support —
# confirm on the target runtime.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                       # load the weights in 4-bit form
    bnb_4bit_quant_type="nf4",               # 4-bit quantization type: "nf4"
    bnb_4bit_use_double_quant=True,          # enable double quantization
    bnb_4bit_compute_dtype=torch.bfloat16,   # dtype used for computation
)

In [7]:
# Load the tokenizer that belongs to `model_id`, authenticating with the
# access token.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=my_secret_token)

# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably because this tokenizer ships without a pad token —
# confirm.
tokenizer.pad_token = tokenizer.eos_token

In [8]:
# Load the model configuration and override its RoPE scaling with a linear
# scheme (factor 8.0).
# NOTE(review): `config` is not passed to any `from_pretrained` / `pipeline`
# call in the visible cells of this notebook, so as written this override has
# no effect on the loaded model. Either pass `config=config` when loading the
# model or drop this cell. Before wiring it in, confirm that replacing the
# checkpoint's own rope_scaling entry is actually intended.
config = AutoConfig.from_pretrained(model_id, token = my_secret_token)
config.rope_scaling = { "type": "linear", "factor": 8.0 }

In [None]:
# Load the causal-LM weights for `model_id` with the 4-bit settings from
# `bnb_config`. `device_map="auto"` leaves device placement to the library,
# and the access token authenticates the download.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=my_secret_token,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# Build the text-generation pipeline around the already-loaded `model` object.
# Passing the `model_id` string here instead would make `pipeline` load a
# second copy of the checkpoint by name, without the quantization_config,
# device_map and token that were given to `from_pretrained`, leaving the
# quantized model unused.
text_generator = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    max_new_tokens = 128   # upper bound on tokens generated per call
)

In [None]:
def get_response(prompt):
  """Run `prompt` through the notebook's `text_generator` pipeline.

  Args:
    prompt: Text passed unchanged to `text_generator`.

  Returns:
    The 'generated_text' field of the first result returned by the pipeline.
  """
  outputs = text_generator(prompt)
  # Only the first returned candidate is used.
  return outputs[0]['generated_text']

In [None]:
# Smoke test: send one prompt through `get_response` and print the text that
# comes back.
prompt = "Explain the concept of fine-tuning of Large Language Models"
llama_response = get_response(prompt)
print(llama_response)