In [1]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import transformers
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch

In [3]:
!git config --global credential.helper store

In [4]:
HF_TOKEN= userdata.get("HF_TOKEN")
login(HF_TOKEN, add_to_git_credential=True)

In [13]:
# instruct models

LLAMA = "meta-llama/Llama-3.2-1B-Instruct"
PHI3 = "microsoft/Phi-3-mini-4k-instruct"
GEMMA2 = "google/gemma-2-2b-it"
QWEN2 = "Qwen/Qwen2-7B-Instruct" # exercise for you
MIXTRAL = "mistralai/Mixtral-8x7B-Instruct-v0.1" # If this doesn't fit it your GPU memory, try others from the hub

In [6]:
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Tell a light-hearted joke for a room of Data Scientists"}
  ]

In [7]:
quant_config=BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_quant_type="nf4"

)

In [24]:
tokenizer =AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token= tokenizer.eos_token
inputs =tokenizer.apply_chat_template(messages,return_tensors="pt").to("cuda")

In [15]:
print(inputs)

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,    966,   3799,    220,   2366,     19,    271,   2675,    527,
            264,  11190,  18328, 128009, 128006,    882, 128007,    271,  41551,
            264,   3177,  70395,  22380,    369,    264,   3130,    315,   2956,
          57116, 128009]], device='cuda:0')


In [25]:
model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config= quant_config)

In [19]:
memory = model.get_memory_footprint()/1e6
print(f'Memory footprint:{memory:,.1f} MB')

Memory footprint:1,012.0 MB


In [20]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-0

In [22]:
outputs=model.generate(inputs, max_new_tokens=80)
print(tokenizer.decode (outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 30 Dec 2024

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell a light-hearted joke for a room of Data Scientists<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Here's a joke tailored for a room of Data Scientists:

Why did the data scientist break up with his girlfriend?

Because he wanted to analyze their relationship, and he found it was just a correlation, not an expectation!

(Sorry, I know it's a bit of a nerdy pun, but I hope it brought a smile to their faces!)<|eot_id|>


In [23]:
del inputs , outputs, model
torch.cuda.empty_cache()

In [29]:
def generate (model, messages):
  tokenizer =AutoTokenizer.from_pretrained(LLAMA)
  tokenizer.pad_token= tokenizer.eos_token
  inputs =tokenizer.apply_chat_template(messages,return_tensors="pt").to("cuda")
  streamer = TextStreamer(tokenizer)
  model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config= quant_config)
  memory = model.get_memory_footprint()/1e6
  print(f'Memory footprint:{memory:,.1f} MB')
  outputs=model.generate(inputs, max_new_tokens=80, streamer = streamer)
  del inputs , outputs, model, streamer
  torch.cuda.empty_cache()


In [31]:
generate(PHI3,messages)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Memory footprint:1,012.0 MB
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 30 Dec 2024

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell a light-hearted joke for a room of Data Scientists<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Here's a light-hearted joke for a room of data scientists:

Why did the data scientist break up with his girlfriend?

Because he realized he had a lot of 'data' to process, but she was just a 'variable' in his life. Now he's just seasoning' his heart!

(I hope that brings a smile to their faces!)<|eot_id|>


UnboundLocalError: local variable 'inputs' referenced before assignment

In [None]:
# instruct models

LLAMA = "meta-llama/Llama-3.2-1B-Instruct"
PHI3 = "microsoft/Phi-3-mini-4k-instruct"
GEMMA2 = "google/gemma-2-2b-it"
QWEN2 = "Qwen/Qwen2-7B-Instruct" # exercise for you
MIXTRAL = "mistralai/Mixtral-8x7B-Instruct-v0.1" # If this doesn't fit it your GPU memory, try others from the hub