In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

 ### Running the model on a CPU

In [None]:
# Model identifier passed to `from_pretrained` below for the CPU walkthrough.
checkpoint = "google/gemma-2b"

In [None]:
# Load the tokenizer and the causal-LM weights for `checkpoint`.
# No `device_map` is given here, so the model keeps the library's default
# placement (presumably CPU, as this section's heading indicates).
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
)

In [None]:
# Prompt to complete, encoded as PyTorch tensors ("pt").
input_text = "Write a function to sort a list "
input_ids = tokenizer(
    input_text,
    return_tensors="pt",
)

# Show what the tokenizer produced before feeding it to the model.
print(input_ids)

In [None]:
# Generate a completion for the encoded prompt and print the decoded text.
#
# `max_new_tokens` is set explicitly: without any length argument `generate`
# falls back to the library's default length limit, which is very short and
# cuts the completion off after a handful of tokens. 200 matches the cap used
# in the GPU sections of this notebook; generation still stops earlier if the
# model emits its end-of-sequence token.
outputs = model.generate(**input_ids, max_new_tokens=200)

# `outputs[0]` is the first (and only) sequence in the returned batch.
print(tokenizer.decode(outputs[0]))

 ### Running the model on a GPU

In [2]:
# Model identifier for the GPU walkthrough; redefined here so this section
# can be run without executing the CPU section first.
checkpoint = "google/gemma-2b"

In [3]:
# Load the tokenizer, then the model weights for `checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# `device_map="auto"` delegates weight placement to the library instead of
# leaving the model on the default device.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="auto",
)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# Prompt to complete: the start of a Python function definition.
input_text = "def print_hello_world(): "

# Encode the prompt as PyTorch tensors and move them to the device the model
# lives on. The model was loaded with `device_map="auto"`, so its device is
# read from `model.device` rather than hard-coding "cuda"; this keeps the cell
# working when the weights end up somewhere other than the default CUDA device.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

In [14]:
# Sampling (`do_sample=True`) draws tokens at random, so seed PyTorch's RNG
# first; otherwise every re-run of this cell prints a different completion.
# (Repeatability is expected on the same hardware/software setup.)
torch.manual_seed(42)

# Generate at most 200 new tokens after the prompt; generation stops earlier
# if the model emits its end-of-sequence token.
outputs = model.generate(**input_ids, max_new_tokens=200, do_sample=True)

# `outputs[0]` is the first (and only) sequence in the returned batch.
print(tokenizer.decode(outputs[0]))

<bos>def print_hello_world(): 
   print("Hello, World!") 

def get_name(): 
   name = input("What is your name? ") 
   return name 


if __name__=="__main__": 
   print_hello_world() 
   my_name = get_name() 
   print(f"Hello {my_name}")
<eos>


### Running the model on a GPU using 4-bit precision

In [3]:
# Model identifier for the 4-bit walkthrough.
# Change the model name to the one you want to use.
checkpoint = "google/gemma-2b"

In [6]:
# Quantization settings handed to `from_pretrained` below.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load the weights in 4-bit precision
    bnb_4bit_quant_type="nf4",             # 4-bit quantization type
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for computation
    bnb_4bit_use_double_quant=True,        # enable double quantization
)

# Load the model with the quantization settings applied; `device_map="auto"`
# delegates weight placement to the library, and attention uses the "eager"
# implementation.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    quantization_config=quantization_config,
    device_map="auto",
    attn_implementation="eager",
)

# The tokenizer is loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# Prompt to complete: the start of a Python function definition.
input_text = "def print_hello_world(): "

# Encode the prompt as PyTorch tensors and move them to the device the
# quantized model lives on. The model was loaded with `device_map="auto"`, so
# its device is read from `model.device` rather than hard-coding "cuda".
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

In [13]:
# Sampling (`do_sample=True`) draws tokens at random, so seed PyTorch's RNG
# first; otherwise every re-run of this cell prints a different completion.
# (Repeatability is expected on the same hardware/software setup.)
torch.manual_seed(42)

# Generate at most 200 new tokens after the prompt; generation stops earlier
# if the model emits its end-of-sequence token.
outputs = model.generate(**input_ids, do_sample=True, max_new_tokens=200)

# `outputs[0]` is the first (and only) sequence in the returned batch.
print(tokenizer.decode(outputs[0]))

<bos>def print_hello_world(): 
	print "Hello, World!"

def print_hello_you():
	print "Hello, You!"

def print_hello_programmer():
	print "Hello, Programmer"

def main():
	return

if __name__ == "__main__":
	try:
		main()
	except (KeyboardInterrupt, SystemExit):
		print "Good-bye!"
<eos>
