In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

W0901 16:42:26.167000 30564 Lib\site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [3]:
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)
model = model.to(device)

cuda:0


In [4]:
prompt = "Hello world, today is a very"
inputs = tokenizer(prompt, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[15496,   995,    11,  1909,   318,   257,   845]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}


In [5]:
outputs = model.generate(
    inputs.input_ids.to(device), ## move the input ids to the gpu 
    max_length=30, ## length of prompt + output tokens
    do_sample=True, # sample from a probability distribution (to add randomness)
    top_k=50, # consider the top 50 tokens
    top_p=0.9, # nucleus sampling -> consider the smallest set of tokens that have a cumulative probability of 0.9
    temperature=0.8 # sharpens or flattens the distribution (lower = more conservative, higher = more random). Done before the softmax
)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [6]:
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Hello world, today is a very long day, it's a very long day for me and my family. I'm sorry, but it's okay


### LoRA config

In [7]:
from peft import LoraConfig, get_peft_model

In [8]:
# LoRA (Low-Rank Adaptation) adds a few tiny trainable matrices (adapters) inside the model, while freezing the original weights.
lora_config = LoraConfig(
    r=8, # rank of LoRA matrices
    lora_alpha=32, # scaling
    target_modules=["c_attn"], # which layers get LoRA (attention projection for GPT2)
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# wrap model
model = get_peft_model(model, lora_config)

# check how many params are trainable
model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364




In [9]:
# Forward pass (just to test)
inputs = tokenizer("Hello world!", return_tensors="pt")
with torch.no_grad():
    outputs = model(inputs.input_ids.to(device))

print("Forward pass successful!")

Forward pass successful!
