In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [2]:
# Initialize the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")



In [3]:
# Encoding the prompt to get the input ids
prompt = "Dear boss ..."
input_ids = tokenizer.encode(prompt, return_tensors="pt") # pt = pytorch
# input_ids
# Generate text using the model
output = model.generate(input_ids, max_length=100)
tokenizer.decode(output[0], skip_special_tokens=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


"Dear boss ... I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I"

In [4]:
# Simplified text generation function
def simple_text_generation(prompt, model, tokenizer, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors="pt") # pt = pytorch
    # Generate text using the model
    output = model.generate(input_ids, max_length=100)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [5]:
prompt = "Hello. How are you?"
text_generated = simple_text_generation(prompt,
                                        model,
                                        tokenizer,
                                        max_length=100)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [6]:
text_generated

"Hello. How are you?\n\nI'm fine. I'm fine.\n\nI'm fine.\n\nI'm fine.\n\nI'm fine.\n\nI'm fine.\n\nI'm fine.\n\nI'm fine.\n\nI'm fine.\n\nI'm fine.\n\nI'm fine.\n\nI'm fine.\n\nI'm fine.\n\nI'm fine.\n\nI'm fine.\n\nI'm fine."

In [42]:
documents = ["Let's think step by step",
"",
"## Persona",
"## Context",
"- The user has a PowerPoint template that they need to fill with specific information.",
"- The user needs guidance on optimal color matching for each paragraph and background.",
"- Your goal is to assist the user in creating a high-quality, engaging PowerPoint presentation based on the information they provide.",
"- You have extensive experience supporting others in similar situations.",
"",
"## Clarifications",
"1. Upon receiving the content to be adapted for the presentation, carefully **read and memorize** the information to accurately apply design elements.",
"2. Essential design elements include:",
"   - A **title**",
"   - One **short paragraph**",
"   - **Presenter notes**: Include examples to expand on ideas covered on the slide.",
]

In [43]:
# Tokenization
# All inputs must have the same length
# Add a dummy token at the end
# Having the same length => this is called padding

tokenizer.pad_token = tokenizer.eos_token


In [44]:
# tokeniz the data
tokenized_data = [tokenizer.encode_plus(
    sentence,
    add_special_tokens=True,
    return_tensors="pt",
    padding="max_length",
    max_length=100
) for sentence in documents]

tokenized_data[:2]

[{'input_ids': tensor([[ 5756,   338,   892,  2239,   416,  2239, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  

In [45]:
# Isolate the input IDs and the attention masks
input_ids = [item["input_ids"].squeeze() for item in tokenized_data]
attention_masks = [item["attention_mask"].squeeze() for item in tokenized_data]
len(input_ids[23])

IndexError: list index out of range

In [None]:
# Convert the input ids and attentions masks to tensors
# This step is necessary for processing the runed model.
input_ids = torch.stack(input_ids)
attention_masks = torch.stack(attention_masks)

In [46]:
input_ids

[tensor([ 5756,   338,   892,  2239,   416,  2239, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]),
 tensor([50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256,

In [47]:
# Padding all sequences to make sure they are the same length
padded_input_ids = pad_sequence(input_ids,
                                batch_first=True,
                                padding_value=tokenizer.eos_token_id)
padded_attention_mask = pad_sequence(attention_masks,
                                     batch_first=True,
                                     padding_value=0)

In [48]:
# Create a custom dataset class including databels
class TextDataset(Dataset):
    def __init__(self, input_ids, attention_masks):
        super().__init__()
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = input_ids.clone()

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_masks": self.attention_masks[idx],
            "labels": self.labels[idx]
        }

dataset = TextDataset(padded_input_ids, padded_attention_mask)

In [49]:
dataloader = DataLoader(dataset,
                        batch_size=2,
                        shuffle=True)

In [52]:
from tqdm.auto import tqdm
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.train()
model.to("cpu")
for epoch in tqdm(range(3)):
    for batch in dataloader:
        input_ids = batch["input_ids"].to("cpu")
        attention_mask = batch["attention_masks"].to("cpu")
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoh {epoch+1} and loss: {loss.item()} ")
        


  0%|          | 0/3 [00:00<?, ?it/s]

Epoh 1 and loss: 1.1432512998580933 
Epoh 2 and loss: 0.4897899031639099 
Epoh 3 and loss: 0.33285626769065857 


In [63]:
def generate_text(prompt, model, tokenizer, max_length=100):
    inputs = tokenizer.encode_plus(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    outputs = model.generate(input_ids, 
                             attention_mask=attention_mask, 
                             max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [64]:
prompt = "The user has a PowerPoint"
text_generated = generate_text(prompt, model, tokenizer, max_length=100)
print(text_generated)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


The user has a PowerPoint
