In [1]:
'hellow'

'hellow'

In [21]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, BertTokenizer, BertForMaskedLM

def gpt2_generate_response(input_text):
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    output = model.generate(input_ids, max_length=15)
    generated_text = tokenizer.decode(output[-1], skip_special_tokens=True)
    return generated_text

def bert_generate_text(input_text):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForMaskedLM.from_pretrained("bert-base-uncased")
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    output = model.generate(input_ids, max_length=15, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text


user_input = 'outside of this shallow [MASK]?' # [MASK] --> dark
response = gpt2_generate_response(user_input)
print('response generated: ',response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


response generated:  outside of this shallow [MASK]?

I'm not sure


In [19]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
output = tokenizer.encode("I'm a little bit of a nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a", return_tensors="pt")
output

tensor([[   40,  1101,   257,  1310,  1643,   286,   257, 34712,    13,   314,
          1101,   257,  1263, 34712,    13,   314,  1101,   257,  1263, 34712,
            13,   314,  1101,   257,  1263, 34712,    13,   314,  1101,   257,
          1263, 34712,    13,   314,  1101,   257,  1263, 34712,    13,   314,
          1101,   257]])

In [16]:
tokenizer.decode(output[0], skip_special_tokens=True)

"I'm a little bit of a nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a"

In [23]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

def predict_masked_word(sentence: str):
    # Tokenize input sentence and find the index of the masked token
    input_ids = tokenizer.encode(sentence, return_tensors="pt") # masked value to some kind tensor integer
    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
    
    # Predict the masked token
    output = model(input_ids)
    
    # Get the logits for the masked token and find the top prediction
    mask_token_logits = output.logits[0, mask_token_index, :]
    top_token_id = torch.argmax(mask_token_logits, dim=-1)
    
    # Decode the predicted token ID back to a word
    predicted_token = tokenizer.decode(top_token_id)
    return predicted_token

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
sentence = "outside this shallow [MASK]?"
predicted_word = predict_masked_word(sentence)
print(f"Predicted word: {predicted_word}")

Predicted word: water


In [33]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

def gpt2_generate_response(input_text):
    
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    
    output = model.generate(input_ids, max_length=65, num_return_sequences=1, temperature=0.99)
    
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    
    return generated_text


In [34]:
input_text = "There was a time"
generated_text = gpt2_generate_response(input_text)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


There was a time when the world was still a little bit more open to the idea of a world where people could live in peace and harmony.

"But now, we have a world where people are not afraid to be themselves. We have a world where people are not afraid to be themselves. We have a world


In [41]:
# ---------------------- Fine Tuning ----------------------

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from torch.utils.data import DataLoader, Dataset

# Load pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.input_ids = []
        
        for text in texts:
            tokenized_text = tokenizer.encode(text, truncation=True, max_length=max_length)
            self.input_ids.append(tokenized_text) # [[74, 837,343,54342,45443,323,], [742, 37,33,542,4443,323,]]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return torch.tensor(self.input_ids[idx])

# Example input texts (split your data into smaller chunks if necessary)
texts = [
    "Your very long input text here. " * 100,  # Repeat to simulate a long text
    "Another example of a long input text." * 100
]

# Create dataset and dataloader
dataset = TextDataset(texts, tokenizer, 1024)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Fine-tuning parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
epochs = 3


In [42]:

# Fine-tuning loop
for epoch in range(epochs):
    model.train()
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch.squeeze(0)  # Remove batch dimension
        outputs = model(input_ids[:-1], labels=input_ids[1:])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        break
    print(f"Epoch {epoch+1}: Loss - {loss.item()}")
    break


Epoch 1: Loss - 8.416922569274902


In [43]:
# Save fine-tuned model
model.save_pretrained("fine_tuned_gpt2")
tokenizer.save_pretrained("fine_tuned_gpt2")

('fine_tuned_gpt2\\tokenizer_config.json',
 'fine_tuned_gpt2\\special_tokens_map.json',
 'fine_tuned_gpt2\\vocab.json',
 'fine_tuned_gpt2\\merges.txt',
 'fine_tuned_gpt2\\added_tokens.json')

In [44]:
finetuned_model = GPT2LMHeadModel.from_pretrained("./fine_tuned_gpt2")
finetuned_tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_gpt2")

input_text = "There was a time"
input_ids = finetuned_tokenizer.encode(input_text, return_tensors='pt')
output = finetuned_model.generate(input_ids, max_length=50, num_return_sequences=1)

print(finetuned_tokenizer.decode(output[0], skip_special_tokens=True))

# how you can dedply a gen_ai app on huggingface.co

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


There was a time when the world was still a little bit more open to the idea of a world where people could live in peace and harmony.

"But now, we have a world where people are not afraid to be themselves. We have


In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# # Load pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# device = torch.device("cuda")
model

  from .autonotebook import tqdm as notebook_tqdm


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [8]:
import torch
block_size = 8
batch_size = 8

with open('shakepeare_s_plays.txt', 'r') as f:
  input_text = f.read()

texts = input_text.split('\n')
dataset = tokenizer.encode(input_text, return_tensors='pt')

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [5]:
def batches():
    indexes = torch.randint(len(dataset[0]) - block_size, (batch_size, )) # [1000111:1000111+block_size, 9:9+block_size, 1823789127, 13912]
    x = torch.stack([dataset[0][i:i+block_size] for i in indexes])
    y = torch.stack([dataset[0][i+1:i+block_size+1] for i in indexes]) # [1000112:1000112+block_size, 10:10+block_size, 1823789127, 13912]
    # x = x.to(device)
    # y = y.to(device)
    return x, y

x, y = batches()

In [9]:
epochs = 3
# Fine-tuning loop 
for epoch in range(epochs):
    model.train()
    # print(dataloader)
    for batch in range(batch_size):
        optimizer.zero_grad()
        x, y = batches() # Remove batch dimension
        # print(input_ids)
        outputs = model(x, labels=y)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}: Loss - {loss.item()}")

Epoch 1: Loss - 7.863354206085205
Epoch 2: Loss - 7.564795017242432
Epoch 3: Loss - 7.337238311767578


In [10]:
# Save fine-tuned model
model.save_pretrained("fine_tuned_gpt2_")
tokenizer.save_pretrained("fine_tuned_gpt2_")

('fine_tuned_gpt2_\\tokenizer_config.json',
 'fine_tuned_gpt2_\\special_tokens_map.json',
 'fine_tuned_gpt2_\\vocab.json',
 'fine_tuned_gpt2_\\merges.txt',
 'fine_tuned_gpt2_\\added_tokens.json')

In [11]:
finetuned_model = GPT2LMHeadModel.from_pretrained("./fine_tuned_gpt2_")
finetuned_tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_gpt2_")

In [13]:
input_text = "There was a time"
input_ids = finetuned_tokenizer.encode(input_text, return_tensors='pt')
output = finetuned_model.generate(input_ids, max_length=20, num_return_sequences=1)

print(finetuned_tokenizer.decode(output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


There was a time when I was a little bit of a kid, and I was a little bit
