In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, BertTokenizer, BertForMaskedLM

def gpt2_generate_response(input_text):
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    output = model.generate(input_ids, max_length=100, num_return_sequences=1)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

def bert_generate_text(input_text):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForMaskedLM.from_pretrained("bert-base-uncased")
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    output = model.generate(input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text


user_input = 'Hello, how are you?'
response = bert_generate_text(user_input)
print(response)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


hello, how are you? hello? hello? hello? hello? hello? hello? hello? hello? hello? hello? hello? hello? hello? hello? hello? hello? hello? hello? hello? hello! hello! hello! hello! hello! hello!!!!!!!!! hello!!! hello! hello!!!!!!!!!!!!!!! hello!!!!!!!!!!!


In [6]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

def predict_masked_word(sentence):
    # Load pre-trained BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForMaskedLM.from_pretrained("bert-base-uncased")
    
    # Tokenize input sentence and find the index of the masked token
    input_ids = tokenizer.encode(sentence, return_tensors="pt")
    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
    
    # Predict the masked token
    with torch.no_grad():
        output = model(input_ids)
    
    # Get the logits for the masked token and find the top prediction
    mask_token_logits = output.logits[0, mask_token_index, :]
    top_token_id = torch.argmax(mask_token_logits, dim=-1)
    
    # Decode the predicted token ID back to a word
    predicted_token = tokenizer.decode(top_token_id)
    return predicted_token

sentence = "Once upon a time, there was person [MASK]?"
predicted_word = predict_masked_word(sentence)
print(f"Predicted word: {predicted_word}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Predicted word: ##hood


In [7]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def gpt2_generate_response(input_text):
    
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    
    output = model.generate(input_ids, max_length=65, num_return_sequences=1)
    
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    
    return generated_text

input_text = "There was a time"
generated_text = gpt2_generate_response(input_text)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


There was a time when the world was still a little bit more open to the idea of a world where people could live in peace and harmony.

"But now, we have a world where people are not afraid to be themselves. We have a world where people are not afraid to be themselves. We have a world


In [9]:
# ---------------------- Fine Tuning ----------------------

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from torch.utils.data import DataLoader, Dataset

# Load pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=1024):
        self.tokenizer = tokenizer
        self.input_ids = []
        
        for text in texts:
            tokenized_text = tokenizer.encode(text, truncation=True, max_length=max_length)
            self.input_ids.append(tokenized_text)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return torch.tensor(self.input_ids[idx])

# Example input texts (split your data into smaller chunks if necessary)
texts = [
    "Your very long input text here. " * 100,  # Repeat to simulate a long text
    "Another example of a long input text." * 100
]

# Create dataset and dataloader
dataset = TextDataset(texts, tokenizer)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Fine-tuning parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
epochs = 3

# Fine-tuning loop
for epoch in range(epochs):
    model.train()
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch.squeeze(0)  # Remove batch dimension
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}: Loss - {loss.item()}")

# Save fine-tuned model
model.save_pretrained("fine_tuned_gpt2")
tokenizer.save_pretrained("fine_tuned_gpt2")


In [10]:
finetuned_model = GPT2LMHeadModel.from_pretrained("./fine_tuned_gpt2")
finetuned_tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_gpt2")

input_text = "There was a time"
input_ids = finetuned_tokenizer.encode(input_text, return_tensors='pt')
output = finetuned_model.generate(input_ids, max_length=50, num_return_sequences=1)

print(finetuned_tokenizer.decode(output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


There was a time when I was a little bit of a nerd. I was a little bit of a nerd. I was a little bit of a nerd. I was a little bit of a nerd. I was a little bit of a nerd.


In [27]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from torch.utils.data import DataLoader, Dataset

# # Load pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# tokenizer = AutoTokenizer.from_pretrained("cerebras/Cerebras-GPT-1.3B")
# model = AutoModelForCausalLM.from_pretrained("cerebras/Cerebras-GPT-1.3B")

# device = torch.device("cuda")
model


In [19]:
block_size = 8
batch_size = 8

with open('shakepeare_s_plays.txt', 'r') as f:
  input_text = f.read()

texts = input_text.split('\n')
dataset = tokenizer.encode(input_text, return_tensors='pt')

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)


tensor([[[  220,   220,   220,  2345,   437,   407,    11, 15967,    26,   329,
            356,   389, 28527]]])
tensor([[[  220,   220,   220,  1114,   790, 15393,  5586,   319,   465, 18030,
             11]]])
tensor([[[  220,   220,   220, 38068,  1479,  1243,   290,  3772,  2523,  2157,
             25]]])
tensor([], size=(1, 1, 0))


RuntimeError: cannot reshape tensor of 0 elements into shape [-1, 0] because the unspecified dimension size -1 can be any value and is ambiguous

In [None]:
def batches():
    # input_text = train if split == 'train' else test
    input_text = dataset[0]
    indexes = torch.randint(len(dataset[0]) - block_size, (batch_size, ))
    # print(indexes)
    # dataset[0][156933:156933+8]
    x = torch.stack([dataset[0][i:i+block_size] for i in indexes])
    y = torch.stack([dataset[0][i+1:i+block_size+1] for i in indexes])
        # dataset[0][4020066: 4020066+block]
    # x = x.to(device)
    # y = y.to(device)
    return x, y
# tensor*[9102830912309]
x , y = batches()

In [None]:
epochs = 3
# Fine-tuning loop 
for epoch in range(epochs):
    model.train()
    # print(dataloader)
    for batch in range(batch_size):
        optimizer.zero_grad()
        input_ids, y = batches() # Remove batch dimension
        # print(input_ids)
        outputs = model(input_ids, labels=y)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}: Loss - {loss.item()}")

# Save fine-tuned model
model.save_pretrained("fine_tuned_gpt2")


In [None]:
# Save fine-tuned model
model.save_pretrained("fine_tuned_gpt2")
tokenizer.save_pretrained("fine_tuned_gpt2")

In [None]:
finetuned_model = GPT2LMHeadModel.from_pretrained("./fine_tuned_gpt2")
finetuned_tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_gpt2")

input_text = "There was a time"
input_ids = finetuned_tokenizer.encode(input_text, return_tensors='pt')
output = finetuned_model.generate(input_ids, max_length=50, num_return_sequences=1)

print(finetuned_tokenizer.decode(output[0], skip_special_tokens=True))