In [34]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


In [35]:
import csv

# Load the custom vocabulary from the CSV file
custom_vocab = []
with open('data/custom_vocab.csv', 'r') as file:
    reader = csv.reader(file)
    next(reader)  # Skip the header
    for row in reader:
        custom_vocab.append(row[0])

# Print the custom vocabulary to verify
# print(custom_vocab)

In [36]:
# # Add custom tokens to the tokenizer
tokenizer.add_tokens(custom_vocab)

# Add a padding token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Resize the model embeddings to match the new tokenizer size
model.resize_token_embeddings(len(tokenizer))


Embedding(50324, 768)

In [37]:

"""
# Define the dataset (this is just an example; use your actual data here)
train_texts = ["hello world", "my custom vocabulary is cool"]

# Encode the training data with padding
train_encodings = tokenizer(train_texts, return_tensors='pt', padding=True, truncation=True)
"""
import pandas as pd

# Load the CSV file using pandas
df = pd.read_csv('data/dsl_training.csv')

# Extract the input and output columns
train_texts = df['input'].tolist()
train_labels = df['output'].tolist()


In [38]:
"""
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Define the dataset (this is just an example; use your actual data here)
train_texts = ["hello world", "my custom vocabulary is cool"]

# Encode the training data
train_encodings = tokenizer(train_texts, return_tensors='pt', padding=True, truncation=True)

# Shift the labels to the right to create the labels tensor
labels = train_encodings['input_ids'].clone()
labels[labels == tokenizer.pad_token_id] = -100  # Ignore padding tokens in loss computation

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item
    
    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = CustomDataset(train_encodings, labels)
"""

# # Add padding token if necessary
# if tokenizer.pad_token is None:
#     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#     model.resize_token_embeddings(len(tokenizer))


import torch
from transformers import DataCollatorForLanguageModeling
# Encode the training data with padding
train_encodings = tokenizer(train_texts, return_tensors='pt', padding=True, truncation=True)

# Encode the labels
label_encodings = tokenizer(train_labels, return_tensors='pt', padding=True, truncation=True)

# Shift the labels to the right to create the labels tensor
labels = label_encodings['input_ids'].clone()
labels[labels == tokenizer.pad_token_id] = -100  # Ignore padding tokens in loss computation

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item
    
    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = CustomDataset(train_encodings, labels)

# Define data collator to handle padding dynamically
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We are not doing masked language modeling
)



In [39]:
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [40]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)


In [41]:
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./models/custom_vocab_model')
tokenizer.save_pretrained('./models/custom_vocab_tokenizer')



  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100%|██████████| 369/369 [09:59<00:00,  1.62s/it]


{'train_runtime': 599.0559, 'train_samples_per_second': 1.227, 'train_steps_per_second': 0.616, 'train_loss': 4.3097095851329605, 'epoch': 3.0}


('./models/custom_vocab_tokenizer\\tokenizer_config.json',
 './models/custom_vocab_tokenizer\\special_tokens_map.json',
 './models/custom_vocab_tokenizer\\vocab.json',
 './models/custom_vocab_tokenizer\\merges.txt',
 './models/custom_vocab_tokenizer\\added_tokens.json')

In [44]:
# Define some test data
test_texts = ["hello my world", "vocabulary is custom"]

# Encode the test data
test_encodings = tokenizer(test_texts, return_tensors='pt', padding=True, truncation=True)

# Use the trainer to evaluate the model
results = trainer.evaluate(eval_dataset=CustomDataset(test_encodings, test_encodings['input_ids']))
print(results)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100%|██████████| 1/1 [00:00<00:00, 1000.31it/s]

{'eval_loss': 9.080677032470703, 'eval_runtime': 0.1092, 'eval_samples_per_second': 18.322, 'eval_steps_per_second': 9.161, 'epoch': 3.0}





In [49]:
# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('./models/custom_vocab_model')
tokenizer = GPT2Tokenizer.from_pretrained('./models/custom_vocab_tokenizer')

# Generate text
input_text = "trash"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

output = model.generate(input_ids, max_length=50)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


trash the  car  and clean the  bathroom   floor  and polish the  floor  and polish the  floor   table  and polish the  floor   table  and polish the  floor   floor  and polish the  floor   floor  and polish the  floor


In [50]:
# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('./models/custom_vocab_model')
tokenizer = GPT2Tokenizer.from_pretrained('./models/custom_vocab_tokenizer')


def generate_custom_text(input_text, tokenizer, model, custom_tokens):
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    
    # Generate tokens with the model
    output_ids = model.generate(input_ids, max_length=50)
    
    # Decode the generated tokens
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    # Ensure the generated text uses only custom vocabulary
    tokens = generated_text.split()
    filtered_tokens = [token for token in tokens if token in custom_tokens]
    
    # Join filtered tokens to form the final output
    final_output = ' '.join(filtered_tokens)
    return final_output

# Define some test data
test_texts = ["Empty the trash", "Clean the kitchen sink"]

# Test the model with the custom generation function
for text in test_texts:
    generated_text = generate_custom_text(text, tokenizer, model, custom_vocab)
    print(f"Input: {text}\nOutput: {generated_text}\n")



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Empty the trash
Output: sink floor table table table table table table table table table table table table table table

Input: Clean the kitchen sink
Output: kitchen sink sink sink sink floor sink floor carpet carpet floor carpet carpet floor

