In [1]:
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets


Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [9]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset
import os

# Load pre-trained model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Prepare dataset
def prepare_data(sentences, keywords):
    # Combine sentences and keywords for training
    data = [f"Keywords: {', '.join(kws)} Sentence: {sentence}" for kws, sentence in zip(keywords, sentences)]
    return data

sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "She sells seashells by the seashore.",
    "The rain in Spain stays mainly in the plain."
]
keywords = [
    ["quick", "brown", "fox"],
    ["sells", "seashells", "shore"],
    ["rain", "Spain", "plain"]
]

data = prepare_data(sentences, keywords)

# Convert data to a Dataset
dataset_dict = {"text": data}
dataset = Dataset.from_dict(dataset_dict)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

# Train the model
trainer.train()


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Step,Training Loss


TrainOutput(global_step=10, training_loss=3.1704217910766603, metrics={'train_runtime': 1.4861, 'train_samples_per_second': 20.187, 'train_steps_per_second': 6.729, 'total_flos': 1959690240000.0, 'train_loss': 3.1704217910766603, 'epoch': 10.0})

In [10]:
# Save the model and tokenizer after training
model.save_pretrained("./results")
tokenizer.save_pretrained("./results")


('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/vocab.json',
 './results/merges.txt',
 './results/added_tokens.json')

In [20]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model_name = "./results"  # Path to your saved model directory
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

def prepare_input(keywords):
    input_text = f"Keywords: {', '.join(keywords)} Sentence:"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    return input_ids

keywords = ["fox", "black", "dog"]
input_ids = prepare_input(keywords)

output = model.generate(
    input_ids,
    max_length=50,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    early_stopping=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Extract the generated sentence
prompt_length = len(tokenizer.decode(input_ids[0], skip_special_tokens=True))
generated_sentence = generated_text[prompt_length:]
print(generated_sentence.strip())


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The foxes are black and black. The dog is black."

The dog was black in color.
...
 (The black dog.)
,. The black fox is
