In [1]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from transformers import pipeline
from transformers import Trainer, TrainingArguments

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Load the poetry corpus and carve its "train" split into an 80/20
# train/test partition; the fixed seed keeps the partition repeatable.
poems = (
    load_dataset("suayptalha/Poetry-Foundation-Poems")["train"]
    .train_test_split(test_size=0.2, seed=42)
)

In [4]:
# Load the tokenizer that belongs to the "distilgpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# Reuse the end-of-sequence token as the padding token, so that the padded
# tokenization requested later (padding="max_length") has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Load the pretrained distilgpt2 causal language model and place it on the
# device selected earlier in the notebook.
model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(device)
model  # echo the module tree as the cell output

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
# Text-generation pipeline around the model in its current (not yet
# fine-tuned) state.
#
# Use the `device` chosen earlier instead of hard-coding GPU index 0, so this
# cell also runs on machines without CUDA.
#
# NOTE: the pipeline wraps the same `model` object that is later handed to
# Trainer, so the baseline sample must be generated before training runs.
gen_before_finetune = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

Device set to use cuda:0


In [9]:
# Baseline sample from the model before fine-tuning.
# Sampling (do_sample=True) draws from torch's RNG; seed it here so that
# re-running this cell, or the whole notebook, yields the same text.
torch.manual_seed(42)

prompt = "Write a short 5 line poem about rainfall:\n"
poem = gen_before_finetune(
    prompt,
    max_new_tokens=60,       # cap on generated tokens (prompt excluded)
    do_sample=True,          # sample instead of greedy decoding
    temperature=0.9,
    top_p=0.9,               # nucleus sampling threshold
    num_return_sequences=1,
)
# print (rather than a bare expression) so the newlines render as line breaks.
print(poem[0]["generated_text"])

Write a short 5 line poem about rainfall:
I would like to get to know your favourite story about the rain. What was it like in that song?
I think you are going to like the story about the rain.
So, what would you like to see you go to the rain.
My idea is to go to a place


In [10]:
def tokenize_function(examples, max_length=60):
    """Tokenize a batch of poems into fixed-length token sequences.

    Args:
        examples: Batch of dataset rows; its "Poem" entry is passed to the
            notebook-level `tokenizer`.
        max_length: Number of tokens every poem is truncated or padded to.
            Defaults to 60, the value that was previously hard-coded.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(
        examples["Poem"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )

In [11]:
# Tokenize both splits in batches. The original columns are dropped so that
# only the tokenizer's outputs remain in the resulting datasets.
raw_columns = poems["train"].column_names
tokenized_datasets = poems.map(
    tokenize_function,
    remove_columns=raw_columns,
    batched=True,
)

Map:   0%|          | 0/11083 [00:00<?, ? examples/s]

Map:   0%|          | 0/2771 [00:00<?, ? examples/s]

In [12]:
# Collator for causal language modeling (mlm=False disables the masked-LM
# objective).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# "test-trainer" is the output directory; evaluation runs at the end of every
# epoch. All other training hyperparameters are left at their defaults.
training_args = TrainingArguments(output_dir="test-trainer", eval_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Fine-tune `model`; the returned TrainOutput is the cell's output.
trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.7976,3.609795
2,3.5528,3.58698
3,3.4345,3.586308


TrainOutput(global_step=4158, training_loss=3.628817001863877, metrics={'train_runtime': 219.4928, 'train_samples_per_second': 151.481, 'train_steps_per_second': 18.944, 'total_flos': 509054042603520.0, 'train_loss': 3.628817001863877, 'epoch': 3.0})

In [13]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded with from_pretrained().
save_dir = "./distilgpt2-poems-finetuned"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./distilgpt2-poems-finetuned\\tokenizer_config.json',
 './distilgpt2-poems-finetuned\\special_tokens_map.json',
 './distilgpt2-poems-finetuned\\vocab.json',
 './distilgpt2-poems-finetuned\\merges.txt',
 './distilgpt2-poems-finetuned\\added_tokens.json',
 './distilgpt2-poems-finetuned\\tokenizer.json')

In [15]:
# Reload the fine-tuned checkpoint from disk and place it on the selected
# device.
fine_tuned_model = AutoModelForCausalLM.from_pretrained(
    "./distilgpt2-poems-finetuned"
).to(device)
fine_tuned_model  # echo the module tree as the cell output

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [16]:
# Text-generation pipeline around the fine-tuned checkpoint reloaded from disk.
#
# Use the `device` chosen earlier instead of hard-coding GPU index 0, so this
# cell also runs on machines without CUDA.
gen_after_finetune = pipeline(
    "text-generation",
    model=fine_tuned_model,
    tokenizer=tokenizer,
    device=device,
)

Device set to use cuda:0


In [17]:
# Sample from the fine-tuned model.
#
# The prompt and max_new_tokens match the pre-fine-tuning sample (single
# trailing newline, 60 new tokens) so that the two outputs differ only in the
# model that produced them.
#
# Sampling (do_sample=True) draws from torch's RNG; seed it here so that
# re-running this cell, or the whole notebook, yields the same text.
torch.manual_seed(42)

prompt = "Write a short 5 line poem about rainfall:\n"
poem = gen_after_finetune(
    prompt,
    max_new_tokens=60,       # cap on generated tokens (prompt excluded)
    do_sample=True,          # sample instead of greedy decoding
    temperature=0.9,
    top_p=0.9,               # nucleus sampling threshold
    num_return_sequences=1,
)
# print (rather than a bare expression) so the newlines render as line breaks.
print(poem[0]["generated_text"])

Write a short 5 line poem about rainfall:


a few dozen people who walk through the streets
and see the rain.
In the dark, one hears a sound,
and one hears the light.
One hears the wind.
