In [1]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 55.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 63.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [3]:
# import all of the python modules/packages you'll need here
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

# ...

books= pd.read_csv('books_clean.csv')
books.head(5)

Unnamed: 0,title,author,description,language
0,Harry Potter and the Half-Blood Prince,J.K. Rowling,The war against Voldemort is not going well; e...,English
1,Harry Potter and the Order of the Phoenix,"J.K. Rowling,Mary GrandPré",There is a door at the end of a silent corrido...,English
2,Harry Potter and the Sorcerer's Stone,"J.K. Rowling,Mary GrandPré",Harry Potter's life is miserable. His parents ...,English
3,Harry Potter and the Chamber of Secrets,J.K. Rowling,The Dursleys were so mean and hideous that sum...,English
4,Harry Potter and the Prisoner of Azkaban,"J.K. Rowling,Mary GrandPré","For twelve long years, the dread fortress of A...",English


In [4]:
MODEL_NAME = 'gpt2-medium'

# Download the pre-trained GPT-Neo model's tokenizer
# Add the custom tokens denoting the beginning and the end 
# of the sequence and a special token for padding
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
# Download the pre-trained GPT-Neo model and transfer it to the GPU
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
# Resize the token embeddings because we've just added 3 new tokens 
model.resize_token_embeddings(len(tokenizer))

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Embedding(50259, 1024)

In [5]:
books['tokenCount'] = books['description'].str.len()
books['tokenCount2'] = books['description'].str.split().map(lambda x: len(x))
print(max(books['tokenCount2']))

2000


In [6]:
books = books[books['tokenCount2'] <= 150]
len(books)

6731

In [7]:
print(max(books['tokenCount']))

1184


In [8]:
descriptions = books['description']
descriptions.head()

6     Box Set containing Harry Potter and the Sorcer...
14    Don't leave Earth without this hilarious inter...
15    Many are familiar with Douglas Adams's classic...
19    A revised and updated edition of a humorous pr...
25    With dazzling wit and astonishing insight, Bil...
Name: description, dtype: object

In [9]:
max_length = max([len(tokenizer.encode(description)) for description in descriptions])

In [10]:
max_length

792

In [11]:
class DescriptionDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=512, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]
    

In [12]:
dataset = DescriptionDataset(descriptions, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [13]:
train_dataset[0]

(tensor([50257, 39507,  7079, 46771,    11, 14489,    13,   383,  1846,  3780,
          3776,    25,  7683, 38116,   329, 21516,    13, 12556,   670,   416,
         12930,  1982,    36,  8149,    13, 50256, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 5

In [14]:
import gc
gc.collect()

47

In [15]:
training_args = TrainingArguments(output_dir='/results',
                                  num_train_epochs=1,
                                  logging_steps=100,
                                  save_steps=5000,                                   
                                  per_device_train_batch_size=1,
                                  per_device_eval_batch_size=1,
                                  warmup_steps=10,
                                  weight_decay=0.05,  
                                  logging_dir='/logs',
                                  report_to = 'none')

In [16]:
trainer = Trainer(model=model, args=training_args,  
                  train_dataset=train_dataset,
                  eval_dataset=val_dataset, 
                  # This custom collate function is necessary 
                  # to built batches of data
                  data_collator=lambda data: 
                  {'input_ids': torch.stack([f[0] for f in data]),       
                  'attention_mask': torch.stack([f[1] for f in data]),
                  'labels': torch.stack([f[0] for f in data])})

In [17]:
# Start training process!
trainer.train()

***** Running training *****
  Num examples = 6057
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 6057
  Number of trainable parameters = 354825216


Step,Training Loss
100,2.2078
200,0.5606
300,0.5199
400,0.5156
500,0.5081
600,0.5251
700,0.5099
800,0.4886
900,0.5586
1000,0.6061


Saving model checkpoint to /results/checkpoint-5000
Configuration saved in /results/checkpoint-5000/config.json
Model weights saved in /results/checkpoint-5000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=6057, training_loss=0.5333825958942213, metrics={'train_runtime': 3561.2552, 'train_samples_per_second': 1.701, 'train_steps_per_second': 1.701, 'total_flos': 5625140106756096.0, 'train_loss': 0.5333825958942213, 'epoch': 1.0})

In [18]:
model.save_pretrained("/models")

Configuration saved in /models/part2_description_generation/config.json
Model weights saved in /models/part2_description_generation/pytorch_model.bin


In [19]:

tokenizer.save_pretrained("/models")

tokenizer config file saved in /models/part2_description_generation/tokenizer_config.json
Special tokens file saved in /models/part2_description_generation/special_tokens_map.json
added tokens file saved in /models/part2_description_generation/added_tokens.json


('/models/part2_description_generation/tokenizer_config.json',
 '/models/part2_description_generation/special_tokens_map.json',
 '/models/part2_description_generation/vocab.json',
 '/models/part2_description_generation/merges.txt',
 '/models/part2_description_generation/added_tokens.json')

In [21]:
tokenizer = GPT2Tokenizer.from_pretrained("/models/part2_description_generation")
model = GPT2LMHeadModel.from_pretrained("/models/part2_description_generation")

loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
Adding <|startoftext|> to the vocabulary
Adding <|pad|> to the vocabulary
loading configuration file /models/part2_description_generation/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-medium",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "predict_special_tokens": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_lab

In [22]:
generated = tokenizer("<|startoftext|> Harry Potter and the Half-Blood Prince", return_tensors="pt").input_ids
sample_outputs = model.generate(generated, do_sample=True, top_k=50, max_length=100, top_p=0.95, temperature=1.5, num_return_sequences=2)
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  Harry Potter and the Half-Blood Prince. One of the masterpieces of nonlinear storytelling and one of George R.R. Martin's absolute greatest works of fiction. This work by two leading Scottish novelist/political thinkers is a gripping tale from the mind of two masterminds whose visions lead them on a journey. This edition combines together eight original and alternate-language entries and a brand-new new page, Harry Potter and the Half-blood Prince. One of the masterpieces of non
1:  Harry Potter and the Half-Blood Prince is the best-loved fantasy ever written.,The story of one man's magical journey through the magical land of Rivellon, where it is threatened on three sides by demons, wizards, and fairies in each chapter of their tale. Full color.,Full color.
