# GPT-2 Fine-Tuning




# Setup

In [2]:
!pip install transformers

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/b6/4d/fbe6d89fde59d8107f0a02816c4ac4542a8f9a85559fdf33c68282affcc1/transformers-4.38.2-py3-none-any.whl.metadata
  Using cached transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Obtaining dependency information for huggingface-hub<1.0,>=0.19.3 from https://files.pythonhosted.org/packages/ab/28/d4b691840d73126d4c9845f8a22dad033ac872509b6d3a0d93b456eef424/huggingface_hub-0.21.4-py3-none-any.whl.metadata
  Using cached huggingface_hub-0.21.4-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Obtaining dependency information for regex!=2019.12.17 from https://files.pythonhosted.org/packages/eb/10/4ccc8eed80f11c082a2883d49d4090aa80c7f65704216a529f490cb089b1/regex-2023.12.25-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached regex-2023.12.25-cp

In [1]:
# Importing libraries
import os
import time
import datetime
import pandas as pd
import numpy as np
import random

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
torch.manual_seed(42)

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Create Training Set

The data used to finetune the language model is a set of around 1000 DJ biographies, with the aim of generating them in the same general format and style.

This data isn't public so if you want to use this script, you'll have to source your own training set.

In [2]:
# path of the directory
dir_path = os.getcwd()
print(dir_path)

/home/studio-lab-user/sagemaker-studiolab-notebooks


In [3]:
# Reading the new training data file
data_gutenberg_full = pd.read_csv(dir_path+'/books_and_genres_eng.csv')

In [4]:
# Examining the larger set of books
print(f"Columns in dataset are: {data_gutenberg_full.columns.to_list()}")
print(f"Number of books: {len(data_gutenberg_full['title'].unique())}")
print("Let's look at first 5 random rows of the dataset")
print(data_gutenberg_full.head(5))

Columns in dataset are: ['Unnamed: 0', 'title', 'text', 'genres', 'language_code']
Number of books: 232
Let's look at first 5 random rows of the dataset
   Unnamed: 0                    title  \
0          31  the power and the glory   
1          32                 paradise   
2          88                  sonnets   
3          88                  sonnets   
4         147               persuasion   

                                                text  \
0  Produced by Juliet Sutherland, Sjaani and PG D...   
1  Produced by Judith Smith and Natalie Salter\n\...   
2  Produced by Paul Murray, Rénald Lévesque and t...   
3  Produced by Paul Murray, Rénald Lévesque and t...   
4  Produced by Sharon Partridge and Martin Ward. ...   

                                              genres language_code  
0  {'literary-fiction', 'christian', 'history', '...           eng  
1  {'literary-fiction', 'mythology', 'historical-...           eng  
2  {'read-for-school', 'poetry', '20th-century', .

In [5]:
# Only retain the text and genre from the dataset
gutenberg_full = data_gutenberg_full.drop(columns=['Unnamed: 0','title'])
print(data_gutenberg_full)

     Unnamed: 0                    title  \
0            31  the power and the glory   
1            32                 paradise   
2            88                  sonnets   
3            88                  sonnets   
4           147               persuasion   
..          ...                      ...   
437       10126          a modern utopia   
438       10136    the mill on the floss   
439       10136    the mill on the floss   
440       10212           piccadilly jim   
441       10363                sanctuary   

                                                  text  \
0    Produced by Juliet Sutherland, Sjaani and PG D...   
1    Produced by Judith Smith and Natalie Salter\n\...   
2    Produced by Paul Murray, Rénald Lévesque and t...   
3    Produced by Paul Murray, Rénald Lévesque and t...   
4    Produced by Sharon Partridge and Martin Ward. ...   
..                                                 ...   
437  Produced by Andrew Sly\n\n\n\n\n\n\nA MODERN U...   
438  Pr

In [6]:
# Pre-processing text
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
def clean_text(text):
    '''
    Function to clean text of books. Removes email addresses, new lines, html tags, and extra spaces.

    Input: Text (String)
    Output: Cleaned Text (String)
    '''
    cleaned_text = text.lower()
    cleaned_text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', ' ', text)
    cleaned_text = re.sub(r'^.*?(?=\n\n\n)', ' ', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'<a\s+(?:[^>]*?\s+)?href="([^"]*)"[^>]*>.*?</a>', ' ', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'\n', ' ', cleaned_text)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    cleaned_text = re.sub(r'[^\w\s.?!]', ' ', cleaned_text)
    cleaned_text = re.sub(r' +', ' ', cleaned_text)
    cleaned_text = cleaned_text.lower()
    return cleaned_text

def remove_first_row(group):
    return group.iloc[1:]

[nltk_data] Downloading package stopwords to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Applying the cleaning functions
gutenberg_full['cleaned_text'] = gutenberg_full['text'].apply(lambda x:clean_text(x))
gutenberg_full.loc[:,'genres'] = gutenberg_full.loc[:,'genres'].apply(lambda x:clean_text(x))
gutenberg_full.head(5)

Unnamed: 0,text,genres,language_code,cleaned_text
0,"Produced by Juliet Sutherland, Sjaani and PG D...",literary fiction christian history classics r...,eng,the power and the glory by grace macgowan coo...
1,Produced by Judith Smith and Natalie Salter\n\...,literary fiction mythology historical fiction...,eng,the vision of hell purgatory and paradise by ...
2,"Produced by Paul Murray, Rénald Lévesque and t...",read for school poetry th century reference,eng,note du transcripteur. ce document est tiré d...
3,"Produced by Paul Murray, Rénald Lévesque and t...",read for school poetry th century reference,eng,note du transcripteur. ce document est tiré d...
4,Produced by Sharon Partridge and Martin Ward. ...,romance literary fiction classics historical ...,eng,persuasion by jane austen chapter sir walter ...


# GPT2 Tokenizer

In [8]:
# Load the GPT tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') #gpt2-medium


# Building the GPT2Dataset 


In [9]:
class GPT2Dataset(Dataset):

  def __init__(self, dataframe, tokenizer, gpt2_type="gpt2", max_length=768):

    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []
    genres = dataframe['genres'].tolist()
    text = dataframe['cleaned_text'].tolist()
    for idx, textbook in enumerate(text):
      encodings_dict = tokenizer(genres[idx]+':'+ '<|startoftext|>'+ textbook + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")

      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.attn_masks[idx]

In [10]:
# Building the custom dataset class for DataLoader that incorporates genre and text
class GPT2DatasetCustom(Dataset):

    def __init__(self, df, tokenizer, max_length=768):

        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        for _, row in df.iterrows():
            genres = row['genres']
            cleaned_text = row['cleaned_text']
            tokenized_segments = self.tokenize_with_genre(genres, cleaned_text, max_length)
            for segment in tokenized_segments:
                self.input_ids.append(torch.tensor(segment['input_ids']))
                self.attn_masks.append(torch.tensor(segment['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx] 

    def tokenize_with_genre(self, genres, cleaned_text, max_length):
        genre_tokens = self.tokenizer(genres)['input_ids']
        #print(genre_tokens)
        genre_length = len(genre_tokens)
        text_length = max_length - genre_length  # Remaining length for text tokens

        # Initialize list to store tokenized segments
        combined_tokens = []
        attention_masks = []

        # Tokenize cleaned_text into chunks of length text_length
        # NOTE: Contains a bunch of print statements for debugging purposes; remove before running
        for i in range(0, len(cleaned_text), text_length):
            #print(len(cleaned_text))
            # Get a chunk of text
            chunk = cleaned_text[i:i + text_length]
            #print(chunk)
            #print(len(chunk))

            # Tokenize the chunk
            text_tokens = self.tokenizer(chunk, truncation=True, max_length=text_length, padding="max_length")['input_ids']
            #print(text_tokens)

            # Added for debugging purposes- remove from final code
            #decoded_text = self.tokenizer.decode(text_tokens, skip_special_tokens=True)
            #print("Decoded text:", decoded_text)

            # Combine genre tokens with text tokens
            segment_tokens = genre_tokens + text_tokens

            # Create attention mask
            # Identify padding tokens and set attention to 0 for those tokens
            padding_mask = [0 if token == self.tokenizer.pad_token_id else 1 for token in segment_tokens]

            # Pad the attention mask to ensure it has the same length as segment_tokens
            # padding_mask += [0] * (max_length - len(padding_mask))

            # Add the combined tokens and attention mask to the lists
            combined_tokens.append(segment_tokens)
            attention_masks.append(padding_mask)

        # print to check lengths
        #print(len(combined_tokens))
        #print(len(attention_masks))

        # COMMENTING THIS OUT TO GET 1 tensor per row
        # Pad the segments to ensure they all have the same length
        #max_segment_length = max(map(len, combined_tokens))
        #combined_tokens = [segment + [self.tokenizer.pad_token_id] * (max_segment_length - len(segment)) for segment in combined_tokens]

        # Convert lists to tensors
        #input_ids = torch.tensor(combined_tokens)
        #attn_masks = torch.tensor(attention_masks)


        # Return a list of dictionaries containing input_ids and attention_mask for each segment
        tokenized_segments = [{'input_ids': segment, 'attention_mask': mask} for segment, mask in zip(combined_tokens, attention_masks)]
        #print(tokenized_segments)
        return tokenized_segments

In [11]:
# Creating the full dataset
dataset_full = GPT2Dataset(gutenberg_full, tokenizer, max_length=768)

# Split into training and validation sets
train_size = int(0.9 * len(dataset_full))
val_size = len(dataset_full) - train_size

train_dataset, val_dataset = random_split(dataset_full, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

  397 training samples
   45 validation samples


In [13]:
# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order.
batch_size = 2
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

# Finetune GPT2 Language Model

In [14]:
# Using config from GPT2
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# this step is necessary ensure embeddings and model tokenizer are aligned
model.resize_token_embeddings(len(tokenizer))

# Tell pytorch to run this model on the GPU.
device = torch.device("cuda")
model.cuda()

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [15]:
# Hyperparameters from tutorials: optimize if time available
epochs = 10
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

# Defining step size and optimizer
sample_every = 100
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon)

In [16]:
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = warmup_steps,
                                            num_training_steps = total_steps)

In [17]:
# Format time
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [18]:
# Running the model
total_t0 = time.time()
training_stats = []
model = model.to(device)
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)
        model.zero_grad()
        outputs = model(b_input_ids,
                          labels=b_labels,
                          attention_mask = b_masks,
                          token_type_ids=None)

        loss = outputs[0]
        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get sample every x batches.
        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval()

            sample_outputs = model.generate(
                                    bos_token_id=random.randint(1,30000),
                                    do_sample=True,
                                    top_k=50,
                                    max_length = 200,
                                    top_p=0.95,
                                    num_return_sequences=1
                                )
            for i, sample_output in enumerate(sample_outputs):
                  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

            model.train()

        loss.backward()
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():

            outputs  = model(b_input_ids,
                             attention_mask = b_masks,
                            labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   100  of    199. Loss: 3.8064138889312744.   Elapsed: 0:00:47.
0:  bipartisan, anti-government and anti- law enmeshed in the history of the United States. http_ of them. http. no_of them. http. no_of them. http. no_in_ondon. http. no_in_ondon as_ american_nebulous_us. http. no_in_ondon as_ americans_nato. http_russian. http. no_in_ondon as_ american_nebulous_us. http. no_on_n. http. no_on_n. http. the_most_random_comer. http. no_on_n. http _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

  Average training loss: 9.15
  Training epoch took: 0:01:35

Running Validation...
  Validation Loss: 5.85
  Validation took: 0:00:03

Training...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   100  of    199. Loss: 3.7533438205718994.   Elapsed: 0:00:47.
0:  increasing the old one. to give me this opportunity of giving a very high ground on which to meet these young people. of course you won all three of them have never had the most difficult task of mine and it is that the most good of them but for a child. we are so long and so sure that they can. they have given us many opportunities for the most excellent use of these children with no disadvantage that it could be used the children. you are very sure that there is something to be said by the child as well as it should be. it is not difficult to learn how we shall go with such children. at the same time the whole time that children should be taught. for the good thing to do it they ought to be given. they must be very well trained to be able to teach their children a whole course of the the whole class of principles and to help them. the whole class are being taught by the guidance of the students of the teacher

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   100  of    199. Loss: 3.2237935066223145.   Elapsed: 0:00:48.
0: day and to core of a dactylons of the original by this and of the great great by the dum of the original mr. s. and the great mrs. buchler brothers and sisters of the second book at the end of the first century at their christian villa and at last the year after christian and of the great mr. buchler brothers. the book was sold at random by the chancery of the german police guilds and was delivered in two volumes to the mr. buchler brothers who wrote the first edition and by the chancery of the german police guilds. the second edition of the great mr. buchler s work was published at the close of the nineteenth century but was unfinished at the end of the nineteenth century. it was not until the beginning of the century that it was again collected. the second edition

  Average training loss: 3.49
  Training epoch took: 0:01:36

Running Validation...
  Validation Loss: 3.37
  Validation took: 0:00:03

Training...

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   100  of    199. Loss: 2.7897400856018066.   Elapsed: 0:00:47.
0:  Hang a place where you can best meet. you might get a chance to meet them on your own terms. you might get a glimpse at what will be the next chapter. but most of the people you know of don t understand why you need a place at the university to study a new science. you will not be able to afford it to come your own way. you will have a hard time figuring out how to go about it because there are already so many scientific problems in the world you need to deal with. you will see one problem in the next chapter. the professor won t tell you what you should do. tell him. and he will get tired. you are going to have to start anew in your own way. how do you do? a lot better than what you were used to. and what does he want you to do? to make up his mind? he is too busy. if you try teaching at university you will be making him waste a week and a day

  Average training loss: 2.93
  Training epoch took: 0:01:36

Runn

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   100  of    199. Loss: 3.57405161857605.   Elapsed: 0:00:47.
0:  foods book with illustrations by charles dickens contents chapter i. a. p.c. ii. the stockholm stockholm chapter ii. a. p.c. iii. on stampegowan st. petersburg chapter iii. with which the title page of the book has been added chapter iv. in which the discussion of stockholm appears on page vii. the stockholm stockholm chapter viii. a. p.c. ix. stockholm stockholm chapter x. the stockholm stockholm chapter xi. an introduction to the book. a. p.c. ii. the stockholm stockholm is the most complete book on the part of the great publisher to the best of authorities in the field who are interested in reading this book and whose opinions on the subject matter are expressed in this manner. chapter i. a. p.c. iii. a. p.c. iv. in which the stockholm is compared with that of

  Average training loss: 2.42
  Training epoch took: 0:01:36

Running Validation...
  Validation Loss: 2.72
  Validation took: 0:00:03

Training...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   100  of    199. Loss: 1.886625051498413.   Elapsed: 0:00:47.
0:  trail history classics religion reference literature fiction childrens adult non fiction  egypt iliad _reform_ _reform_ the egypt that we will call it by rome is the most successful in its history. since its founding it spread from the very cradle of its pharaon to become the centre of the world and to the most populous city in the western sky. the egypt was the first civilization to be modern in the land. the eastern empire of rome was founded on the site of the second godwin temple to the west where it now lies. the west empire included in its empire the pharaon and jaros which under the rule of the godwin were erected in the eleventh century of the eleventh century and the modern name beltenebros is the result of that long and successful attempt. to modernize and modernize the empire the east and west are mutually exclusive. they are both great powers and they are at the same time the sister cities

  Average

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   100  of    199. Loss: 2.0912551879882812.   Elapsed: 0:00:47.
0: intend literary fiction drama classics novella high school novels literature fiction philosophy th century school adult read for school unfinished classics war poetry  the odyssey rendered into english prose for the use of those who cannot read original by samuel butler preface to first edition this translation is intended to supplement an earlier work entitled the authoress of the odyssey which i published in. i have not intended to supplement it with all the books i have translated and have restored them from their original location in their original location in u.s. for which i could not give the entire odyssey for this translation. the odyssey was written for a school boy of twelve or thirteen years who lived at amsterdam and as i soon grew old enough to read and enjoy the original i began to wonder how his ideas had changed as he went on his way about life. i begin with the idea that everybody was going sen

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   100  of    199. Loss: 0.9620369076728821.   Elapsed: 0:00:47.
0:  surround science fiction philosophy fiction travel novels literature fiction adult non fiction historical college  the moon of darwin a travel companion by george warren the director of companies _by_ george warren_ _faye warren._ the moon of darwin is seen in the distance and its companion the marshes. the dwarf s glories of ivory rods and wove silk are marvelously reflected in the transient beauty of the sun and its broad disk just skirting the equator. the region on the left side of marshes slopes to the west and east with a low temperature barely venturing beneath the transient beauty of the north and east. on the right side of marshes is the undiscovered continent of the deep west through which the drifting cloud swimmered and clumped together through the drifting gloom. in its broad disk the sun never sets nor ages nor does it change its course nor ages nor does it change its course nor does it change its

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   100  of    199. Loss: 0.6942958235740662.   Elapsed: 0:00:47.
0:  reflex the roman eagle and the halter of the spear. and with these words he laid the ground without turning to it the place now called the sultan. and there he laid before the palace that ran round it the pavilion and the palace itself. thereupon the young nobles of alban fathers found their feet in the pavilion and were carried there by the large furrows of the surrounding forest. thereupon they sat before the watcher in silence and did not answer but step out of the palace door and step into it. ah! the queen of the hill! said one of the daughters of the late queen of saros a man who had fallen in love with the old queen of alban. ah! the old queen! she was as dark as a raven s wing and her fiery black eyes were like an illusory dream. the young nobles knitted their brows and their hearts cried in terror at the sight of their master and at the sight of his daughters. he

  Average training loss: 1.34
  Traini

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   100  of    199. Loss: 1.3799030780792236.   Elapsed: 0:00:47.
0:  display middle grade classics animals fantasy novels literature fiction th century childrens young adult school realistic fiction coming of age  the illustrated children s classics _treasure island_ robert louis stevenson _illustrated by_ milo winter illustration gramercy books new york and london by milo winter illustration gramercy books new york and london illustrated children s classics _treasure island_ robert louis ste treasure island_ robert louis stevenson preface _to_ _the_ reader if you like your children s classics deep inside of themselves then you ought to give your own opinion of them. while all this book content is intended to supplement a small volume i which i published in full on the possessions of my late father louis at the age of fifteen. what i liked most was the little illustrations that were used to illustrate the treasure island robert louis stevenson s treasure island and the little is

Let's view the summary of the training process.

In [19]:
# Looking at the stats
# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.#
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,9.149535,5.84756,0:01:35,0:00:03
2,4.480528,3.759224,0:01:36,0:00:04
3,3.494364,3.371041,0:01:36,0:00:03
4,2.931924,2.987054,0:01:36,0:00:03
5,2.423084,2.720094,0:01:36,0:00:03
6,2.024492,2.514761,0:01:36,0:00:03
7,1.728603,2.452179,0:01:36,0:00:03
8,1.495106,2.440395,0:01:36,0:00:03
9,1.335499,2.417466,0:01:36,0:00:03
10,1.245222,2.425793,0:01:36,0:00:03


# Display Model Info

In [20]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The GPT-2 model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:2]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[2:14]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-2:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The GPT-2 model has 148 different named parameters.

==== Embedding Layer ====

transformer.wte.weight                                  (50259, 768)
transformer.wpe.weight                                   (1024, 768)

==== First Transformer ====

transformer.h.0.ln_1.weight                                   (768,)
transformer.h.0.ln_1.bias                                     (768,)
transformer.h.0.attn.c_attn.weight                       (768, 2304)
transformer.h.0.attn.c_attn.bias                             (2304,)
transformer.h.0.attn.c_proj.weight                        (768, 768)
transformer.h.0.attn.c_proj.bias                              (768,)
transformer.h.0.ln_2.weight                                   (768,)
transformer.h.0.ln_2.bias                                     (768,)
transformer.h.0.mlp.c_fc.weight                          (768, 3072)
transformer.h.0.mlp.c_fc.bias                                (3072,)
transformer.h.0.mlp.c_proj.weight                        (3072

# Saving & Loading Fine-Tuned Model


In [21]:
# Loading the model for future use

output_dir = os.getcwd()+ '/models/full_data_2'
print("Saving model to %s" % output_dir)
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to /home/studio-lab-user/sagemaker-studiolab-notebooks/models/full_data_2


('/home/studio-lab-user/sagemaker-studiolab-notebooks/models/full_data_2/tokenizer_config.json',
 '/home/studio-lab-user/sagemaker-studiolab-notebooks/models/full_data_2/special_tokens_map.json',
 '/home/studio-lab-user/sagemaker-studiolab-notebooks/models/full_data_2/vocab.json',
 '/home/studio-lab-user/sagemaker-studiolab-notebooks/models/full_data_2/merges.txt',
 '/home/studio-lab-user/sagemaker-studiolab-notebooks/models/full_data_2/added_tokens.json')

# Generate Text

In [25]:
# Trying with a simple prompt
model.eval()

prompt = "classic: In philosophy"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

print(generated)

sample_outputs = model.generate(
                                generated,
                                do_sample=True,
                                top_k=50,
                                max_length=50,
                                top_p=0.95,
                                num_return_sequences=3
                                )

for i, sample_output in enumerate(sample_outputs):
  print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[49421,    25,   554,  8876]], device='cuda:0')
0: classic: In philosophy fiction literature fiction theology spirituality poetry short stories theology short stories theology historical fiction non fiction theology american  oe de cervantes. introduction. _the poem of the martian fathers_. in the second century of american history


1: classic: In philosophy fiction literature politics all the traditions and the gods relate and in this way we become gods. e. t. c._ contents chapter the three cities of siddhartha by the river samladeva chapter the four gree


2: classic: In philosophy a person is a rational person. only logical sense which man forms is based on his own experience. this is the natural result of selection theory which holds that species of animals is indestructible and at last from that


