In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import random
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_linear_schedule_with_warmup
from tqdm import tqdm
import os
import random

# Seed every random number generator this notebook touches so that a fresh
# "Restart Kernel -> Run All" is repeatable.
SEED = 69
random.seed(SEED)
# Python's `random` alone is not enough: torch keeps its own generator, which
# was previously left unseeded.
torch.manual_seed(SEED)
# Fall back to the CPU when no CUDA device is visible.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Fine-Tuning Parameters

In [2]:
# NOTE(review): the note that used to sit here mapped checkpoints to limits
# (gpt2=768, gpt2-medium=1024, gpt2-large=1420, gpt2-xl=1600). Those figures look
# like per-checkpoint hidden sizes rather than context lengths - confirm against
# the model config (n_positions) before raising max_seq_length.

# Fine-tuning configuration
gpt2_type = 'gpt2'            # checkpoint name handed to from_pretrained()
max_seq_length = 768          # default `max_seq_len` of train(): token budget of one packed input
control_code = 'startoftext'  # wrapped as <|startoftext|> in front of every training sample
large_data = True             # switch between the regular and the "larger_" corpus

# Prefix shared by the data path, the model directory and the checkpoint names.
if large_data:
    large = 'larger_'
else:
    large = ''

epochs = 20

# Dataset

Sentences are appended to the current text input until the configured `max_length` is reached. This way each input contains more than one sentence, and every sentence in it is complete.

In [4]:
class RealAsimov(Dataset):
    """Sentence corpus grouped into character-bounded chunks and tokenized for GPT-2.

    The corpus file holds one sentence per line. Consecutive lines are joined, in
    file order, for as long as the joined text stays shorter than ``max_length``
    characters; each resulting chunk is wrapped as
    ``<|control_code|>chunk<|endoftext|>`` and encoded into a 1-D tensor of token ids.

    Args:
        control_code: text placed between ``<|`` and ``|>`` in front of every chunk.
        gpt2_type: checkpoint name passed to ``GPT2Tokenizer.from_pretrained``.
        max_length: upper bound, in characters (not tokens), on the text of one chunk.
        larger: prefix applied to both the data directory and the file name.

    Attributes:
        text: list of 1-D token-id tensors, one per chunk.
        input_len: character length of every chunk, aligned with ``text``.
        text_count: number of chunks.
    """

    def __init__(self, control_code = control_code, gpt2_type=gpt2_type, max_length=1024, larger=''):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.text = []
        self.input_len = []
        # Scratch attributes of the earlier implementation; kept (always empty)
        # so code that reads them keeps working.
        self.tot_input = ''
        self.discarded = ''

        self.file = f'{larger}data/{larger}asimov_sentence_dataset.txt'

        with open(self.file, 'r', encoding="utf-8") as data:
            data_list = data.readlines()

        for chunk in self._pack_lines(data_list, max_length):
            self.text.append(torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{chunk}<|endoftext|>")
            ))
            self.input_len.append(len(chunk))
        self.text_count = len(self.text)

    @staticmethod
    def _pack_lines(lines, max_length):
        """Greedily join ``lines`` into chunks shorter than ``max_length`` characters.

        Lines keep their original order and are never split across chunks. A single
        line that is itself ``max_length`` characters or longer becomes its own
        chunk, cut to ``max_length`` characters, instead of blocking every line
        that follows it. Empty chunks are never produced, and the trailing chunk
        is always flushed so the end of the corpus is not lost.
        """
        chunks = []
        current = ''
        for line in lines:
            if len(current) + len(line) < max_length:  # the line still fits
                current += line
                continue
            if current:
                chunks.append(current[:max_length])
            # The line that did not fit opens the next chunk, ahead of later lines.
            current = line
        if current:
            chunks.append(current[:max_length])
        return chunks

    def __len__(self):
        return self.text_count

    def __getitem__(self, item):
        return self.text[item]
    

# Build the corpus once; `large` picks the data directory / file name prefix.
dataset = RealAsimov(
    larger=large,
    gpt2_type=gpt2_type,
    control_code=control_code,
)

# Tokenizer

In [8]:
# Load the pretrained language model and the tokenizer of the same `gpt2_type` checkpoint
model = GPT2LMHeadModel.from_pretrained(gpt2_type)
tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)

#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len): #-> (input_tensor, carry_on, remainder)
    """Append ``new_tensor`` to ``packed_tensor`` along dim 1 if the result fits.

    Entries are concatenated in arrival order (older tokens first) and no token
    is dropped, so the packed length is exactly the sum that is compared with
    ``max_seq_len``.

    Args:
        new_tensor: 2-D tensor whose dim 1 is the token axis.
        packed_tensor: tensor accumulated so far, or ``None`` on the first call.
        max_seq_len: maximum allowed size of dim 1 for a packed tensor.

    Returns:
        ``(input_tensor, carry_on, remainder)``:
        * first call: ``(new_tensor, True, None)``;
        * ``new_tensor`` does not fit: ``(packed_tensor, False, new_tensor)`` - the
          caller should consume ``packed_tensor`` and restart from ``remainder``;
        * it fits: ``(concatenation, True, None)``.
    """
    if packed_tensor is None: #first iteration, nothing to append to yet
        return new_tensor, True, None
    if new_tensor.size(1) + packed_tensor.size(1) > max_seq_len: #would overflow the budget
        return packed_tensor, False, new_tensor
    # Older content first so the reading order of the corpus is preserved.
    return torch.cat([packed_tensor, new_tensor], dim=1), True, None

# Fine Tuning

In [9]:
def _pack_dataset(dataloader, max_seq_len):
    """Run ``pack_tensor`` over ``dataloader`` once and return every packed input in order."""
    packed_inputs = []
    current = None
    for entry in dataloader:
        current, carry_on, remainder = pack_tensor(entry, current, max_seq_len)
        if not carry_on:
            # `current` is full: emit it and restart from the entry that did not fit.
            packed_inputs.append(current)
            current = remainder
    if current is not None:  # flush the last, partially filled pack
        packed_inputs.append(current)
    return packed_inputs


def train(
    dataset,
    model,
    batch_size=16,
    epochs=3,
    lr=2e-5,
    max_seq_len=max_seq_length,
    warmup_steps=5000,
    device="cuda",
    output_dir="model",
    output_prefix=control_code,
    save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``dataset`` with packed inputs and gradient accumulation.

    Dataset entries are read in order (no shuffling), packed with ``pack_tensor``
    into inputs of at most ``max_seq_len`` tokens, and fed to the model one packed
    input at a time. Gradients are accumulated over ``batch_size`` packed inputs
    before each optimizer/scheduler step; whatever is still pending at the end of
    an epoch is applied as well. The loss is not divided by ``batch_size``, so
    gradients of the accumulated inputs are summed.

    Args:
        dataset: dataset yielding 1-D token-id tensors.
        model: model called as ``model(input_ids, labels=input_ids)`` whose first
            output is the loss.
        batch_size: number of packed inputs accumulated per optimizer step.
        epochs: number of passes over the packed inputs.
        lr: peak learning rate for AdamW.
        max_seq_len: token budget of one packed input.
        warmup_steps: optimizer steps of linear warmup.
        device: device name (or ``torch.device``) the model and inputs are moved to.
        output_dir: directory for checkpoints; created if missing.
        output_prefix: text embedded in checkpoint file names.
        save_model_on_epoch: also write a checkpoint after every epoch (named with
            the 0-based epoch index), on top of the one written every second epoch.

    Returns:
        The fine-tuned model (on ``device``, in training mode).
    """
    # Honour the caller's device instead of forcing CUDA.
    device = torch.device(device)
    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=False) #in order to keep sentences order
    # Packing is deterministic without shuffling, so do it once up front. This also
    # gives the exact number of optimizer steps the scheduler needs.
    packed_inputs = _pack_dataset(train_dataloader, max_seq_len)
    steps_per_epoch = (len(packed_inputs) + batch_size - 1) // batch_size

    optimizer = AdamW(model.parameters(), lr=lr)
    # A negative num_training_steps would clamp the learning rate to 0 as soon as
    # warmup ends, so pass the real total.
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=epochs * steps_per_epoch
    )

    def apply_accumulated_gradients():
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    for epoch in range(epochs):
        optimizer.zero_grad()
        pending = 0  # packed inputs whose gradients have not been applied yet
        for input_tensor in tqdm(packed_inputs, total=len(packed_inputs)):
            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            pending += 1
            if pending == batch_size:
                apply_accumulated_gradients()
                pending = 0

        if pending:
            # Do not drop the gradients of the last, incomplete accumulation group.
            apply_accumulated_gradients()

        if save_model_on_epoch:
            os.makedirs(output_dir, exist_ok=True)
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{large}final_{output_prefix}-{epoch}.pt"),
            )

        if (epoch+1)%2 == 0:
            os.makedirs(output_dir, exist_ok=True)
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{large}final_{output_prefix}-{epoch+1}.pt"),
            )

    return model


In [None]:
# Launch fine-tuning; checkpoints are written under the f"{large}model" directory
model = train(
    dataset,
    model,
    epochs=epochs,
    device=device,
    output_dir=f"{large}model",
)