In [None]:
import json
import re

import pandas as pd
import numpy as np
import random

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
torch.manual_seed(42)

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
class CustomDataset(Dataset):
    """Pre-tokenized text samples for language-model fine-tuning.

    Every raw string is wrapped as ``'<|sos|>' + text + '<|eos|>'`` and passed
    through ``tokenizer`` with truncation enabled and ``padding="max_length"``.
    The returned ``input_ids`` and ``attention_mask`` are stored as tensors.

    Args:
        dataset: Iterable of strings to encode.
        tokenizer: Callable returning a mapping that has ``'input_ids'`` and
            ``'attention_mask'`` entries.
        gpt2_type: Accepted for interface compatibility; not used here.
        max_length: Forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, dataset, tokenizer, gpt2_type="gpt2", max_length=1024):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        for text in dataset:
            wrapped = '<|sos|>' + text + '<|eos|>'
            encoded = tokenizer(
                wrapped,
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            self.input_ids.append(torch.tensor(encoded['input_ids']))
            self.attn_masks.append(torch.tensor(encoded['attention_mask']))

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
        ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        return ids, mask

* Load Data

In [None]:
# Task domain: picks the dataset folder to read and names the checkpoint folders.
domain = "Kitchen"

In [None]:
# Load the raw JSON dataset for the selected domain.
file_path = './datasets/{}/Dataset.json'.format(domain)
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Build one training string per sample, laid out as:
#   Objects= <objects_on_table>
#   Command= <high_instructions>
#   Action Step=
#   <step 1>
#   <step 2>
#   ...
_dataset = list()
for idx in range(len(data['high_instructions'])):
    steps = ''
    for step_i in data['steps'][idx]:
        # Collapse every whitespace run (including newlines) to a single space
        # so that each step occupies exactly one line.
        step_i = re.sub(r'\s+', ' ', step_i)
        steps = steps + step_i + '\n'
    train_data = (
        'Objects= ' + str(data['objects_on_table'][idx]) + '\n'
        + "Command= " + data['high_instructions'][idx] + '\n'
        + "Action Step= " + '\n' + steps
    )
    # str.strip() returns a new string; the result must be assigned back,
    # otherwise the trailing newline stays in the sample.
    train_data = train_data.strip()
    _dataset.append(train_data)

In [None]:
# Sanity check: number of samples, then one formatted sample (10th from the end).
sample_idx = -10

print(len(_dataset))
print(_dataset[sample_idx])

----
* Load tokenizer, build DataLoaders

In [None]:
# Tokenizer with the special tokens used to wrap (<|sos|>, <|eos|>) and pad each sample.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2-medium',
    bos_token='<|sos|>',
    eos_token='<|eos|>',
    pad_token='<|pad|>',
)

max_length = 1024  # token length each sample is truncated/padded to
batch_size = 4     # samples per batch for both dataloaders

In [None]:
# Tokenize every sample up front.
custom_dataset = CustomDataset(_dataset, tokenizer, max_length=max_length)

# 98% of the samples go to training, the remainder to validation.
train_size = int(0.98 * len(custom_dataset))
val_size = len(custom_dataset) - train_size
train_dataset, val_dataset = random_split(custom_dataset, [train_size, val_size])

print('{} training samples'.format(train_size))
print('{} validation samples'.format(val_size))

# Training batches are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size,
)

# Validation batches are read in a fixed, sequential order.
validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size,
)

---
* (Train) Fine-tuning GPT-2

In [None]:
epochs = 24          # number of full passes over the training dataloader
sample_every = 6000  # a checkpoint is written every this many training steps

In [None]:
# Base GPT-2 checkpoint and its configuration.
configuration = GPT2Config.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# The tokenizer was given extra special tokens, so the embedding matrix must
# be resized to the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Move to the detected device. An unconditional model.cuda() raises on
# CPU-only hosts, which defeats the fallback computed just above.
model.to(device)
print(device)

# Seed every RNG in use so runs are repeatable.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Optimizer hyperparameters.
learning_rate = 5e-4
warmup_steps = 100  # integer step count, as the LR scheduler expects
epsilon = 1e-8

optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon)

In [None]:
print("Start fine-tuning gpt2")

# Learning-rate schedule spanning every optimizer step of the whole run.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = warmup_steps,
                                            num_training_steps = total_steps)

model = model.to(device)
training_stats = list()  # one dict of average losses per epoch
steps_count = 0          # optimizer steps taken so far, across all epochs

for epoch_i in range(0, epochs):
    print('\n======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    total_train_loss = 0
    model.train()

    # The loop variable is named `batch` so it does not overwrite the raw
    # JSON dict that an earlier cell stored in `data`.
    for batch in train_dataloader:

        input_ids = batch[0].to(device)
        attn_masks = batch[1].to(device)
        # Exclude padded positions from the loss: labels equal to -100 are
        # ignored by the model's loss. Without this, padding to max_length
        # makes the loss mostly measure how well <|pad|> is predicted.
        labels = input_ids.masked_fill(attn_masks == 0, -100)
        steps_count += 1

        model.zero_grad()
        outputs = model(input_ids,
                        labels=labels,
                        attention_mask = attn_masks,
                        token_type_ids=None)

        # With labels supplied, the first output is the language-modeling loss.
        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Save a checkpoint every `sample_every` training steps.
        if (steps_count % sample_every == 0):
            model.eval()
            print('Steps {} , Loss: {}.'.format(steps_count, batch_loss))
            # NOTE(review): "fintuned" looks like a typo, but the path is kept
            # as-is because other code may load checkpoints from it.
            output_dir = './fintuned_model/{}/_gpt2_small/gpt2_small({})_objs'.format(domain, steps_count)
            model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            model.train()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print("\nAverage training loss: {0:.2f}".format(avg_train_loss))
    print("\nRunning Validation...") ######
    model.eval()

    total_eval_loss = 0

    # Evaluate on the validation split once per epoch.
    for batch in validation_dataloader:

        input_ids = batch[0].to(device)
        attn_masks = batch[1].to(device)
        # Same padding mask as in training, so both losses are comparable.
        labels = input_ids.masked_fill(attn_masks == 0, -100)

        with torch.no_grad():
            outputs = model(input_ids,
                            attention_mask = attn_masks,
                            labels=labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss
        }
    )

print("")
print("Training complete!")
# Assign the final output path BEFORE printing it. Printing first raised a
# NameError when no checkpoint had been written, and otherwise reported the
# last checkpoint's path instead of the final one.
# NOTE(review): the folder name says "gpt2_medium" although the "gpt2"
# checkpoint is what gets loaded; the path is kept unchanged -- confirm.
output_dir = './finetuned_small_lms/{}/gpt2_medium({})'.format(domain, steps_count)
print("Saving model to %s" % output_dir)
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)