# GPT-2 Fine-Tune
This notebook walks through fine-tuning the GPT-2 language model.

Please run it on Google Colab or Kaggle with a GPU enabled.

Important note: the notebook changes the working directory with a relative `os.chdir('..')`, so its cells should be run only once per kernel session. If you want to run it a second time, restart the kernel first.

In [None]:
import torch
import numpy as np
import os
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [1]:
# Step one directory up. The relative data/ and models/ paths used further down
# are resolved against the resulting working directory.
# Not idempotent: every execution of this cell moves up one more level.
os.chdir(os.pardir)
print(os.getcwd())

/home/leon/Projects/Programming/Study/Python/ML_Inno/PMLDL/PML_ASS_1


In [None]:
def manual_seed(seed):
    """
    Function to set the seed value for reproducibility.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (plus the CUDA generators when CUDA is available).
    :param seed: integer seed value
    :return: None
    """
    # Imported locally because `random` is not part of the notebook's import cell
    import random

    # Python built-in manual seed (previously left unseeded)
    random.seed(seed)

    # NumPy manual seed
    np.random.seed(seed)

    # PyTorch manual seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Seed the current CUDA device and then every visible CUDA device
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

# Fixed seed shared by every random number generator handled in manual_seed
seed = 42
manual_seed(seed)

# Train

In [3]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """
    Build the training dataset for the gpt-2 model from a text file.
    :param file_path: path to the file with the dataset
    :param tokenizer: tokenizer handed to TextDataset
    :param block_size: block size handed to TextDataset (128 by default)
    :return: TextDataset object
    """
    return TextDataset(
        file_path=file_path,
        tokenizer=tokenizer,
        block_size=block_size,
    )


def load_data_collator(tokenizer, mlm = False):
    """
    Create the language-modeling data collator for the given tokenizer.
    :param tokenizer: tokenizer handed to DataCollatorForLanguageModeling
    :param mlm: boolean flag forwarded as the collator's `mlm` argument
                (masked language modeling on/off, off by default)
    :return: DataCollatorForLanguageModeling object
    """
    return DataCollatorForLanguageModeling(
        mlm=mlm,
        tokenizer=tokenizer,
    )


def train(train_file_path, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size, num_train_epochs, save_steps, block_size=128, gradient_accumulation_steps=1, fp16=False):
    """
    Function to fine-tune the gpt-2 model
    :param train_file_path: path to the file with the dataset
    :param model_name: name of the model
    :param output_dir: path to the output directory
    :param overwrite_output_dir: boolean value to indicate whether to overwrite the output directory or not
    :param per_device_train_batch_size: integer batch size
    :param num_train_epochs: float number of epochs
    :param save_steps: integer number of steps to save the model
    :param block_size: block size forwarded to load_dataset (default 128, the value used before
                       this parameter existed)
    :param gradient_accumulation_steps: forwarded to TrainingArguments (default 1); together with a
                       smaller per_device_train_batch_size it can reduce peak GPU memory
    :param fp16: forwarded to TrainingArguments (default False); enables mixed-precision training
    :return: None
    """

    # Tokenizer of the base model, extended with four extra tokens
    # (presumably markers that occur in the training corpus -- confirm against the data)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.add_tokens(['<T>', '<NT>', '<E>', '<F>'])

    train_dataset = load_dataset(train_file_path, tokenizer, block_size=block_size)
    data_collator = load_data_collator(tokenizer)

    # Store the extended tokenizer in the output directory
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # The vocabulary grew by the added tokens, so the embeddings are resized to match
    model.resize_token_embeddings(len(tokenizer))

    # NOTE(review): this writes the resized but not yet fine-tuned weights to output_dir;
    # trainer.save_model() at the end is expected to write to the same directory.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_strategy='steps',
        save_steps=save_steps,
        gradient_accumulation_steps=gradient_accumulation_steps,
        fp16=fp16,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [4]:
# Training configuration consumed by the train(...) call below

# Locations, relative to the current working directory
train_file_path = "data/interm/gpt2_corpus.txt"
output_dir = 'models/Gpt2'
overwrite_output_dir = False

# Name of the pretrained checkpoint to start from
model_name = 'gpt2'

# Schedule: epochs, per-device batch size and checkpoint interval (in steps)
num_train_epochs = 10.0
per_device_train_batch_size = 8
save_steps = 100000

In [5]:
# Collect the settings defined above and start fine-tuning with them
train_kwargs = {
    'train_file_path': train_file_path,
    'model_name': model_name,
    'output_dir': output_dir,
    'overwrite_output_dir': overwrite_output_dir,
    'per_device_train_batch_size': per_device_train_batch_size,
    'num_train_epochs': num_train_epochs,
    'save_steps': save_steps,
}
train(**train_kwargs)



Step,Training Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 98.00 MiB (GPU 0; 3.81 GiB total capacity; 2.64 GiB already allocated; 110.50 MiB free; 2.67 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF