# Training GPT-2 Model with InfiniAttention Module

In [None]:
# !pip install datasets
# !pip install accelerate -U
# !pip install transformers -U
# !pip install zarr

In [None]:
# Colab-only: mount Google Drive at /content/drive. Every dataset, tokenizer and
# checkpoint path used in this notebook lives under that mount point.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Zarr copy of the corpus. NOTE(review): only the commented-out zarr loading code
# refers to this path, so it appears unused in the current flow.
zarr_file_path = '/content/drive/MyDrive/final_project/dataset_copy.zarr'
# Folder holding the tokenizer's vocab.json / merges.txt, read when the tokenizer is built.
tokenizer_path = '/content/drive/MyDrive/final_project/tokenizer/'
# config_path = '/content/drive/MyDrive/Colab Notebooks/nlp_unicamp/final_project/configs/config.json'

In [None]:
# import zarr

# zarr_store = zarr.load(zarr_file_path)

In [None]:
import torch
import torch.nn as nn
from transformers import GPT2Config, GPT2LMHeadModel
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention
from typing import Optional, Tuple, Union

### Standard GPT2LMHeadModel structure

In [None]:
# GPT2Config built with the library's default hyper-parameters; this single object is
# passed to every GPT2LMHeadModel constructed in the notebook.
config = GPT2Config()
# model = GPT2LMHeadModel(config)

In [None]:
import torch.nn.functional as F
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, Conv1D

### Training Model

In [None]:
# Label for the variant being trained.
# NOTE(review): no visible cell reads this variable — the variant is actually chosen by
# (un)commenting code in the model-construction cell. Confirm whether it is still needed.
model_type = "gpt2" #or "gpt2-infini"

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Trainer

In [None]:
# gpt-2 original: built from `config` only (no pretrained checkpoint is loaded) and
# moved to the selected device.
model = GPT2LMHeadModel(config).to(device)

# gpt-2 infini (disabled): would swap the attention module of every transformer block
# for InfiniAttentionGPT2 before moving the model to the device.
# NOTE(review): InfiniAttentionGPT2 is not defined in any visible cell — it must be
# defined/imported before this variant can be enabled.
# model = GPT2LMHeadModel(config)

# for i, layer in enumerate(model.transformer.h):
#     model.transformer.h[i].attn = InfiniAttentionGPT2(
#         config, layer_idx=i
#     )

# model = model.to(device)

In [None]:
def count_parameters(model):
    """Return how many scalar values ``model`` can train.

    Walks ``model.parameters()`` and adds up ``numel()`` for every parameter whose
    ``requires_grad`` flag is set; frozen parameters are skipped.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Report the trainable-parameter count, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 124,439,808 trainable parameters


In [None]:
from datasets import Dataset
import torch

path_dataset = '/content/drive/MyDrive/final_project/tokenizer/datasets/split_50/'

# Load the splits previously written to disk; each row carries `input_ids` and
# `attention_mask`.
train_dataset = Dataset.load_from_disk(path_dataset + "train_dataset")
test_dataset = Dataset.load_from_disk(path_dataset + "test_dataset")

# Training uses every row of the stored split: no further sub-sampling happens here.
# NOTE(review): the `split_50` folder presumably already holds the 50% subset — confirm.

# Evaluation runs on a small fixed prefix of the test split to keep each eval pass
# cheap. Clamp to the split size so a shorter test set does not raise on `select`.
eval_subset_size = 4 * 16  # 6 * 16
test_dataset = test_dataset.select(range(min(eval_subset_size, len(test_dataset))))

In [None]:
# Show the training split summary (feature names and row count).
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 269652
})

In [None]:
# Show the evaluation split summary (feature names and row count).
test_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 64
})

In [None]:
# train with trainer
from transformers import GPT2Tokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from tokenizers import ByteLevelBPETokenizer
import numpy as np
from datasets import Dataset

# Google Drive locations for everything this run writes.
model_save_dir = '/content/drive/MyDrive/final_project/models_self_attn/'
output_dir = '/content/drive/MyDrive/final_project/models_self_attn/output_dir'
logging_dir = '/content/drive/MyDrive/final_project/models_self_attn/logs'

# Earlier manual step computation, kept for reference:
# batch_size = 16
# num_epochs = 1
# num_steps = len(train_dataset) * num_epochs // batch_size

training_args = TrainingArguments(
    # Where checkpoints and logs go.
    output_dir=output_dir,
    logging_dir=logging_dir,
    # Optimisation set-up: a single epoch with small per-device batches.
    seed=42,
    learning_rate=2e-5,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_accumulation_steps=4,
    # Logging, evaluation and checkpointing all fire on the same step interval.
    logging_strategy="steps",
    logging_steps=5_000,  # 5000~10000
    eval_strategy="steps",
    eval_steps=5_000,
    save_strategy="steps",
    save_steps=5_000,
    save_total_limit=1,
    # Mixed precision stays off: training with FP16 produced zero/NaN loss values.
    # fp16=True,
    # fp16_full_eval=True,
)

# Vocabulary and merge-rule files of the project tokenizer.
vocab_file = f"{tokenizer_path}vocab.json"
merges_file = f"{tokenizer_path}merges.txt"

tokenizer = GPT2Tokenizer(vocab_file, merges_file)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): the LM collator may then exclude EOS positions from the loss together
# with padding — confirm this is acceptable for the trained model.
tokenizer.pad_token = tokenizer.eos_token
# Cap tokenized sequences at the number of positions the model supports.
tokenizer.model_max_length = model.config.n_positions

# Special-token ids (only referenced by the commented-out preprocessing code).
bos_id, eos_id, pad_id = (
    tokenizer.bos_token_id,
    tokenizer.eos_token_id,
    tokenizer.pad_token_id,
)

# Match the embedding table to the size of this tokenizer's vocabulary.
model.resize_token_embeddings(len(tokenizer))

# mlm=False selects causal-LM batches rather than masked-LM ones.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)


# special tokens

# add a first column filled with the bos value
# bos_array = np.zeros((zarr_store.shape[0], 1), dtype=np.int32)
# bos_array[:, 0] = bos_id

# add a last column filled with the eos value
# eos_array = np.zeros((zarr_store.shape[0], 1), dtype=np.int32)
# eos_array[:, 0] = eos_id

# zarr_store = np.concatenate((bos_array, zarr_store), axis=1)
# zarr_store = np.concatenate((zarr_store, eos_array), axis=1)

# zarr_store[:, 0] = bos_id
# zarr_store[:, -1] = eos_id

# attention mask same dimension zarr_store
# attention_mask = np.ones(zarr_store.shape)

# train 95%
# train_size = int(zarr_store.shape[0] * 0.95)

# train_input_ids = zarr_store[:train_size]
# train_attention_mask = attention_mask[:train_size]

# test 5%
# test_input_ids = zarr_store[train_size:]
# test_attention_mask = attention_mask[train_size:]

# inputs_train = {"input_ids": torch.from_numpy(train_input_ids), "attention_mask": torch.from_numpy(train_attention_mask)}
# inputs_test = {"input_ids": torch.from_numpy(test_input_ids), "attention_mask": torch.from_numpy(test_attention_mask)}

# import torch
# inputs_train = torch.load(tokenizer_path + "inputs_train.pt")
# # replace bos
# inputs_train['input_ids'][:, 0] = bos_id
# # replace eos
# inputs_train['input_ids'][:, -1] = eos_id

# inputs_test = torch.load(tokenizer_path + "inputs_test.pt")
# # replace bos
# inputs_test['input_ids'][:, 0] = bos_id
# # replace eos
# inputs_test['input_ids'][:, -1] = eos_id


# # save inputs_train, inputs_test
# torch.save(inputs_train, tokenizer_path + "inputs_train.pt")
# torch.save(inputs_test, tokenizer_path + "inputs_test.pt")

In [None]:
# ### SCRIPT TO SAVE DATASET WITHOUT USING RAM
# import os
# from datasets import Dataset
# import torch

# tokenizer_path = '/content/drive/MyDrive/Colab Notebooks/nlp_unicamp/final_project/tokenizer/'

# # Create dataset directories
# train_ds_path = os.path.join(tokenizer_path, "datasets/train")
# test_ds_path = os.path.join(tokenizer_path, "datasets/test")

# os.makedirs(train_ds_path, exist_ok=True)
# os.makedirs(test_ds_path, exist_ok=True)

# # Load tensors
# train_ids = torch.load(tokenizer_path + "inputs_train.pt")
# test_ids = torch.load(tokenizer_path + "inputs_test.pt")

# # Function to save datasets in batches
# def save_dataset_in_batches(ids, path, batch_size=100000):
#     total_batches = (len(ids['input_ids']) + batch_size - 1) // batch_size  # Compute number of batches
#     for i in range(total_batches):
#         start_idx = i * batch_size
#         end_idx = min((i + 1) * batch_size, len(ids['input_ids']))
#         batch = {key: value[start_idx:end_idx] for key, value in ids.items()}
#         dataset = Dataset.from_dict(batch)
#         dataset.save_to_disk(os.path.join(path, f"batch_{i:03d}"))

# # Save datasets
# save_dataset_in_batches(train_ids, train_ds_path)
# save_dataset_in_batches(test_ids, test_ds_path)

In [None]:
# import os
# from datasets import Dataset, load_from_disk
# from tqdm import tqdm
# import datasets

# # Function to iteratively concatenate batch datasets in a directory into a single dataset
# def concatenate_batches_iteratively(directory):
#     batch_files = sorted([os.path.join(directory, f) for f in os.listdir(directory) if f.startswith("batch_")])
#     cumulative_dataset = None

#     # Use tqdm to display a progress bar
#     for batch_file in tqdm(batch_files, desc="Loading and concatenating batches"):
#         current_batch = load_from_disk(batch_file)
#         # current_batch.set_format(type="torch", columns=["input_ids", "attention_mask"])
#         if cumulative_dataset is None:
#             cumulative_dataset = current_batch
#         else:
#             # Concatenate the current batch with the cumulative dataset
#             cumulative_dataset = datasets.concatenate_datasets([cumulative_dataset, current_batch])

#     return cumulative_dataset

# # Path to the directories containing the batch files
# # tokenizer_path = "path/to/your/tokenizer/"  # Set this to the correct path
# train_ds_path = os.path.join(tokenizer_path, "datasets/train")
# test_ds_path = os.path.join(tokenizer_path, "datasets/test")

# # Concatenate batches into a single dataset for both train and test, using the iterative function
# train_dataset = concatenate_batches_iteratively(train_ds_path)
# test_dataset = concatenate_batches_iteratively(test_ds_path)

# # Example of usage
# print("Combined train dataset:", train_dataset)
# print("Combined test dataset:", test_dataset)

In [None]:
# save datasets
# train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
# train_dataset.save_to_disk(tokenizer_path + "train_dataset")

# test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
# test_dataset.save_to_disk(tokenizer_path + "test_dataset")

In [None]:
# # from datasets import Dataset
# # import torch
# # # load dataset

# path_dataset = '/content/drive/MyDrive/Colab Notebooks/nlp_unicamp/final_project/tokenizer/datasets/split_10/'

# train_dataset = Dataset.load_from_disk(path_dataset + "train_dataset")
# test_dataset = Dataset.load_from_disk(path_dataset + "test_dataset")

# test_dataset = test_dataset.select(range(int(len(test_dataset) * 0.25)))


# # train_dataset = Dataset.from_dict(torch.load(tokenizer_path + "inputs_train.pt"))
# # train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
# # train_dataset.save_to_disk(tokenizer_path + "train_dataset")

# # train_dataset = Dataset.from_dict(torch.load(tokenizer_path + "inputs_test.pt"))
# # train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
# # train_dataset.save_to_disk(tokenizer_path + "test_dataset")

# # # replace column bos
# # train_dataset['input_ids'][:, 0] = bos_id
# # # replace column eos
# # train_dataset['input_ids'][:, -1] = eos_id

# # test_dataset = Dataset.from_dict(torch.load(tokenizer_path + "inputs_test.pt"))
# # test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# # # replace column bos
# # test_dataset['input_ids'][:, 0] = bos_id
# # # replace column eos
# # test_dataset['input_ids'][:, -1] = eos_id

# # # Save datasets
# # # train_dataset.save_to_disk(tokenizer_path + "train_dataset")
# # # test_dataset.save_to_disk(tokenizer_path + "test_dataset")

In [None]:
# # import datasets

# dataset = datasets.concatenate_datasets([train_dataset, test_dataset])

# # reduce 50%
# dataset = dataset.select(range(int(len(dataset) * 0.5)))

# # 98% train, 2% test
# train_dataset = dataset.train_test_split(test_size=0.02)

# test_dataset = train_dataset.pop("test")
# train_dataset = train_dataset["train"]

In [None]:
# train_dataset.save_to_disk('/content/drive/MyDrive/Colab Notebooks/nlp_unicamp/final_project/tokenizer/datasets/split_50/train_dataset')
# test_dataset.save_to_disk('/content/drive/MyDrive/Colab Notebooks/nlp_unicamp/final_project/tokenizer/datasets/split_50/test_dataset')

In [None]:
# import perplexity
import torch.nn.functional as F

def compute_metrics(eval_pred):
    """Compute next-token cross-entropy and perplexity for a causal LM evaluation.

    Args:
        eval_pred: ``(logits, labels)`` pair. ``logits`` is expected to be shaped
            ``(..., seq_len, vocab_size)`` and ``labels`` ``(..., seq_len)``; each may
            be a ``torch.Tensor`` or anything ``torch.tensor`` accepts (e.g. a NumPy
            array). Label positions equal to ``-100`` are excluded from the loss.

    Returns:
        dict with two Python floats: ``"eval_loss"`` (mean cross-entropy over the
        scored positions) and ``"eval_perplexity"`` (``exp`` of that loss).
    """
    logits, labels = eval_pred

    # Predictions may arrive as a tuple of model outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    if not isinstance(logits, torch.Tensor):
        logits = torch.tensor(logits)

    if not isinstance(labels, torch.Tensor):
        labels = torch.tensor(labels)

    # Causal LM alignment: the logits at position t score the token at position t + 1.
    # Drop the last logit (nothing left to predict) and the first label (never
    # predicted). Scoring position t against token t instead yields a meaningless
    # perplexity that grows as the model improves.
    shift_logits = logits[..., :-1, :].float()
    shift_labels = labels[..., 1:].long()

    # reshape (not view): the slices above are not contiguous.
    loss = F.cross_entropy(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_labels.reshape(-1),
        ignore_index=-100,
    )
    perplexity = torch.exp(loss).item()  # Ensure perplexity is a scalar

    # NOTE(review): the Trainer appears to report its own loss under "eval_loss";
    # the key is kept here so the returned schema is unchanged.
    return {
        "eval_loss": loss.item(),
        "eval_perplexity": perplexity
    }

In [None]:
# # # DEBUG
# # # get 10% of train_dataset and 50% of test_dataset
# train_dataset = train_dataset.select(range(int(len(train_dataset) * 0.1)))
# test_dataset = test_dataset.select(range(int(len(test_dataset) * 0.5)))

In [None]:
# Wire the model, its training configuration, both data splits, the batch collator
# and the metric hook into one Trainer instance.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training as configured in `training_args` (one epoch; logging, evaluation and
# checkpointing every 5,000 steps).
trainer.train()

Step,Training Loss,Validation Loss,Perplexity
5000,6.596,5.833317,10995.932617
10000,5.657,5.251013,12436.422852
15000,5.2581,4.931976,14730.146484
20000,5.0135,4.709438,16989.417969
25000,4.8403,4.55865,19057.517578
30000,4.727,4.480994,20543.910156


TrainOutput(global_step=33707, training_loss=5.274952045383748, metrics={'train_runtime': 20956.8962, 'train_samples_per_second': 12.867, 'train_steps_per_second': 1.608, 'total_flos': 1.40915838025728e+17, 'train_loss': 5.274952045383748, 'epoch': 1.0})

In [None]:
# save trainer
# save trainer: write the trained model under `<model_save_dir>trainer/`.
# NOTE(review): the recorded output of this cell is a NameError for `trainer`, so it
# was apparently executed in a session where the Trainer cell had not been run.
trainer.save_model(model_save_dir + 'trainer/')

NameError: name 'trainer' is not defined

In [None]:
# Store the trained model and the tokenizer files side by side in `model_save_dir`
# so both can be reloaded from the same folder.
trainer.model.save_pretrained(model_save_dir)
tokenizer.save_pretrained(model_save_dir)

('/content/drive/MyDrive/final_project/models_self_attn/tokenizer_config.json',
 '/content/drive/MyDrive/final_project/models_self_attn/special_tokens_map.json',
 '/content/drive/MyDrive/final_project/models_self_attn/vocab.json',
 '/content/drive/MyDrive/final_project/models_self_attn/merges.txt',
 '/content/drive/MyDrive/final_project/models_self_attn/added_tokens.json')

In [None]:
import time

# Brief pause before shutting down — presumably to let pending Drive writes finish;
# TODO confirm that 5 seconds is sufficient.
time.sleep(5)

# Release the Colab runtime once training artefacts are saved.
# NOTE(review): the cells after this one need a live session with `model`,
# `tokenizer` and `device` defined, so they cannot run in the same pass — confirm intent.
from google.colab import runtime
runtime.unassign()

### Inference with the trained model

In [None]:

def generate_infini(model_infini, tokenizer, text="Este é um carro", tokens_gen=10):
    """Greedily extend ``text`` by up to ``tokens_gen`` tokens.

    Each step re-tokenizes the current text, runs the model, picks the arg-max token
    at the last position, appends it and decodes the ids back into text. Generation
    stops early when the model picks the same token twice in a row; the repeated
    token is not appended.

    Args:
        model_infini: model called as ``model(input_ids, attention_mask=...)`` whose
            first output holds logits indexed as ``[batch, position, vocab]``. It is
            switched to eval mode and left that way.
        tokenizer: callable returning an object with ``input_ids`` and
            ``attention_mask`` tensors, and providing ``decode``.
        text: prompt to continue.
        tokens_gen: maximum number of tokens to append.

    Returns:
        The prompt followed by the generated continuation, as decoded text.
    """
    model_infini.eval()

    # Send inputs to the device the model actually lives on, so the function does not
    # depend on a notebook-level `device` variable matching the model.
    first_param = next(model_infini.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    previous_token_id = None

    # Inference only: without no_grad every step would build an autograd graph.
    with torch.no_grad():
        for _ in range(tokens_gen):
            inputs = tokenizer(text, return_tensors="pt", truncation=True)
            input_ids = inputs.input_ids.to(target_device)
            attention_mask = inputs.attention_mask.to(target_device)

            outputs = model_infini(input_ids, attention_mask=attention_mask)

            # Greedy choice from the logits of the last position.
            next_token_logits = outputs[0][:, -1, :]
            next_token_id = torch.argmax(next_token_logits, dim=-1)

            # Crude loop guard: stop as soon as the same token is chosen twice running.
            if previous_token_id is not None and torch.equal(previous_token_id, next_token_id):
                break
            previous_token_id = next_token_id

            # Append the new token and round-trip through text for the next step.
            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1)], dim=-1)
            text = tokenizer.decode(input_ids[0], skip_special_tokens=True)

    return text



In [None]:
# Smoke-test greedy generation on a few short Portuguese prompts (two of them end
# mid-word), asking for at most 10 new tokens each.
for prompt in ("Meu nome é Pe", "Um carro pass", "Música"):
    print(generate_infini(model, tokenizer, text=prompt, tokens_gen=10))

In [None]:
# DEBUG GENERATE
# inputs = tokenizer("LAR",return_tensors="pt", truncation=True)
# input_ids = inputs.input_ids.to(device)
# attention_mask = inputs.attention_mask.to(device)

# model.generate(input_ids, max_new_tokens=10, attention_mask=attention_mask)

In [None]:
# Debug forward pass on a freshly initialised model to inspect the output structure.
# NOTE(review): this rebinds `model`, replacing the trained model in the notebook
# namespace — run it only after the trained weights have been saved.
# GPT2LMHeadModel is built by calling the class with a config; `from_config` is the
# constructor of the Auto* classes, not of the concrete model class.
model = GPT2LMHeadModel(config).to(device)

# Build the inputs here rather than relying on variables left over from another cell.
inputs = tokenizer("LAR", return_tensors="pt", truncation=True)
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)

# Inspection only, so no gradients are needed.
with torch.no_grad():
    y = model(input_ids, attention_mask=attention_mask)

In [None]:
# Each layer's cache entry is a (key, value) pair rather than a single tensor, so
# index the key tensor of layer 0 before asking for its shape.
y.past_key_values[0][0].shape