# Import data

In [None]:
import deeplake

# Hub locations of the OpenWebText training and validation splits.
train_path = 'hub://activeloop/openwebtext-train'
val_path = 'hub://activeloop/openwebtext-val'

ds = deeplake.load(train_path)
ds_val = deeplake.load(val_path)

# Sanity check: dataset summary first, then the text of training sample 0.
print(ds)
first_sample = ds[0]
print(first_sample.text.text())

# Tokenize

In [None]:
from transformers import AutoTokenizer

# Load the pretrained "gpt2" tokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so that the
# padding='max_length' tokenization used later has a pad token available.
# NOTE(review): presumably required because the GPT-2 tokenizer defines no
# pad token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# define transform to tokenize texts
def get_tokens_transform(tokenizer, max_length=512):
    """Build a per-sample transform that tokenizes text for causal-LM training.

    Args:
        tokenizer: Callable tokenizer. It is invoked with the sample text plus
            the ``truncation``, ``max_length``, ``padding``,
            ``return_attention_mask`` and ``return_tensors`` keyword arguments
            and must return a mapping holding ``"input_ids"`` and
            ``"attention_mask"`` tensors with a leading batch dimension of 1.
        max_length: Fixed sequence length every sample is truncated/padded to.
            Defaults to 512, the value that used to be hard-coded.

    Returns:
        A function ``tokens_transform(sample_in)`` that reads
        ``sample_in["text"]`` and returns a dict with:

        * ``"input_ids"``: 1-D tensor of token ids.
        * ``"attention_mask"``: 1-D tensor, 1 for real tokens, 0 for padding.
        * ``"labels"``: copy of ``input_ids`` with padded positions set to
          -100 so they are excluded from the language-modeling loss.
    """
    def tokens_transform(sample_in):
        encoded = tokenizer(
            sample_in["text"],
            truncation=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors="pt"
        )
        # The tokenizer returns a batch of one; keep only that single row.
        input_ids = encoded["input_ids"][0]
        attention_mask = encoded["attention_mask"][0]

        # Padding is identified through the attention mask rather than the
        # token id, because the pad token may be the same id as a real token
        # (this notebook sets pad_token = eos_token).
        # Clone so that masking the labels never alters input_ids.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }
    return tokens_transform

# create data loaders
def _as_torch_loader(dataset):
    """Chain the dataset's dataloader into batches of 32 tokenized samples.

    A fresh tokenizing transform is built for every loader, and the chain is
    finished with ``.pytorch()``.
    """
    return (
        dataset.dataloader()
        .batch(32)
        .transform(get_tokens_transform(tokenizer))
        .pytorch()
    )


# Training split first, then the validation split, each with its own loader.
ds_train_loader = _as_torch_loader(ds)
ds_eval_train_loader = _as_torch_loader(ds_val)

# Initialize Model

In [None]:
from transformers import AutoConfig

# Fetch the stock "gpt2" configuration; the models in this notebook are
# instantiated from this object.
config = AutoConfig.from_pretrained("gpt2")
# Show the default hyperparameters before any of them are overridden.
print(config)

In [None]:
""" 
    GPT2Config {
        "_name_or_path": "gpt2",
        "activation_function": "gelu_new",
        "architectures": [
        "GPT2LMHeadModel"
        ],
        "attn_pdrop": 0.1,
        "bos_token_id": 50256,
        "embd_pdrop": 0.1,
        "eos_token_id": 50256,
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "model_type": "gpt2",
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_inner": null,
        "n_layer": 12,
        "n_positions": 1024,
        "reorder_and_upcast_attn": false,
        "resid_pdrop": 0.1,
        "scale_attn_by_inverse_layer_idx": false,
        "scale_attn_weights": true,
        "summary_activation": null,
        "summary_first_dropout": 0.1,
        "summary_proj_to_labels": true,
        "summary_type": "cls_index",
        "summary_use_proj": true,
        "task_specific_params": {
        "text-generation": {
        "do_sample": true,
        "max_length": 50
        }
        },
        "transformers_version": "4.30.2",
        "use_cache": true,
        "vocab_size": 50257
    }
"""

In [None]:
from transformers import GPT2LMHeadModel

# Instantiate GPT-2 from the configuration alone (no pretrained checkpoint is
# loaded in this cell).
model = GPT2LMHeadModel(config)

# Count the elements of every parameter tensor to get the model size.
model_size = 0
for weights in model.parameters():
    model_size += weights.numel()

print(f"GPT-2 size: {model_size/1e6:.1f}M parameters")

GPT-2 size: 124.4M parameters

In [None]:
# Hyperparameter overrides that scale the stock GPT-2 configuration up.
# The 512-position context equals the max_length used when tokenizing.
# NOTE(review): `config` is modified in place and `model` above was built from
# this same object — confirm that `model` is not expected to keep the original
# values in its own config.
gpt2_1b_overrides = {
    "n_layer": 32,
    "n_embd": 1600,
    "n_positions": 512,
    "n_ctx": 512,
    "n_head": 32,
}
for field_name, field_value in gpt2_1b_overrides.items():
    setattr(config, field_name, field_value)

# Updated Hyperparameters

In [None]:
# Build the enlarged model from the overridden configuration.
model_1b = GPT2LMHeadModel(config)

# Per-tensor element counts, summed to obtain the total parameter count.
parameter_counts = [weights.numel() for weights in model_1b.parameters()]
model_size = sum(parameter_counts)
print(f"GPT2-1B size: {model_size/1e6:.1f}M parameters")

GPT2-1B size: 1065.8M parameters

# Training Loop

In [None]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    # Output location, run label and experiment tracking.
    output_dir="GPT2-scratch-openwebtext",
    run_name="GPT2-scratch-openwebtext",
    report_to="wandb",
    logging_steps=1,
    # Evaluation and checkpointing share the same 500-step cadence.
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    # Run length and batching.
    # NOTE(review): the loaders handed to the trainer are already built with
    # .batch(32), so these per-device batch sizes presumably do not set the
    # effective batch size — confirm.
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    # Optimizer and learning-rate schedule.
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    weight_decay=0.1,
    # Numeric precision and distributed-training options.
    bf16=True,
    ddp_find_unused_parameters=False,
)

In [None]:
from transformers import Trainer

class TrainerWithDataLoaders(Trainer):
    """Trainer that serves caller-supplied dataloaders.

    Args:
        *args: Positional arguments forwarded unchanged to ``Trainer``.
        train_dataloader: Loader returned by ``get_train_dataloader``. When
            ``None``, the base ``Trainer`` implementation is used instead.
        eval_dataloader: Loader returned by ``get_eval_dataloader``. When
            ``None``, the base ``Trainer`` implementation is used instead.
        **kwargs: Keyword arguments forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, train_dataloader=None, eval_dataloader=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.train_dataloader = train_dataloader
        self.eval_dataloader = eval_dataloader

    def get_train_dataloader(self):
        """Return the injected training loader, or defer to ``Trainer``."""
        if self.train_dataloader is None:
            # No custom loader was supplied: let the base class build one (or
            # raise its own error) instead of silently returning None.
            return super().get_train_dataloader()
        return self.train_dataloader

    def get_eval_dataloader(self, dummy=None):
        """Return the injected evaluation loader, or defer to ``Trainer``.

        Args:
            dummy: Takes the place of the optional evaluation dataset argument
                of the base method. It is ignored when a custom loader was
                supplied and forwarded to the base method otherwise. It
                defaults to ``None`` so the method can also be called without
                arguments.
        """
        if self.eval_dataloader is None:
            return super().get_eval_dataloader(dummy)
        return self.eval_dataloader

# Train

In [None]:
# Train the enlarged model (`model_1b`), not the stock-size `model`.
# `model` was instantiated before `config` was resized in place, so training
# and saving it would pair small weights with the enlarged hyperparameters.
trainer = TrainerWithDataLoaders(
    model=model_1b,
    args=args,
    train_dataloader=ds_train_loader,
    eval_dataloader=ds_eval_train_loader,
)

trainer.train()

# Write the final weights and config to args.output_dir itself. The inference
# cell loads the model from "./GPT2-scratch-openwebtext", so the files must
# exist at that path and not only inside intermediate checkpoints.
trainer.save_model()

# Inference

In [None]:
from transformers import pipeline

# Text-generation pipeline over the model stored in the training output
# directory, reusing the notebook's tokenizer and running on GPU "cuda:0".
pipe = pipeline(
    task="text-generation",
    model="./GPT2-scratch-openwebtext",
    tokenizer=tokenizer,
    device="cuda:0",
)

In [None]:
# Prompt that the text-generation pipeline is asked to continue.
txt = "The house prices dropped down"

# Request a single generated sequence for the prompt and print the raw result.
completion = pipe(txt, num_return_sequences=1)
print(completion)