In [1]:
from itertools import chain
import warnings
import math

#from hf
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Model
from transformers import TrainingArguments, Trainer

In [30]:
# loading raw data
dataset = load_dataset("raddwolf/BookCorpus74M",trust_remote_code=True)

# make splits
dataset = dataset['train'].select(range(200000))

dataset.train_test_split(test_size=0.0015) 

# load the gpt-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token=tokenizer.eos_token

# tokenize
def tokenize_function(example):
    return tokenizer(text=example["text"])
tokenized_ds = dataset.map(tokenize_function,batched=True,remove_columns='text')

# save to disk if required (use load_from_disk latter)
tokenized_ds.save_to_disk('bookcorpus/tokenized_ds')

# Make samples to a size of 1024
def concat(examples):    
    examples["input_ids"]=[list(chain.from_iterable(examples['input_ids']))] # convert chain to list of tokens
    examples["attention_mask"]=[list(chain.from_iterable(examples['attention_mask']))] # convert chain to list of tokens
    return examples
    
# takes a lot of time (worth saving it to disk)
concated_ds = tokenized_ds.map(concat,batched=True,batch_size=1000000,num_proc=8)

def chunk(examples):
    chunk_size = 1024 # modify this accordingly       
    input_ids = examples["input_ids"][0] # List[List], pass the inner list      
    attention_mask = examples["attention_mask"][0] # List[List]
    input_ids_truncated = []
    attention_mask_truncated = []
    
    #slice with step_size=chunk_size
    for i in range(0,len(input_ids),chunk_size):
        chunk = input_ids[i:i+chunk_size]
        if len(chunk)==chunk_size: # drop the last chunk if not equal
            input_ids_truncated.append(chunk)
            attention_mask_truncated.append(attention_mask[i:i+chunk_size])     
    examples['input_ids']=input_ids_truncated
    examples["attention_mask"]=attention_mask_truncated
        
    return examples   

chunked_ds = concated_ds.map(chunk,batched=True,batch_size=2,num_proc=2)
chunked_ds.save_to_disk('bookcorpus/chunked_ds') # will use this latter for diff experimentation

chunked_ds

Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/200000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/200000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/8 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1491 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1491
})

In [31]:
data_collator = DataCollatorForLanguageModeling(tokenizer,mlm=False)

In [32]:
# load the model
configuration = GPT2Config(
    n_head=6,
    n_embd=384,
    n_layer=4,
)
model = GPT2LMHeadModel(configuration)
# model = GPT2Model(configuration)
print(f"{model.num_parameters():,}")

# training arguments
training_args = TrainingArguments( output_dir='gpt-2-warm-up/standard-gpt',
                                  # evaluation_strategy="steps",
                                  # eval_steps=500,                                  
                                  num_train_epochs=1,
                                  per_device_train_batch_size=8,
                                  # per_device_eval_batch_size=8,
                                  learning_rate=2.5e-4,
                                  lr_scheduler_type='cosine',
                                  warmup_ratio=0.05,
                                  adam_beta1=0.9,
                                  adam_beta2=0.999,                                  
                                  weight_decay=0.01,                                  
                                  logging_strategy="steps",
                                  logging_steps = 5,
                                  save_steps=20,
                                  save_total_limit=10,                                  
                                 ) 
trainer = Trainer(model=model,
                 args = training_args,
                 tokenizer=tokenizer,
                 train_dataset=chunked_ds,
                 # eval_dataset=chunked_ds,
                 data_collator = data_collator)



26,790,528


  trainer = Trainer(model=model,


In [33]:
trainer.train()

Step,Training Loss
5,10.6945
10,9.9748
15,9.4301
20,8.7373
25,8.1086
30,7.5028
35,7.0462
40,6.7162
45,6.3998
50,6.1905


TrainOutput(global_step=187, training_loss=6.217721276104769, metrics={'train_runtime': 2087.6931, 'train_samples_per_second': 0.714, 'train_steps_per_second': 0.09, 'total_flos': 65028393271296.0, 'train_loss': 6.217721276104769, 'epoch': 1.0})

In [35]:
model = GPT2LMHeadModel.from_pretrained('gpt-2-warm-up/standard-gpt/checkpoint-98') # modify the path
prompts = "I was telling her that"
inputs = tokenizer(prompts,return_tensors='pt').input_ids
outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=10, top_p=0.95)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


["I was telling her that she was he her her her , .she . ''`` her , but she , and the the he 'd to her her and the the she 'd she did n't and her and she 's she 's .she was her . ''he .`` i 's , .`` i 'd .i was the , she was and her .he was the that she was she was n't was n't the his .she , he was .`` you .she was to"]

In [39]:
model.push_to_hub("Granther/gpt-2-pretrained-26m")
tokenizer.push_to_hub("Granther/gpt-2-pretrained-26m")

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/Granther/gpt-2-pretrained-26m/commit/db4a38bd2a789db4ba32fbecac815b788d1c7f19', commit_message='Upload tokenizer', commit_description='', oid='db4a38bd2a789db4ba32fbecac815b788d1c7f19', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Granther/gpt-2-pretrained-26m', endpoint='https://huggingface.co', repo_type='model', repo_id='Granther/gpt-2-pretrained-26m'), pr_revision=None, pr_num=None)