In [2]:
from itertools import chain
import warnings
import math

#from hf
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Model
from transformers import TrainingArguments, Trainer

In [4]:
# loading raw data
dataset = load_dataset("raddwolf/BookCorpus74M",trust_remote_code=True)

# make splits
dataset = dataset['train'].select(range(20000))

dataset.train_test_split(test_size=0.0015) 

# load the gpt-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token=tokenizer.eos_token

# tokenize
def tokenize_function(example):
    return tokenizer(text=example["text"])
tokenized_ds = dataset.map(tokenize_function,batched=True,remove_columns='text')

# save to disk if required (use load_from_disk latter)
tokenized_ds.save_to_disk('bookcorpus/tokenized_ds')

# Make samples to a size of 1024
def concat(examples):    
    examples["input_ids"]=[list(chain.from_iterable(examples['input_ids']))] # convert chain to list of tokens
    examples["attention_mask"]=[list(chain.from_iterable(examples['attention_mask']))] # convert chain to list of tokens
    return examples
    
# takes a lot of time (worth saving it to disk)
concated_ds = tokenized_ds.map(concat,batched=True,batch_size=1000000,num_proc=8)

def chunk(examples):
    chunk_size = 1024 # modify this accordingly       
    input_ids = examples["input_ids"][0] # List[List], pass the inner list      
    attention_mask = examples["attention_mask"][0] # List[List]
    input_ids_truncated = []
    attention_mask_truncated = []
    
    #slice with step_size=chunk_size
    for i in range(0,len(input_ids),chunk_size):
        chunk = input_ids[i:i+chunk_size]
        if len(chunk)==chunk_size: # drop the last chunk if not equal
            input_ids_truncated.append(chunk)
            attention_mask_truncated.append(attention_mask[i:i+chunk_size])     
    examples['input_ids']=input_ids_truncated
    examples["attention_mask"]=attention_mask_truncated
        
    return examples   

chunked_ds = concated_ds.map(chunk,batched=True,batch_size=2,num_proc=2)
chunked_ds.save_to_disk('bookcorpus/chunked_ds') # will use this latter for diff experimentation

Saving the dataset (0/1 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/8 [00:00<?, ? examples/s]

AttributeError: 'list' object has no attribute 'to'

In [3]:
data_collator = DataCollatorForLanguageModeling(tokenizer,mlm=False)

In [5]:
# load the model
configuration = GPT2Config(
    n_head=6,
    n_embd=450,
    n_layer=8,
)
model = GPT2LMHeadModel(configuration)
model.to("cuda")
# model = GPT2Model(configuration)
print(f"{model.num_parameters():,}")

# training arguments
training_args = TrainingArguments( output_dir='gpt-2-warm-up/standard-gpt',
                                  # evaluation_strategy="steps",
                                  # eval_steps=500,                                  
                                  num_train_epochs=1,
                                  per_device_train_batch_size=4,
                                  # per_device_eval_batch_size=8,
                                  learning_rate=2.5e-4,
                                  lr_scheduler_type='cosine',
                                  warmup_ratio=0.05,
                                  adam_beta1=0.9,
                                  adam_beta2=0.999,                                  
                                  weight_decay=0.01,                                  
                                  logging_strategy="steps",
                                  logging_steps = 50,
                                  save_steps=10,
                                  save_total_limit=10,                                  
                                 ) 
trainer = Trainer(model=model,
                 args = training_args,
                 tokenizer=tokenizer,
                 train_dataset=chunked_ds,
                 # eval_dataset=chunked_ds,
                 data_collator = data_collator)

42,564,150


  trainer = Trainer(model=model,


In [None]:
trainer.train(resume_from_checkpoint=True)

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss


In [9]:
model = GPT2LMHeadModel.from_pretrained('gpt-2-warm-up/standard-gpt/checkpoint-3613') # modify the path
prompts = "I ate the fish"
inputs = tokenizer(prompts,return_tensors='pt').input_ids
outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=10, top_p=0.95)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


["I ate the fish in my mother and i wanted to do .and he 'd done to be able to make sure if he 's , he 'd come up and she 'd said , he did n't believe her .she did n't want to get back for her , but she 'd come in a second time , but she was in love of her mind , but she had to the time she could n't get to go home with him , her mother .he knew he did n't think"]

In [10]:
model.push_to_hub("Granther/gpt-2-pretrained-26m")
tokenizer.push_to_hub("Granther/gpt-2-pretrained-26m")

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-675e8cfc-2fcccaa525272a511670b19c;4c2aacca-56f6-4fee-aaf7-f2bf107cb2c9)

Invalid username or password.