In [2]:
from itertools import chain
import warnings
import math

#from hf
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer

In [3]:
# loading raw data
dataset = load_dataset("bookcorpus",trust_remote_code=True)

# make splits
dataset = dataset['train'].train_test_split(test_size=0.0015) 

# load the gpt-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token=tokenizer.eos_token

# tokenize
def tokenize_function(example):
    return tokenizer(text=example["text"])
tokenized_ds = dataset.map(tokenize_function,batched=True,remove_columns='text')

# save to disk if required (use load_from_disk latter)
tokenized_ds.save_to_disk('bookcorpus/tokenized_ds')

# Make samples to a size of 1024
def concat(examples):    
    examples["input_ids"]=[list(chain.from_iterable(examples['input_ids']))] # convert chain to list of tokens
    examples["attention_mask"]=[list(chain.from_iterable(examples['attention_mask']))] # convert chain to list of tokens
    return examples
    
# takes a lot of time (worth saving it to disk)
concated_ds = tokenized_ds.map(concat,batched=True,batch_size=1000000,num_proc=8)

def chunk(examples):
    chunk_size = 1024 # modify this accordingly       
    input_ids = examples["input_ids"][0] # List[List], pass the inner list      
    attention_mask = examples["attention_mask"][0] # List[List]
    input_ids_truncated = []
    attention_mask_truncated = []
    
    #slice with step_size=chunk_size
    for i in range(0,len(input_ids),chunk_size):
        chunk = input_ids[i:i+chunk_size]
        if len(chunk)==chunk_size: # drop the last chunk if not equal
            input_ids_truncated.append(chunk)
            attention_mask_truncated.append(attention_mask[i:i+chunk_size])     
    examples['input_ids']=input_ids_truncated
    examples["attention_mask"]=attention_mask_truncated
        
    return examples   

chunked_ds = concated_ds.map(chunk,batched=True,batch_size=2,num_proc=2)
chunked_ds.save_to_disk('bookcorpus/chunked_ds') # will use this latter for diff experimentation

README.md:   0%|          | 0.00/18.5k [00:00<?, ?B/s]

bookcorpus.py:   0%|          | 0.00/3.25k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/74004228 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/73893221 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1195 > 1024). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/111007 [00:00<?, ? examples/s]

Saving the dataset (0/13 shards):   0%|          | 0/73893221 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/111007 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/73893221 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/111007 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/80 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/8 [00:00<?, ? examples/s]

Saving the dataset (0/6 shards):   0%|          | 0/572429 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/791 [00:00<?, ? examples/s]

In [4]:
data_collator = DataCollatorForLanguageModeling(tokenizer,mlm=False)

In [5]:
# load the model
configuration = GPT2Config()
model =GPT2LMHeadModel(configuration)

# training arguments
training_args = TrainingArguments( output_dir='gpt-2-warm-up/standard-gpt',
                                  evaluation_strategy="steps",
                                  eval_steps=500,                                  
                                  num_train_epochs=1,
                                  per_device_train_batch_size=8,
                                  per_device_eval_batch_size=8,
                                  learning_rate=2.5e-4,
                                  lr_scheduler_type='cosine',
                                  warmup_ratio=0.05,
                                  adam_beta1=0.9,
                                  adam_beta2=0.999,                                  
                                  weight_decay=0.01,                                  
                                  logging_strategy="steps",
                                  logging_steps = 500,
                                  save_steps=5000,
                                  save_total_limit=10,                                  
                                 ) 
trainer = Trainer(model=model,
                 args = training_args,
                 tokenizer=tokenizer,
                 train_dataset=chunked_ds["train"],
                 eval_dataset=chunked_ds["test"],
                 data_collator = data_collator)

  trainer = Trainer(model=model,


In [None]:
trainer.train()

In [6]:
model =GPT2LMHeadModel.from_pretrained('path/to/checkpoint-xxxx/') # modify the path
prompts = "I was telling her that"
inputs = tokenizer(prompts,return_tensors='pt').input_ids
outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=10, top_p=0.95)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

OSError: Incorrect path_or_model_id: 'path/to/checkpoint-xxxx/'. Please provide either the path to a local folder or the repo_id of a model on the Hub.