In [2]:
from itertools import islice, chain
import warnings
import math

#from hf
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Model
from transformers import TrainingArguments, Trainer
from transformers import get_scheduler
import datasets
from datasets import Dataset
import accelerator

# torch
import torch
import torch.optim as optim
import torch.nn as nn

ModuleNotFoundError: No module named 'accelerator'

In [2]:
### Hyper Params
learning_rate = 2.5e-4
warmup_ratio = 0.5
num_rows = 1000
context_len = 1024
epochs = 1

In [3]:
# loading raw data
dataset_stream = load_dataset("HuggingFaceFW/fineweb", split="train", streaming=True)

dataset_raw = Dataset.from_list(list(islice(dataset_stream, num_rows)))

Resolving data files:   0%|          | 0/23781 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/23781 [00:00<?, ?it/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token=tokenizer.eos_token

def tokenize(element):
    outputs = tokenizer(
        element['text'],
        truncation=True,
        max_length=context_len,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs['length'], outputs['input_ids']):
        if length == context_len:
            input_batch.append(input_ids)

    # print(len(input_batch))
    return {"input_ids": torch.tensor(input_batch)}

tokenized_ds = dataset_raw.map(tokenize, remove_columns=dataset_raw.column_names)

In [5]:
# load the gpt-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token=tokenizer.eos_token

# tokenize
def tokenize_function(example):
    return tokenizer(text=example["text"])
    
# tokenized_ds = dataset.map(tokenize_function, batched=True, remove_columns='text')
#tokenized_ds = dataset_raw.map(tokenize_function, batched=True, remove_columns=['text', 'id', 'dump', 'url', 'date', 'file_path', 'language', 'language_score', 'token_count'], num_proc=8)

# save to disk if required (use load_from_disk latter)
# tokenized_ds.save_to_disk('bookcorpus/tokenized_ds')

In [37]:
def chunk_and_pad(examples):
    chunk_size = 1024  # Replace with model's max input length
    input_ids = examples["input_ids"]

    # Create chunks
    chunks = [input_ids[i:i + chunk_size] for i in range(0, len(input_ids), chunk_size)]

    # Drop the last chunk if it's smaller than `chunk_size`
    chunks = [chunk for chunk in chunks if len(chunk) == chunk_size]

    return {"input_ids": chunks}

# Apply chunking
chunked_ds = tokenized_ds.map(chunk_and_pad)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [32]:
def chunk(examples):
    chunk_size = 1024 # modify this accordingly       
    input_ids = examples["input_ids"] # List[List], pass the inner list      
    attention_mask = examples["attention_mask"] # List[List]
    input_ids_truncated = []
    attention_mask_truncated = []
    
    #slice with step_size=chunk_size
    for i in range(0,len(input_ids),chunk_size):
        chunk = input_ids[i:i+chunk_size]
        if len(chunk)==chunk_size: # drop the last chunk if not equal
            input_ids_truncated.append(chunk)
            attention_mask_truncated.append(attention_mask[i:i+chunk_size])     
    examples['input_ids']=input_ids_truncated
    examples["attention_mask"]=attention_mask_truncated
    
    return examples   

chunked_ds = tokenized_ds.map(chunk)
#,batched=True,batch_size=2,num_proc=2)
# chunked_ds.save_to_disk('bookcorpus/chunked_ds') # will use this latter for diff experimentation

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [39]:
# Convert to PyTorch tensors
def to_torch_format(examples):
    return {"input_ids": torch.tensor(examples["input_ids"], dtype=torch.long)}

tensor_ds = tokenized_ds.with_transform(to_torch_format)

In [54]:
from torch.utils.data import DataLoader

dataloader = DataLoader(tensor_ds)

In [None]:
tokenized_ds['input_ids'][0]

In [5]:
# save to disk if required (use load_from_disk latter)
# tokenized_ds.save_to_disk('bookcorpus/tokenized_ds')

# Make samples to a size of 1024
# def concat(examples):    
#     examples["input_ids"]=[list(chain.from_iterable(examples['input_ids']))] # convert chain to list of tokens
#     examples["attention_mask"]=[list(chain.from_iterable(examples['attention_mask']))] # convert chain to list of tokens
#     return examples
    
# # takes a lot of time (worth saving it to disk)
# concated_ds = tokenized_ds.map(concat,num_proc=8)

def chunk(examples):
    chunk_size = 1024 # modify this accordingly       
    input_ids = examples["input_ids"][0] # List[List], pass the inner list      
    attention_mask = examples["attention_mask"][0] # List[List]
    input_ids_truncated = []
    attention_mask_truncated = []
    
    #slice with step_size=chunk_size
    for i in range(0,len(input_ids),chunk_size):
        chunk = input_ids[i:i+chunk_size]
        if len(chunk)==chunk_size: # drop the last chunk if not equal
            input_ids_truncated.append(chunk)
            attention_mask_truncated.append(attention_mask[i:i+chunk_size])     
    examples['input_ids']=input_ids_truncated
    examples["attention_mask"]=attention_mask_truncated
        
    return examples   

chunked_ds = tokenized_ds.map(chunk,batched=True,batch_size=2,num_proc=2)
# chunked_ds.save_to_disk('bookcorpus/chunked_ds') # will use this lat

Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [6]:
data_collator = DataCollatorForLanguageModeling(tokenizer,mlm=False)

In [7]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [8]:
from torch.utils.data import DataLoader

data_loader = DataLoader(
    chunked_ds, 
    batch_size=2,
    collate_fn=data_collator,
)

In [9]:
# load the model
configuration = GPT2Config(
    n_head=6,
    n_embd=450,
    n_layer=8,
)
model = GPT2LMHeadModel(configuration)
model.to(device)
print(f"{model.num_parameters():,}")

# # training arguments
# training_args = TrainingArguments( output_dir='gpt-2-warm-up/standard-gpt',
#                                   # evaluation_strategy="steps",
#                                   # eval_steps=500,                                  
#                                   num_train_epochs=1,
#                                   per_device_train_batch_size=4,
#                                   # per_device_eval_batch_size=8,
#                                   learning_rate=2.5e-4,
#                                   lr_scheduler_type='cosine',
#                                   warmup_ratio=0.05,
#                                   adam_beta1=0.9,
#                                   adam_beta2=0.999,                                  
#                                   weight_decay=0.01,                                  
#                                   logging_strategy="steps",
#                                   logging_steps = 50,
#                                   save_steps=10,
#                                   save_total_limit=10,                                  
#                                  ) 
# trainer = Trainer(model=model,
#                  args = training_args,
#                  tokenizer=tokenizer,
#                  train_dataset=chunked_ds,
#                  # eval_dataset=chunked_ds,
#                  data_collator = data_collator)

42,564,150


In [10]:
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
cross_entropy = nn.CrossEntropyLoss()

scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer, 
    num_warmup_steps=5,
    num_training_steps=100
)

In [None]:
model.train()

for i in range(epochs):
    for step, batch in enumerate(data_loader):
        batch = {key: value.to(device) for key, value in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward(loss)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

#     outputs = model(**batch)
#     loss = outputs.loss
#     # acc.backward(loss)
#     # optimizer.step()
#     # scheduler.step()
#     # optimizer.zero_grad()

#     if i > 10:
#         break
    
    # if acc.is_main_process:
    # perplexity = torch.exp(loss)
    # wandb.log({"loss": loss.item(), "learning_rate": optimizer.param_groups[0]['lr'], "perplexity": perplexity})
    
    # global_step += 1

In [13]:
model = GPT2LMHeadModel.from_pretrained('Granther/gpt2-pretrain-bookcorp-40m') # modify the path
prompts = "Cock and ball torture"
inputs = tokenizer(prompts,return_tensors='pt').input_ids
outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=10, top_p=0.85)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


["Cock and ball tortureracks , and a small round table in the center .a couple of chairs sat down on the opposite side of the table .`` i 've never seen anyone before , '' the guy said .`` i 've seen the place . ''the bartender said , `` i think they 're here . ''`` i 'll take that to them . ''he turned and started to move toward the door , `` but if you 'll be able to get them out of the house , they '"]

In [None]:
model.push_to_hub("Granther/gpt-2-pretrained-26m")
tokenizer.push_to_hub("Granther/gpt-2-pretrained-26m")