# Training a causal language model from scratch (PyTorch)

In [1]:
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q datasets accelerate evaluate bitsandbytes sentencepiece loralib peft --upgrade
# install additional dependencies needed for training
!pip install -q tensorboard py7zr
!apt install git-lfs

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.


In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Dataset

```codeparrot ```
dataset contains GitHub dump of about 180 GB containing roughly 20 million Python files.Because of the dataset’s size, we want to avoid downloading it; instead, we’ll use the streaming feature to filter it on the fly. Here we train subset of the dataset concerned with the Python data science stack.

In [3]:
def any_keyword_in_string(string, keywords):
    for keyword in keywords:
        if keyword in string:
            return True
    return False

In [4]:
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]
example_1 = "import numpy as np"
example_2 = "import pandas as pd"

print(
    any_keyword_in_string(example_1, filters), any_keyword_in_string(example_2, filters)
)

False True


We can use this to create a function that will stream the dataset and filter the elements we want:

In [5]:
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset


def filter_streaming_dataset(dataset, filters):
    filtered_dict = defaultdict(list)
    total = 0
    for sample in tqdm(iter(dataset)):
        total += 1
        if any_keyword_in_string(sample["content"], filters):
            for k, v in sample.items():
                filtered_dict[k].append(v)
    print(f"{len(filtered_dict['content'])/total:.2%} of data after filtering.")
    return Dataset.from_dict(filtered_dict)

In [6]:
# This cell will take a very long time to execute, so you should skip it and go to
# the next one!
from datasets import load_dataset

split = "train"  # "valid"
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]

# data = load_dataset(f"transformersbook/codeparrot-{split}", split=split, streaming=True)
# filtered_data = filter_streaming_dataset(data, filters)

Even 3% of the datset is quite large ~ 6 GB, to avoid this we can use ready made dataset as below

In [3]:
from datasets import load_dataset, DatasetDict

ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train").shuffle().select(range(500))
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation").shuffle().select(range(10))

raw_datasets = DatasetDict(
    {
        "train": ds_train,  # .shuffle().select(range(50000)),
        "valid": ds_valid,  # .shuffle().select(range(500))
    }
)

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 500
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 10
    })
})

an example from the dataset.

In [8]:
for key in raw_datasets["train"][0]:
    print(f"{key.upper()}: {raw_datasets['train'][0][key][:400]}")

REPO_NAME: albertaparicio/tfg-voice-conversion
PATH: seq2seq_plot_curves.py
COPIES: 1
SIZE: 5888
CONTENT: # Created by Albert Aparicio on 6/12/16
# coding: utf-8

# This script takes the results of a training and plots its loss curves

import h5py
import matplotlib.pyplot as plt
import numpy as np

model_description = 'seq2seq_pretrain'

with h5py.File('training_results/' + model_description + '_training_params.h5',
               'r') as f:
    params_loss = f.attrs.get('params_loss').decode('utf-8')
LICENSE: gpl-3.0


We can see that the content field contains the code that we want our model to train on.Now that we have a dataset, we need to prepare the texts so they’re in a format suitable for pretraining. Let’s fix the context size at 128 tokens, as opposed to the 1,024 or 2,048 used in GPT-2 or GPT-3, respectively as we want to autocomplete short function calls.

In [4]:
from transformers import AutoTokenizer

context_length = 128
tokenizer = AutoTokenizer.from_pretrained("gpt2")

outputs = tokenizer(
    raw_datasets["train"][:2]["content"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)
print(f"Tokenizer cntains: {outputs.keys()}")
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Tokenizer cntains: dict_keys(['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'])
Input IDs length: 72
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 126, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 53]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Most documents contain many more than 128 tokens, so simply truncating the inputs to the maximum length would eliminate a large fraction of our dataset. Instead, we’ll use the `return_overflowing_tokens` option to tokenize the whole input and split it into several chunks. We’ll also use the `return_length` option to return the length of each created chunk automatically. Often the last chunk will be smaller than the context size.

In [5]:
def tokenize(element):
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 20265
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 288
    })
})

# Initalize Model

In [12]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    load_in_8bit=True, device_map="auto"
)

In [35]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.4M parameters


# Freezing the original weights

In [13]:
import torch
import torch.nn as nn

for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

In [36]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

# Setting up the LoRa Adapters

In [37]:
# from peft import LoraConfig, get_peft_model,prepare_model_for_int8_training, TaskType

# config = LoraConfig(
#     r=4, #attention heads, reduce this to reduce trainable parameters
#     lora_alpha=32, #alpha scaling
#     # target_modules=["q_proj", "v_proj"], # for LLaMa style models
#     # target_modules=["q", "v"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM" # set this for CAUSAL_LM or SEQ_2_SEQ_LM
# )

# # prepare int-8 model for training - will consume CUDA
# # model = prepare_model_for_int8_training(model)
# # add LoRA adaptor
# model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 124439808 || all params: 124439808 || trainable%: 100.0


In [16]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

an example

In [17]:
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


In [21]:
REPO_ID = "codeparrot-gpt-2"
USER_ID = "pritam3355"

In [39]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir=REPO_ID,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    logging_dir=f"{REPO_ID}/logs",
    logging_strategy="steps",
    save_strategy="steps",
    report_to="tensorboard",

    save_steps=100,
    fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [40]:
trainer.train()

Step,Training Loss,Validation Loss
100,4.9728,4.000661
200,3.4928,3.530043
300,3.019,3.287601


TrainOutput(global_step=335, training_loss=3.729230328460238, metrics={'train_runtime': 395.0041, 'train_samples_per_second': 54.382, 'train_steps_per_second': 0.848, 'total_flos': 1400525291520000.0, 'train_loss': 3.729230328460238, 'epoch': 1.0})

In [41]:
trainer.push_to_hub()

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

'https://huggingface.co/pritam3355/codeparrot-gpt-2/tree/main/'

# Code generation with a pipeline

In [42]:
import torch
from transformers import pipeline

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pipe = pipeline(
    "text-generation", model=f"{USER_ID}/{REPO_ID}", device=device
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/912 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [44]:
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
print_sample_weight = model.astbors(


Test - 1

In [45]:
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
# Compate of the number of the time of the


Test - 2

In [46]:
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
# compute that have given to test is None.




Test - 3

In [47]:
txt = """\
# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
x1 = xmax =


Test - 4

In [48]:
txt = """
# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
classifier
scans: numpy


# Training with Accelerate



We can easily identify these examples through the use of keywords such as plt, pd, sk, fit, and predict, which are the most frequent import names for matplotlib.pyplot, pandas, and sklearn as well as the fit/predict pattern of the latter. If these are each represented as a single token, we can easily check if they occur in the input sequence. Tokens can have a whitespace prefix, so we’ll also check for those versions in the tokenizer vocabulary. To verify that it works, we’ll add one test token which should be split into multiple tokens:

In [6]:
keytoken_ids = []
for keyword in [
    "plt",
    "pd",
    "sk",
    "fit",
    "predict",
    " plt",
    " pd",
    " sk",
    " fit",
    " predict",
    "testtest",
]:
    ids = tokenizer([keyword]).input_ids[0]
    if len(ids) == 1:
        keytoken_ids.append(ids[0])
    else:
        print(f"Keyword has not single token: {keyword}")

Keyword has not single token: plt
Keyword has not single token: predict
Keyword has not single token:  plt
Keyword has not single token:  pd
Keyword has not single token: testtest


First we need to align the logits and inputs: the input sequence shifted by one to the right forms the labels, since the next token is the label for the current token. We can achieve this by starting the labels from the second token of the input sequence, since the model does not make a prediction for the first token anyway. Then we cut off the last logit, as we don’t have a label for the token that follows the full input sequence. With that we can compute the loss per sample and count the occurrences of all keywords in each sample. Finally, we calculate the weighted average over all samples using the occurrences as weights. Since we don’t want to throw away all the samples that have no keywords, we add 1 to the weights:

In [7]:
from torch.nn import CrossEntropyLoss
import torch


def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Calculate per-token loss
    loss_fct = CrossEntropyLoss(reduce=False)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
    # Calculate and scale weighting
    weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(
        axis=[0, 2]
    )
    weights = alpha * (1.0 + weights)
    # Calculate weighted average
    weighted_loss = (loss_per_sample * weights).mean()
    return weighted_loss

In [8]:
from torch.utils.data.dataloader import DataLoader

tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=16, shuffle=True)
eval_dataloader = DataLoader(tokenized_datasets["valid"], batch_size=16)

we group the parameters so that the optimizer knows which ones will get an additional weight decay. Usually, all bias and LayerNorm weights terms are exempt from this

In [9]:
weight_decay = 0.1


def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

we want to evaluate the model regularly on the validation set during training

In [32]:
def evaluate():
    model.eval()
    losses = []
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(batch["input_ids"])
            try:
              loss = keytoken_weighted_loss(batch["input_ids"], outputs.logits, keytoken_ids)
              losses.append(accelerator.gather(loss.clone().detach()))
            except:
              continue
    if len(losses):
      loss = torch.mean(torch.cat(losses))
      perplexity = torch.exp(loss)
    else:
      loss = float('inf')
      perplexity = float('inf')

    return loss, perplexity


redefine our model to make sure we train from scratch again

In [13]:
model = GPT2LMHeadModel(config)

In [14]:
from torch.optim import AdamW

optimizer = AdamW(get_grouped_params(model), lr=5e-4)

In [15]:
import torch

# Set the max_split_size_mb option
torch.backends.cuda.split_kernel_solve = True
torch.backends.cuda.max_split_size_mb = 16  # You can adjust this value based on your needs

In [16]:
from accelerate import Accelerator

accelerator = Accelerator(torch.float16)

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [17]:
from transformers import get_scheduler

num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=100,
    num_training_steps=num_training_steps,
)

In [20]:
ACC_REPO_ID = "codeparrot-gpt2-lora-accelerate"

In [22]:
from huggingface_hub import Repository

repo = Repository(ACC_REPO_ID, clone_from=f"{USER_ID}/{ACC_REPO_ID}")

/content/codeparrot-gpt2-lora-accelerate is already a clone of https://huggingface.co/pritam3355/codeparrot-gpt2-lora-accelerate. Make sure you pull the latest changes with `repo.git_pull()`.


In the training loop we iterate over the dataloader and pass the batches to the model. With the logits, we can then evaluate our custom loss function. We scale the loss by the number of gradient accumulation steps so as not to create larger losses when aggregating more steps. Before we optimize, we also clip the gradients for better convergence. Finally, every few steps we evaluate the model on the evaluation set with our new evaluate() function

In [33]:
from tqdm.notebook import tqdm

gradient_accumulation_steps = 8
eval_steps = 100

model.train()
completed_steps = 0
for epoch in range(num_train_epochs):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=num_training_steps
    ):
        logits = model(batch["input_ids"]).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
        if step % 100 == 0:
            accelerator.print(
                {
                    # "lr": get_lr(),
                    # "samples": step * samples_per_step,
                    "steps": completed_steps,
                    "loss/train": round(loss.item() * gradient_accumulation_steps,2)
                }
            )
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(ACC_REPO_ID, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(ACC_REPO_ID)
                repo.push_to_hub(
                    commit_message=f"Training in progress step {step}", blocking=False
                )

  0%|          | 0/1267 [00:00<?, ?it/s]



{'steps': 12, 'loss/train': 17.92}
{'steps': 24, 'loss/train': 16.85}
{'steps': 37, 'loss/train': 14.99}
{'steps': 49, 'loss/train': 17.76}
{'steps': 62, 'loss/train': 18.51}
{'steps': 74, 'loss/train': 16.77}
{'steps': 87, 'loss/train': 16.54}
{'steps': 99, 'loss/train': 14.96}


RuntimeError: ignored

# Inference (Accelerate)

In [None]:
import torch
from transformers import pipeline

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pipe = pipeline(
    "text-generation", model=f"{USER_ID}/{ACC_REPO_ID}", device=device
)

In [None]:
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])