## **Dataset**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install --upgrade datasets


Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [3]:
from datasets import load_dataset

ds = load_dataset("datablations/c4-filter-small", split="train")
ds = ds.select_columns(["text"])
ds = ds.train_test_split(test_size=0.1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/791 [00:00<?, ?B/s]

(…)-00000-of-00001-091e566583af27e4.parquet:   0%|          | 0.00/141M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 90000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 10000
    })
})

## **Tokenizer**

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.normalizers import NFKC
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

# Initialize BPE token
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.normalizer = NFKC()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(
    vocab_size=20000,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
)

tokenizer.train_from_iterator(ds["train"]["text"], trainer)
tokenizer.save("/content/drive/MyDrive/Colab Notebooks/Train_gpt2_from_scrath/gpt_tokenizer.json")

# test
output = tokenizer.encode("I am learning GPT tokenizer")
print(output.tokens)   # token/subword
print(output.ids)      # ID token
print(tokenizer.decode(output.ids))  # decode text


['ĠI', 'Ġam', 'Ġlearning', 'ĠGP', 'T', 'Ġtoken', 'izer']
[259, 611, 2292, 11887, 55, 17992, 6240]
 I am learning GPT tokenizer


In [6]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file="/content/drive/MyDrive/Colab Notebooks/Train_gpt2_from_scrath/gpt_tokenizer.json")
tokenizer.add_special_tokens({
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>",
})

tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/Train_gpt2_from_scrath/gpt-tokenizer")

('/content/drive/MyDrive/Colab Notebooks/Train_gpt2_from_scrath/gpt-tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/Train_gpt2_from_scrath/gpt-tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/Train_gpt2_from_scrath/gpt-tokenizer/tokenizer.json')

In [7]:
len(tokenizer)

20000

In [8]:
tokenizer.pad_token_id, tokenizer.eos_token_id, tokenizer.bos_token_id

(1, 2, 0)

In [9]:
def tokenize(example):
    return tokenizer(example["text"])

tokenized_ds = ds.map(
    tokenize, remove_columns=["text"], batched=True, num_proc=20
)

Map (num_proc=20):   0%|          | 0/90000 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [10]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 90000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [11]:
block_size = 256

def group_texts(examples):
    # concat input_ids
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated["input_ids"])
    total_length = (total_length // block_size) * block_size

    # split block_size
    result = {
        k: [concatenated[k][i : i + block_size] for i in range(0, total_length, block_size)]
        for k in concatenated
    }

    # prepare labels
    result["labels"] = result["input_ids"].copy()
    return result

lm_ds = tokenized_ds.map(group_texts, batched=True, num_proc=20)


Map (num_proc=20):   0%|          | 0/90000 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [12]:
lm_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 170893
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 19038
    })
})

In [13]:
print(lm_ds["train"]["input_ids"][:5])

import torch

torch.tensor(lm_ds["train"]["input_ids"][:5])

[[51, 3701, 12, 87, 344, 5014, 472, 411, 13, 294, 10307, 253, 287, 1348, 16643, 411, 434, 2669, 13352, 8058, 2915, 1996, 234, 638, 1301, 675, 219, 3036, 10232, 3203, 269, 1131, 4881, 2492, 18, 1608, 16, 304, 6283, 327, 19, 47, 76, 10096, 12866, 214, 339, 832, 289, 241, 3445, 17, 2173, 415, 844, 4860, 16, 269, 1713, 16, 294, 2489, 234, 3735, 74, 531, 5256, 231, 18, 2249, 210, 16, 302, 914, 291, 676, 209, 4529, 241, 483, 17, 390, 747, 17, 14791, 16, 1579, 210, 6250, 3972, 17, 7638, 16327, 53, 4545, 968, 12566, 16, 4831, 4979, 1594, 16, 9906, 7666, 1972, 286, 433, 6369, 3226, 16, 238, 304, 3178, 1541, 274, 1835, 17, 3997, 4579, 5680, 5105, 16, 299, 267, 3862, 1501, 234, 8457, 214, 327, 19, 47, 76, 3290, 4637, 18, 441, 933, 13496, 356, 16, 483, 18735, 338, 245, 267, 5600, 234, 209, 9560, 2793, 19405, 338, 245, 16, 972, 209, 1321, 483, 305, 1179, 680, 209, 1250, 5014, 5007, 18, 300, 747, 253, 287, 1348, 173, 489, 1252, 933, 13496, 356, 311, 12385, 1080, 747, 19405, 338, 3441, 241, 12324, 23

tensor([[   51,  3701,    12,  ...,   373,   757,    16],
        [  302,  7877,   291,  ...,  6518,   274,   483],
        [ 1556,   370,  1026,  ...,   345,  3235,  4922],
        [17323,  2336,   214,  ...,   625,   361,  1481],
        [  209,  2452,   241,  ...,  4097,   241,   532]])

In [14]:
lm_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 170893
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 19038
    })
})

## **Model**

In [15]:
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=block_size,
    n_ctx=block_size,
    n_embd=256,
    n_layer=6,
    n_head=4,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

model = GPT2LMHeadModel(config)
model.resize_token_embeddings(tokenizer.vocab_size)

Embedding(20000, 256)

In [16]:
# Use wandb
import wandb
wandb.init(
    project="gpt2-pretraining",
    name="c4-en-small"
)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33manhnguyentien8365[0m ([33manhnguyentien8365-no[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [17]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/Train_gpt2_from_scrath/gpt-small-c4",
    logging_dir="/content/drive/MyDrive/Colab Notebooks/Train_gpt2_from_scrath/logs",
    per_device_train_batch_size=48,
    per_device_eval_batch_size=48,
    num_train_epochs=10,
    eval_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=1000,
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_ds["train"],
    eval_dataset=lm_ds["test"],
    processing_class=tokenizer,
    data_collator=data_collator
)

## **Training**

In [18]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
1000,7.2707,6.605732
2000,6.4287,6.248656
3000,6.166,6.034111
4000,5.9897,5.881382
5000,5.8608,5.765547
6000,5.7586,5.661932
7000,5.6721,5.577486
8000,5.5888,5.505172
9000,5.5279,5.441109
10000,5.468,5.384087


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=35610, training_loss=5.327908358618102, metrics={'train_runtime': 9195.9715, 'train_samples_per_second': 185.835, 'train_steps_per_second': 3.872, 'total_flos': 1.243966819270656e+16, 'train_loss': 5.327908358618102, 'epoch': 10.0})

In [19]:
!pip install huggingface_hub
!huggingface-cli login


model.push_to_hub("NTA1802/Trained-GPT2-from-scratch")
tokenizer.push_to_hub("NTA1802/Trained-GPT2-from-scratch")


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
The token `NTA1802/Trained-GPT2-from-scratch` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You m

model.safetensors:   0%|          | 0.00/39.7M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NTA1802/Trained-GPT2-from-scratch/commit/44d35173a477419af041f5d235e81426a4d0a8ab', commit_message='Upload tokenizer', commit_description='', oid='44d35173a477419af041f5d235e81426a4d0a8ab', pr_url=None, repo_url=RepoUrl('https://huggingface.co/NTA1802/Trained-GPT2-from-scratch', endpoint='https://huggingface.co', repo_type='model', repo_id='NTA1802/Trained-GPT2-from-scratch'), pr_revision=None, pr_num=None)

## **Inference**

In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "NTA1802/Trained-GPT2-from-scratch"

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/756 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/39.7M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/692 [00:00<?, ?B/s]

In [23]:
prompt = "Today is a very good day"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

output = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

In [24]:
print(tokenizer.decode(output[0], skip_special_tokens=True))

Today is a very good day, we were able to see a good deal to be able to start on the market, we decided to do that because the business is very small. As my parents and families, I would make these things more affordable when we could help them to avoid


In [25]:
import math

# Shift for labels (causal LM setting: predict token t+1 from token t)
labels = output[:, 1:].clone()
inputs = output[:, :-1].clone()

with torch.no_grad():
    outputs = model(inputs)
    logits = outputs.logits

# Compute log softmax over vocabulary
log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

# Gather log-probabilities corresponding to the labels
selected_log_probs = log_probs.gather(2, labels.unsqueeze(-1)).squeeze(-1)

# Sum negative log probs → total NLL
nll = -selected_log_probs.sum().item()
num_tokens = labels.numel()
perplexity = math.exp(nll / num_tokens)
perplexity


28.295598012486877

In [26]:
import torch

if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("CUDA cache cleared.")
else:
    print("CUDA is not available.")

CUDA cache cleared.
