In [None]:
!pip install transformers datasets accelerate



In [None]:
!pip install --upgrade transformers




In [None]:
from datasets import load_dataset

# Read every record of the local JSONL file (already tokenized: it carries
# input_ids / attention_mask / labels) as a single "train" split.
dataset = load_dataset("json", data_files="train_data.jsonl", split="train")

# Hold out 2% of the rows for validation; the fixed seed keeps the split
# identical across runs.
split = dataset.train_test_split(test_size=0.02, seed=42)
train_ds = split["train"]
val_ds = split["test"]

# Expose only the model inputs, returned as torch tensors, on both splits.
cols = ["input_ids", "attention_mask", "labels"]
for part in (train_ds, val_ds):
    part.set_format(type="torch", columns=cols)


Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
import random

# Sanity check: print the first 20 values of each field for five randomly
# chosen rows of the raw (un-split) dataset.
for idx in random.sample(range(len(dataset)), 5):
    example = dataset[idx]
    print("Sample index:", idx)
    for field in ("input_ids", "attention_mask", "labels"):
        # The trailing "..." marks that the sequence is truncated for display.
        print(f"{field}:", example[field][:20], "...")
    print("-" * 50)


Sample index: 6018
input_ids: [50258, 220, 50261, 685, 90, 6, 12683, 1300, 10354, 705, 7146, 485, 7, 64, 25, 493, 11, 275, 25, 493] ...
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] ...
labels: [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100] ...
--------------------------------------------------
Sample index: 3576
input_ids: [50258, 220, 50261, 685, 90, 6, 12683, 1300, 10354, 705, 40296, 7, 77, 25, 493, 8, 3256, 705, 11213, 10354] ...
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] ...
labels: [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100] ...
--------------------------------------------------
Sample index: 8011
input_ids: [50258, 220, 50261, 685, 90, 6, 12683, 1300, 10354, 705, 7146, 485, 7, 64, 25, 493, 11, 275, 25, 493] ...
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
from transformers import GPT2Tokenizer

# Start from the stock GPT-2 vocabulary.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Extra tokens, listed in the order they are registered. The first four fill
# the standard PAD/BOS/EOS/UNK roles; the rest are structural markers.
# Keep this order stable: the printed IDs below (50257 onward) follow it.
special_tokens = [
    "<pad>", "<sos>", "<eos>", "<unk>",
    "<tools>", "</tools>", "<user>", "</user>", "<func>", "</func>",
]
tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

# Point the tokenizer's standard roles at the newly added tokens.
tokenizer.pad_token, tokenizer.bos_token = "<pad>", "<sos>"
tokenizer.eos_token, tokenizer.unk_token = "<eos>", "<unk>"

# Report the ID and text assigned to each standard role.
for label, role in (("PAD", "pad"), ("BOS", "bos"), ("EOS", "eos"), ("UNK", "unk")):
    print(
        f"{label} ID:",
        getattr(tokenizer, f"{role}_token_id"),
        getattr(tokenizer, f"{role}_token"),
    )

# Report the IDs of the remaining (structural) marker tokens.
for tok in special_tokens[4:]:
    print(f"{tok} -> {tokenizer.convert_tokens_to_ids(tok)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

PAD ID: 50257 <pad>
BOS ID: 50258 <sos>
EOS ID: 50259 <eos>
UNK ID: 50260 <unk>
<tools> -> 50261
</tools> -> 50262
<user> -> 50263
</user> -> 50264
<func> -> 50265
</func> -> 50266


In [None]:
import math
import torch

from transformers import (
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

# Load the pretrained GPT-2 language model.
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Take the padding ID from the tokenizer instead of hard-coding 50257, so the
# model config stays correct if the special-token list is ever changed.
model.config.pad_token_id = tokenizer.pad_token_id

# Grow the embedding matrix to cover the added special tokens. Left as the
# last expression so the cell displays the resized embedding layer.
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50267, 768)

In [None]:
# Hyperparameters for the GPT-2 fine-tuning run.
args = TrainingArguments(
    output_dir="./gpt2-finetune",    # checkpoints are written here
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,   # no accumulation: one optimizer step per batch
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    logging_steps=200,               # training loss is logged half as often as eval runs
    eval_strategy="steps",
    eval_steps=100,
    save_steps=1000,
    save_total_limit=2,              # keep only the two most recent checkpoints
    load_best_model_at_end=False,
    per_device_eval_batch_size=4,
    fp16=True,                       # mixed precision
)

# NOTE(review): default_data_collator does not pad, so this presumably relies
# on every example already having the same length -- confirm against the data.
# NOTE(review): the recorded validation loss is ~1e-6 and the sampled labels
# start with long runs of -100; worth verifying that the label masking leaves
# enough supervised tokens.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=default_data_collator,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its
    # replacement and keeps the tokenizer saved alongside checkpoints.
    processing_class=tokenizer,
)

# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss
100,No log,1e-06
200,0.000000,0.0
300,0.000000,0.002152
400,0.018300,0.002594
500,0.018300,0.000547
600,0.004200,0.000112
700,0.004200,3.6e-05
800,0.004600,1.1e-05
900,0.004600,9e-06
1000,0.002800,7e-06


TrainOutput(global_step=4900, training_loss=0.0014894893856799915, metrics={'train_runtime': 2860.7706, 'train_samples_per_second': 6.851, 'train_steps_per_second': 1.713, 'total_flos': 1.02426476544e+16, 'train_loss': 0.0014894893856799915, 'epoch': 2.0})

In [None]:
from zipfile import ZipFile
import shutil

# Directory holding the exported model; also the base name of the archive.
export_dir = "./gpt2-finetuned"

# Write the final weights and tokenizer files into the export directory.
# No other visible cell creates it (the Trainer's output_dir is
# "./gpt2-finetune"), so without this step a fresh run has nothing to zip.
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

# Zip the folder
shutil.make_archive("gpt2-finetuned", 'zip', export_dir)

# Then use files.download to download (Colab-only helper)
from google.colab import files
files.download("gpt2-finetuned.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>