In [1]:
from datasets import load_dataset

raw_datasets = load_dataset("wikimedia/wikipedia", "20231101.en", split="train[:10%]")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
raw_datasets.dataset_size

20200062385

In [3]:
import psutil

print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")


RAM used: 195.11 MB


In [4]:
print(f"Number of files in dataset : {raw_datasets.dataset_size}")
size_gb = raw_datasets.dataset_size / (1024**3)
print(f"Dataset size (cache file) : {size_gb:.2f} GB")

Number of files in dataset : 20200062385
Dataset size (cache file) : 18.81 GB


In [5]:
raw_datasets[:2]["text"]

['Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism. Anarchism advocates for the replacement of the state with stateless societies and voluntary free associations. As a historically left-wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as the libertarian wing of the socialist movement (libertarian socialism).\n\nHumans have lived in societies without formal hierarchies long before the establishment of states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist ideas are found all throughout history, modern anarchism emerged from the Enlightenment. During the latter half of the 19th and the first decades of the 20th century, the anarchist moveme

In [6]:
from transformers import AutoTokenizer

context_length = 128
tokenizer = AutoTokenizer.from_pretrained("sachin6624/sachin-gpt2-tokenizer")

outputs = tokenizer(
    raw_datasets[:2]["text"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 109
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 118, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 77]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [7]:
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets.column_names
)
tokenized_datasets

Dataset({
    features: ['input_ids'],
    num_rows: 3983934
})

In [8]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [9]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 125.8M parameters


In [10]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [11]:
out = data_collator([tokenized_datasets[i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


In [12]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="GPT-2",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="no",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
)

trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

In [13]:
trainer.train()


Step,Training Loss


KeyboardInterrupt: 

In [1]:
from colabcode import ColabCode
ColabCode(port=10000)


--2025-02-06 15:09:58--  https://code-server.dev/install.sh
Resolving code-server.dev (code-server.dev)... 2606:4700:3031::ac43:d6e1, 2606:4700:3035::6815:10ab, 104.21.16.171, ...
Connecting to code-server.dev (code-server.dev)|2606:4700:3031::ac43:d6e1|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/cdr/code-server/main/install.sh [following]
--2025-02-06 15:09:59--  https://raw.githubusercontent.com/cdr/code-server/main/install.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8003::154, 2606:50c0:8002::154, 2606:50c0:8001::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15872 (16K) [text/plain]
Saving to: ‘install.sh’

     0K .......... .....                                      100% 65.8M=0s

2025-02-06 15:09:59 (65.8 MB/s) - ‘install.sh’ saved [15872/15872]

==> Aut

[2025-02-06T09:40:25.025Z] info  Wrote default config file to /Users/sachinmurali/.config/code-server/config.yaml
Installing extensions...
Installing extension 'ms-python.python'...


[2025-02-06T09:40:36.375Z] error parent:73674 Uncaught exception: Signature verification was not executed.
[2025-02-06T09:40:36.378Z] error parent:73674 SignatureVerificationInternal: Signature verification was not executed.
    at Hc.ub (file:///opt/homebrew/Cellar/code-server/4.96.4/libexec/lib/vscode/out/server-main.js:58:17105)
    at async Hc.tb (file:///opt/homebrew/Cellar/code-server/4.96.4/libexec/lib/vscode/out/server-main.js:58:15571)
Error while installing extension ms-python.python: Signature verification was not executed.
Error while installing extension ms-python.debugpy: Signature verification was not executed.
Failed Installing Extensions: ms-python.python, ms-python.debugpy
[2025-02-06T09:40:39.703Z] error parent:73674 Uncaught exception: Signature verification was not executed.
[2025-02-06T09:40:39.703Z] error parent:73674 SignatureVerificationInternal: Signature verification was not executed.
    at Hc.ub (file:///opt/homebrew/Cellar/code-server/4.96.4/libexec/lib/vs

Installing extensions...
Installing extension 'ms-toolsai.jupyter'...


[2025-02-06T09:40:50.569Z] error parent:73851 Uncaught exception: Signature verification was not executed.
[2025-02-06T09:40:50.572Z] error parent:73851 SignatureVerificationInternal: Signature verification was not executed.
    at Hc.ub (file:///opt/homebrew/Cellar/code-server/4.96.4/libexec/lib/vscode/out/server-main.js:58:17105)
    at async Hc.tb (file:///opt/homebrew/Cellar/code-server/4.96.4/libexec/lib/vscode/out/server-main.js:58:15571)
[2025-02-06T09:40:51.658Z] error parent:73851 Uncaught exception: Signature verification was not executed.
[2025-02-06T09:40:51.658Z] error parent:73851 SignatureVerificationInternal: Signature verification was not executed.
    at Hc.ub (file:///opt/homebrew/Cellar/code-server/4.96.4/libexec/lib/vscode/out/server-main.js:58:17105)
    at async Hc.tb (file:///opt/homebrew/Cellar/code-server/4.96.4/libexec/lib/vscode/out/server-main.js:58:15571)
[2025-02-06T09:40:51.666Z] error parent:73851 Uncaught exception: Signature verification was not execu

Downloading ngrok ...

Error while installing extension ms-toolsai.jupyter-keymap: Signature verification was not executed.
Error while installing extension ms-toolsai.vscode-jupyter-slideshow: Signature verification was not executed.
Error while installing extension ms-toolsai.vscode-jupyter-cell-tags: Signature verification was not executed.
Error while installing extension ms-toolsai.jupyter-renderers: Signature verification was not executed.
Error while installing extension ms-toolsai.jupyter: Signature verification was not executed.
Failed Installing Extensions: ms-toolsai.jupyter-keymap, ms-toolsai.vscode-jupyter-slideshow, ms-toolsai.vscode-jupyter-cell-tags, ms-toolsai.jupyter-renderers, ms-toolsai.jupyter
[2025-02-06T09:40:57.104Z] error parent:73851 Uncaught exception: Signature verification was not executed.
[2025-02-06T09:40:57.104Z] error parent:73851 SignatureVerificationInternal: Signature verification was not executed.
    at Hc.ub (file:///opt/homebrew/Cellar/code-server/4.96.4/libexec/lib/v

                                                                                                    

t=2025-02-06T15:11:03+0530 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:  authentication failed: Usage of ngrok requires a verified account and authtoken.
ERROR:  
ERROR:  Sign up for an account: https://dashboard.ngrok.com/signup
ERROR:  Install your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken
ERROR:  
ERROR:  ERR_NGROK_4018
ERROR:  https://ngrok.com/docs/errors/err_ngrok_4018
ERROR:  


PyngrokNgrokError: The ngrok process errored on start: authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n.