In [1]:
import random, time, math, json
from datasets import load_dataset, Dataset, DatasetDict
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

import torch
from transformers import (
    GPT2Config, GPT2LMHeadModel, PreTrainedTokenizerFast,
    Trainer, TrainingArguments, DataCollatorForLanguageModeling
)

In [2]:
# --- Experiment configuration -------------------------------------------
# Data
TOPIC_LANG = "Python"   # language slice requested from the dataset
MAX_DOCS = 1000         # maximum number of documents to load (lower it for a quick smoke test)
TRAIN_FRAC = 0.9        # fraction of documents placed in the training split

# Tokenisation
SEQ_LEN = 256           # tokens per training sequence / model context length
WP_VOCAB = 16000        # target WordPiece vocabulary size

# Optimisation
BATCH_SIZE = 16         # per-device batch size for training and evaluation
EPOCHS = 1              # raise to 2–3 when time/GPU allows
LR = 3e-4               # learning rate

# Model shape
N_LAYERS = 4
N_HEADS = 4
N_EMBD = 256

# Reproducibility
SEED = 42

In [4]:
# Interactive Hugging Face Hub login; renders a credential widget in the notebook.
# NOTE(review): presumably needed because the dataset loaded further down is
# access-gated — confirm. The token is typed into the widget, never hardcoded here.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import load_dataset

def load_stack_sample(lang: str, n_docs: int):
    """
    Return up to ``n_docs`` non-empty source texts for ``lang`` from The Stack v2.

    The language-specific configuration is tried first. If loading it raises,
    the first 50,000 rows of the default configuration are loaded instead and
    filtered on their ``lang``/``language`` field. That comparison is
    case-insensitive on both sides, so ``"Python"`` and ``"python"`` both match.
    No streaming is used.

    Args:
        lang: Language / configuration name, e.g. ``"Python"``.
        n_docs: Maximum number of documents to return.

    Returns:
        A list of at most ``n_docs`` non-empty strings, taken from each row's
        ``content`` field or, failing that, its ``text`` field.

    Raises:
        RuntimeError: If no row yields a non-empty text.

    NOTE(review): the dataset row captured in this notebook's saved output has
    only metadata fields (blob_id, path, language, ...) and no ``content`` or
    ``text`` field, which would explain a "No docs found" failure. Confirm
    against the dataset card whether file contents must be fetched separately.
    """
    # Try the language-specific configuration if available.
    try:
        ds = load_dataset("bigcode/the-stack-v2", lang, split="train[:{}]".format(n_docs))
    except Exception as e:
        # Deliberate best-effort: any failure (unknown config, network, size)
        # falls through to a filtered slice of the default configuration.
        print("Language subset not found or dataset too large, falling back to filtered version.", e)
        ds = load_dataset("bigcode/the-stack-v2", split="train[:50000]")  # fallback: small slice
        # Lower-case the requested language as well; comparing a lowered field
        # with the raw argument would never match a name such as "Python".
        wanted = lang.lower()
        ds = ds.filter(lambda r: (r.get("lang") or r.get("language") or "").lower() == wanted)
        ds = ds.select(range(min(n_docs, len(ds))))

    texts = []
    for row in ds:
        text = row.get("content") or row.get("text") or ""
        if text:
            texts.append(text)
        if len(texts) >= n_docs:
            break
    if not texts:
        # Report which fields the rows actually have to make the failure diagnosable.
        fields = getattr(ds, "column_names", "unknown")
        raise RuntimeError(
            "No docs found. Check language name or dataset fields."
            " (available fields: {})".format(fields)
        )
    return texts

# Load the corpus once, then show its size and the first 200 characters of the first document.
docs = load_stack_sample(lang=TOPIC_LANG, n_docs=MAX_DOCS)
(len(docs), docs[0][:200])


Resolving data files:   0%|          | 0/917 [00:00<?, ?it/s]

'HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.' thrown while requesting GET https://huggingface.co/datasets/bigcode/the-stack-v2/resolve/7408bfbcfd48e5833d62fd3dba48afd20d109473/data/Python/train-00007-of-00009.parquet
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 12ffa550-8b9e-450b-912a-22c6c2c0e652)')' thrown while requesting GET https://huggingface.co/datasets/bigcode/the-stack-v2/resolve/7408bfbcfd48e5833d62fd3dba48afd20d109473/data/Python/train-00007-of-00009.parquet
Retrying in 2s [Retry 2/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /datasets/bigcode/the-stack-v2/resolve/7408bfbcfd48e5833d62fd3dba48afd20d109473/data/Python/train-00007-of-00009.parquet (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x720f675f5340>: Failed to resolve \'huggingface.co\

RuntimeError: No docs found. Check language name or dataset fields.

In [None]:
def train_wordpiece(texts, vocab_size=16000):
    """
    Train a WordPiece tokenizer on ``texts`` and wrap it for use with transformers.

    Args:
        texts: Iterable of strings to train on.
        vocab_size: Target vocabulary size handed to the WordPiece trainer.

    Returns:
        A ``PreTrainedTokenizerFast`` wrapping the trained tokenizer, with the
        pad/unk/cls/sep/mask special tokens registered on the wrapper.
    """
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

    tok = Tokenizer(WordPiece(unk_token="[UNK]"))
    tok.pre_tokenizer = Whitespace()
    trainer = WordPieceTrainer(
        vocab_size=vocab_size, min_frequency=2,
        special_tokens=special_tokens,
    )
    tok.train_from_iterator(texts, trainer=trainer)
    # Single sequences pass through unchanged; pairs are joined with [SEP].
    tok.post_processor = TemplateProcessing(
        single="$A",
        pair="$A [SEP] $B:1",
        special_tokens=[("[SEP]", tok.token_to_id("[SEP]"))],
    )
    # Register every trained special token on the wrapper, not only [PAD].
    # Otherwise unk_token / cls_token_id / sep_token_id / mask_token are None,
    # and callers that look them up silently get None.
    fast = PreTrainedTokenizerFast(
        tokenizer_object=tok,
        pad_token="[PAD]",
        unk_token="[UNK]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )
    return fast

# Train the tokenizer on the loaded corpus, then show the vocabulary size and a sample tokenisation.
wp_tok = train_wordpiece(docs, vocab_size=WP_VOCAB)
(len(wp_tok), wp_tok.tokenize("def hello_world(): pass")[:10])


In [None]:
# Ordered (unshuffled) split: the first TRAIN_FRAC of the documents train, the rest validate.
split = int(TRAIN_FRAC * len(docs))
train_texts = docs[:split]
val_texts = docs[split:]

# Wrap each split in a single-column ("text") Dataset.
raw = DatasetDict({
    split_name: Dataset.from_dict({"text": split_texts})
    for split_name, split_texts in (("train", train_texts), ("validation", val_texts))
})

def chunkify(examples, tokenizer, max_length=SEQ_LEN):
    """
    Tokenise a batch of texts and cut the concatenated ids into fixed-size chunks.

    Args:
        examples: Batch mapping with a ``"text"`` entry holding a list of strings.
        tokenizer: Callable tokenizer whose result exposes ``.input_ids``.
        max_length: Number of token ids per chunk.

    Returns:
        ``{"input_ids": chunks}`` where every chunk has exactly ``max_length``
        ids. A trailing remainder shorter than ``max_length`` is dropped.
    """
    ids = tokenizer(examples["text"], add_special_tokens=False).input_ids
    # Concatenate all documents of the batch into one token stream.
    flat = [i for doc in ids for i in doc]
    # The "+ 1" keeps the final chunk when len(flat) is an exact multiple of
    # max_length; without it that last full chunk was silently discarded.
    chunks = [flat[i:i+max_length] for i in range(0, len(flat) - max_length + 1, max_length)]
    return {"input_ids": chunks}

def _tokenize_and_chunk(batch):
    """Apply ``chunkify`` to one batch using the trained WordPiece tokenizer."""
    return chunkify(batch, wp_tok)


# Both splits go through the same batched tokenise-and-chunk step.
ds_train = (
    raw["train"]
    .map(_tokenize_and_chunk, batched=True, remove_columns=["text"])
    .flatten_indices()
)
ds_val = (
    raw["validation"]
    .map(_tokenize_and_chunk, batched=True, remove_columns=["text"])
    .flatten_indices()
)

(len(ds_train), len(ds_val), ds_train[0]["input_ids"][:10])


{'blob_id': 'd44bbb217114c0831167824d694d57c29ab86665', 'directory_id': 'e3f3f911019ac126d01c056eafc7c3183107a5af', 'path': '/Traffic Sign Detection/all_signs_combined/src/predict.py', 'content_id': '19ed9a428015b625610be9930dfee35938fb451b', 'detected_licenses': [], 'license_type': 'no_license', 'repo_name': 'uncctrafficsigndetection/Traffic-Sign-Detection', 'snapshot_id': '595258766f865c4b3c628b002d7b93a774168a9b', 'revision_id': '3ff4be52357f4b6340fef94124f8c835ab66fd8a', 'branch_name': 'refs/heads/master', 'visit_date': Timestamp('2020-04-09 20:28:33.910961'), 'revision_date': Timestamp('2018-12-05 21:29:50'), 'committer_date': Timestamp('2018-12-05 21:29:50'), 'github_id': 160574509, 'star_events_count': 0, 'fork_events_count': 0, 'gha_license_id': None, 'gha_event_created_at': None, 'gha_created_at': None, 'gha_language': None, 'src_encoding': 'UTF-8', 'language': 'Python', 'is_vendor': False, 'is_generated': False, 'length_bytes': 959, 'extension': 'py'}


In [None]:
from pathlib import Path

# Directory that receives the training checkpoints.
OUTPUT_DIR = Path("me7_wp_decoder_ckpt")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Seed the RNGs *before* the model is constructed so that the randomly
# initialised weights are reproducible from run to run.
random.seed(SEED)
torch.manual_seed(SEED)

# Prefer [CLS]/[SEP] as BOS/EOS and fall back to [PAD] only when they are not
# registered. Use explicit None checks: `a or b` would treat a legitimate
# token id of 0 as "missing".
bos_id = wp_tok.cls_token_id if wp_tok.cls_token_id is not None else wp_tok.pad_token_id
eos_id = wp_tok.sep_token_id if wp_tok.sep_token_id is not None else wp_tok.pad_token_id

# Small GPT-2 style decoder sized by the configuration constants.
cfg = GPT2Config(
    vocab_size=len(wp_tok),
    n_positions=SEQ_LEN, n_ctx=SEQ_LEN,
    n_embd=N_EMBD, n_layer=N_LAYERS, n_head=N_HEADS,
    bos_token_id=bos_id,
    eos_token_id=eos_id,
)
model = GPT2LMHeadModel(cfg)
# vocab_size already equals len(wp_tok), so this should leave the embeddings unchanged.
model.resize_token_embeddings(len(wp_tok))

# mlm=False: batches are collated for causal language modelling (no masking objective).
collator = DataCollatorForLanguageModeling(tokenizer=wp_tok, mlm=False)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` (and Trainer's `tokenizer` to `processing_class`) — confirm
# against the installed version before upgrading.
args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    evaluation_strategy="epoch",
    save_strategy="epoch",            # <— save a checkpoint each epoch
    save_total_limit=3,               # <— keep last 3 to save disk
    load_best_model_at_end=True,      # <— restores best on eval_loss at end
    metric_for_best_model="loss",
    greater_is_better=False,
    logging_steps=50,
    fp16=torch.cuda.is_available(),   # mixed precision only when a CUDA device is present
    seed=SEED,                        # <— make the configured seed actually drive training
    report_to="none",
)

trainer = Trainer(
    model=model, args=args,
    train_dataset=ds_train, eval_dataset=ds_val,
    data_collator=collator, tokenizer=wp_tok
)

# Time the training run with perf_counter: it is monotonic, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
t0 = time.perf_counter()
trainer.train()           # first run — from scratch
decoder_train_s = time.perf_counter() - t0

# Evaluate on the validation split and turn the mean loss into perplexity.
metrics = trainer.evaluate()
decoder_loss = metrics["eval_loss"]
decoder_ppl  = math.exp(decoder_loss)
decoder_train_s, decoder_ppl, metrics

In [None]:
# Persist the tokenizer and the trainer's current model side by side in OUTPUT_DIR/final.
final_dir = OUTPUT_DIR / "final"
final_dir.mkdir(exist_ok=True)
for artifact in (wp_tok, trainer.model):
    artifact.save_pretrained(str(final_dir))
print("Saved:", final_dir)