In [10]:
!pip install -q -U \
    "transformers>=4.38.0" \
    "datasets>=2.14.0" \
    "accelerate>=0.24.0" \
    "bitsandbytes>=0.41.0" \
    "peft>=0.7.0" \
    "protobuf==3.20.3" \
    sentencepiece \
    evaluate

In [11]:
from pathlib import Path
import os
from pprint import pprint
from typing import Optional

project_root = Path.cwd()
data_dir = project_root / "data"
output_dir = project_root / "outputs" / "tinyllama-custom"
data_dir.mkdir(parents=True, exist_ok=True)
output_dir.mkdir(parents=True, exist_ok=True)

# Keep bitsandbytes on a single GPU to avoid DataParallel cuBLAS errors
if "CUDA_VISIBLE_DEVICES" not in os.environ:
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    print("Pinned CUDA_VISIBLE_DEVICES=0 (override before this cell if you need multi-GPU).")
else:
    print(f"Respecting existing CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']}")

def detect_kaggle_train_dev() -> Optional[Path]:
    kaggle_input = Path("/kaggle/input")
    if not kaggle_input.exists():
        return None
    for dataset_dir in kaggle_input.iterdir():
        if not dataset_dir.is_dir():
            continue
        train_path = dataset_dir / "train.json"
        dev_path = dataset_dir / "dev.json"
        if train_path.exists() and dev_path.exists():
            return dataset_dir
        # allow nested folders (e.g., uploaded zip extractions)
        nested_train = list(dataset_dir.rglob("train.json"))
        for candidate in nested_train:
            sibling_dev = candidate.parent / "dev.json"
            if sibling_dev.exists():
                return candidate.parent
    return None

env_spider_root = os.getenv("SPIDER2_ROOT")
kaggle_spider_root = detect_kaggle_train_dev()
default_spider_root = env_spider_root or (str(kaggle_spider_root) if kaggle_spider_root else str((project_root / "Spider2").resolve()))

if kaggle_spider_root:
    print(f"Detected Kaggle train/dev files under: {kaggle_spider_root}")
elif env_spider_root:
    print(f"Using SPIDER2_ROOT from environment: {env_spider_root}")
else:
    print("Falling back to ./Spider2 (override with SPIDER2_ROOT if needed).")

config = {
    "base_model_name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "train_file": str(data_dir / "train.jsonl"),
    "eval_file": str(data_dir / "eval.jsonl"),
    "system_prompt": "You are TinyLlama, a compact and helpful assistant.",
    "max_seq_length": 512,
    "packing": False,  # set True if you want to pack multiple samples per sequence
    "num_train_epochs": 3,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "learning_rate": 2e-4,
    "warmup_ratio": 0.03,
    "weight_decay": 0.01,
    "logging_steps": 10,
    "save_steps": 200,
    "eval_steps": 200,
    "max_grad_norm": 1.0,
    "lr_scheduler_type": "cosine",
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "lora_target_modules": [
        "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"
    ],
    "bf16": False,
    "fp16": True,
    "push_to_hub": False,
    "hub_model_id": "",  # fill this if push_to_hub is True
    "hub_token": os.getenv("HUGGINGFACE_HUB_TOKEN", ""),
    "output_dir": str(output_dir),
    # Spider 2.0 controls
    "use_spider2": True,
    "spider2_root": default_spider_root,
    "spider2_train_json": "train.json",
    "spider2_dev_json": "dev.json",
    "spider2_train_out": str(data_dir / "spider2-train.jsonl"),
    "spider2_dev_out": str(data_dir / "spider2-dev.jsonl"),
    "spider2_sample_limit": None,  # set to an int for quick smoke tests
}

print("Configuration summary:")
pprint(config)

Respecting existing CUDA_VISIBLE_DEVICES=0
Detected Kaggle train/dev files under: /kaggle/input/nl2sql-from-spider2
Configuration summary:
{'base_model_name': 'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
 'bf16': False,
 'eval_file': '/kaggle/working/data/eval.jsonl',
 'eval_steps': 200,
 'fp16': True,
 'gradient_accumulation_steps': 8,
 'hub_model_id': '',
 'hub_token': '',
 'learning_rate': 0.0002,
 'logging_steps': 10,
 'lora_alpha': 32,
 'lora_dropout': 0.05,
 'lora_r': 16,
 'lora_target_modules': ['q_proj',
                         'k_proj',
                         'v_proj',
                         'o_proj',
                         'gate_proj',
                         'up_proj',
                         'down_proj'],
 'lr_scheduler_type': 'cosine',
 'max_grad_norm': 1.0,
 'max_seq_length': 512,
 'num_train_epochs': 3,
 'output_dir': '/kaggle/working/outputs/tinyllama-custom',
 'packing': False,
 'per_device_eval_batch_size': 1,
 'per_device_train_batch_size': 1,
 'push_to_hub': False

In [12]:
from typing import Any, Dict, Optional

def serialize_schema(db_schema: Dict[str, Any]) -> str:
    table_names = db_schema["table_names"]
    column_names = db_schema["column_names"]
    column_table_ids = db_schema["column_table_ids"] if "column_table_ids" in db_schema else db_schema["column_names_original_table_ids"]
    lines = []
    for table_index, table in enumerate(table_names):
        cols = []
        for (col_id, col), table_id in zip(enumerate(column_names), column_table_ids):
            if table_id == table_index:
                cols.append(col[1])
        pretty_cols = ", ".join(cols) if cols else "*"
        lines.append(f"{table}({pretty_cols})")
    return "\n".join(lines)

def make_instruction_sample(example: Dict[str, Any]) -> Dict[str, str]:
    schema_text = serialize_schema(example["schema"])
    instruction = (
        "### Instruction:\n"
        "Given the following database schema, write the SQL query for the question.\n\n"
        "### Schema:\n" + schema_text + "\n\n"
        "### Question:\n" + example["question"] + "\n\n"
        "### SQL:"
    )
    return {
        "instruction": instruction,
        "response": example["sql"],
        "system": "You convert natural-language questions over relational databases into SQL queries.",
    }

def convert_spider_split(source_path: Path, target_path: Path, sample_limit: Optional[int] = None) -> None:
    with source_path.open("r", encoding="utf-8") as f:
        data = json.load(f)
    if sample_limit:
        data = data[:sample_limit]
    converted = [make_instruction_sample(row) for row in data]
    with target_path.open("w", encoding="utf-8") as fw:
        for record in converted:
            fw.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(f"Saved {len(converted)} samples to {target_path}")

In [14]:
import json
if config.get("use_spider2", False):
    spider_root = Path(config["spider2_root"])
    assert spider_root.exists(), f"Spider2 root not found at {spider_root}. Update config['spider2_root']."
    train_src = spider_root / config["spider2_train_json"]
    dev_src = spider_root / config["spider2_dev_json"]
    missing = [p for p in (train_src, dev_src) if not p.exists()]
    if missing:
        missing_str = ", ".join(str(p) for p in missing)
        raise FileNotFoundError(
            f"Missing Spider2 JSON files: {missing_str}. If you're on Kaggle, make sure your dataset includes train.json and dev.json at the root or adjust config['spider2_train_json']/['spider2_dev_json']."
        )
    train_out = Path(config["spider2_train_out"])
    dev_out = Path(config["spider2_dev_out"])
    convert_spider_split(train_src, train_out, config["spider2_sample_limit"])
    convert_spider_split(dev_src, dev_out, config["spider2_sample_limit"])
    config["train_file"] = str(train_out)
    config["eval_file"] = str(dev_out)
    print("Config updated to use Spider 2.0 JSONL files:")
    print("  train_file ->", config["train_file"])
    print("  eval_file  ->", config["eval_file"])
else:
    print("Skipping Spider 2.0 conversion (set config['use_spider2']=True to enable).")

Saved 12248 samples to /kaggle/working/data/spider2-train.jsonl
Saved 1484 samples to /kaggle/working/data/spider2-dev.jsonl
Config updated to use Spider 2.0 JSONL files:
  train_file -> /kaggle/working/data/spider2-train.jsonl
  eval_file  -> /kaggle/working/data/spider2-dev.jsonl


In [15]:
import random
from typing import Dict, List

import torch
from datasets import Dataset, DatasetDict, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    logging
)
from peft import LoraConfig, get_peft_model

logging.set_verbosity_info()

def set_seed(seed: int = 42):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

set_seed(42)

def maybe_create_sample_data(train_path: str, eval_path: str) -> None:
    """Create a toy dataset so the notebook can run end-to-end if you don't have data yet."""
    if Path(train_path).exists() and Path(eval_path).exists():
        return
    sample_examples = [
        {
            "instruction": "Explain why the sky appears blue.",
            "response": "The atmosphere scatters sunlight so shorter blue wavelengths are seen from every direction.",
            "system": "You are a concise science explainer.",
        },
        {
            "instruction": "Give me three tips for learning Python.",
            "response": "1. Practice daily. 2. Read other people's code. 3. Build tiny projects and iterate.",
            "system": "Be upbeat and practical.",
        },
    ]
    Path(train_path).write_text("\n".join(json.dumps(e) for e in sample_examples), encoding="utf-8")
    Path(eval_path).write_text("\n".join(json.dumps(e) for e in sample_examples[:1]), encoding="utf-8")
    print(f"Sample dataset written to {train_path} / {eval_path}")

def format_chat(example: Dict, system_prompt: str) -> Dict:
    messages: List[Dict[str, str]] = []
    if example.get("system") or system_prompt:
        messages.append({
            "role": "system",
            "content": example.get("system") or system_prompt,
        })
    messages.append({"role": "user", "content": example["instruction"]})
    messages.append({"role": "assistant", "content": example["response"]})
    example["messages"] = messages
    return example

2025-11-24 16:31:33.452206: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764001893.882463      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764001894.048484      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [16]:
if not Path(config["train_file"]).exists():
    maybe_create_sample_data(config["train_file"], config["eval_file"])

data_files = {"train": config["train_file"]}
if Path(config["eval_file"]).exists():
    data_files["validation"] = config["eval_file"]
else:
    print("No eval file detected — validation metrics will be skipped.")

train_ext = Path(config["train_file"]).suffix.lower()
if train_ext in {".json", ".jsonl"}:
    dataset = load_dataset("json", data_files=data_files)
elif train_ext == ".csv":
    dataset = load_dataset("csv", data_files=data_files)
elif train_ext in {".parquet", ".pq"}:
    dataset = load_dataset("parquet", data_files=data_files)
else:
    raise ValueError(f"Unsupported file extension: {train_ext}")

dataset = dataset.map(lambda ex: format_chat(ex, config["system_prompt"]))
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/12248 [00:00<?, ? examples/s]

Map:   0%|          | 0/1484 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'response', 'system', 'messages'],
        num_rows: 12248
    })
    validation: Dataset({
        features: ['instruction', 'response', 'system', 'messages'],
        num_rows: 1484
    })
})

In [17]:
tokenizer = AutoTokenizer.from_pretrained(config["base_model_name"], use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

def add_text(example):
    example["text"] = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False,
    )
    return example

dataset = dataset.map(add_text)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

padding_strategy = "max_length" if config["packing"] else False

def tokenize_for_causal_lm(batch):
    tokenized = tokenizer(
        batch["text"],
        truncation=True,
        max_length=config["max_seq_length"],
        padding=padding_strategy,
    )
    return tokenized

tokenized_datasets = dataset.map(
    tokenize_for_causal_lm,
    batched=True,
    remove_columns=dataset["train"].column_names,
    desc="Tokenizing conversations",
)

print("Sample formatted conversation:")
print(dataset["train"][0]["text"][:200] + "...")
print("\nTokenized sample ids:")
print(tokenized_datasets["train"][0]["input_ids"][:20])

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--TinyLlama--TinyLlama-1.1B-Chat-v1.0/snapshots/fe8a4ea1ffedaf415f4da2f062534de366a451e6/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--TinyLlama--TinyLlama-1.1B-Chat-v1.0/snapshots/fe8a4ea1ffedaf415f4da2f062534de366a451e6/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--TinyLlama--TinyLlama-1.1B-Chat-v1.0/snapshots/fe8a4ea1ffedaf415f4da2f062534de366a451e6/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--TinyLlama--TinyLlama-1.1B-Chat-v1.0/snapshots/fe8a4ea1ffedaf415f4da2f062534de366a451e6/tokenizer_config.json
loading file chat_template.jinja from cache at None


Map:   0%|          | 0/12248 [00:00<?, ? examples/s]

Map:   0%|          | 0/1484 [00:00<?, ? examples/s]

Tokenizing conversations:   0%|          | 0/12248 [00:00<?, ? examples/s]

Tokenizing conversations:   0%|          | 0/1484 [00:00<?, ? examples/s]

Sample formatted conversation:
<|system|>
You convert natural-language questions over relational databases into SQL queries.</s>
<|user|>
### Instruction:
Given the following database schema, write the SQL query for the question.

...

Tokenized sample ids:
[1, 529, 29989, 5205, 29989, 29958, 13, 3492, 3588, 5613, 29899, 11675, 5155, 975, 1104, 1288, 21218, 964, 3758, 9365]


In [18]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if config["bf16"] else torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    config["base_model_name"],
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

lora_config = LoraConfig(
    r=config["lora_r"],
    lora_alpha=config["lora_alpha"],
    target_modules=config["lora_target_modules"],
    lora_dropout=config["lora_dropout"],
    bias="none",
    task_type="CAUSAL_LM",
)
peft_model = get_peft_model(model, lora_config)

has_validation_split = "validation" in tokenized_datasets
supports_eval_strategy = hasattr(TrainingArguments, "__dataclass_fields__") and "evaluation_strategy" in TrainingArguments.__dataclass_fields__
eval_strategy = "steps" if has_validation_split else "no"

training_args_kwargs = dict(
    output_dir=config["output_dir"],
    num_train_epochs=config["num_train_epochs"],
    per_device_train_batch_size=config["per_device_train_batch_size"],
    per_device_eval_batch_size=config["per_device_eval_batch_size"],
    gradient_accumulation_steps=config["gradient_accumulation_steps"],
    learning_rate=config["learning_rate"],
    warmup_ratio=config["warmup_ratio"],
    weight_decay=config["weight_decay"],
    logging_steps=config["logging_steps"],
    save_steps=config["save_steps"],
    eval_steps=config["eval_steps"],
    max_grad_norm=config["max_grad_norm"],
    lr_scheduler_type=config["lr_scheduler_type"],
    fp16=config["fp16"],
    bf16=config["bf16"],
    report_to=["tensorboard"],
)

if supports_eval_strategy:
    training_args_kwargs["evaluation_strategy"] = eval_strategy
else:
    training_args_kwargs["do_eval"] = has_validation_split

training_args = TrainingArguments(**training_args_kwargs)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"] if has_validation_split else None,
    data_collator=data_collator,
)

peft_model.print_trainable_parameters()

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--TinyLlama--TinyLlama-1.1B-Chat-v1.0/snapshots/fe8a4ea1ffedaf415f4da2f062534de366a451e6/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "bfloat16",
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 22,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.57.1",
  "use_cache": true,
  "vocab_size": 32000
}



model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--TinyLlama--TinyLlama-1.1B-Chat-v1.0/snapshots/fe8a4ea1ffedaf415f4da2f062534de366a451e6/model.safetensors
Instantiating LlamaForCausalLM model under default dtype torch.float16.
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

target_dtype {target_dtype} is replaced by `CustomDtype.INT4` for 4-bit BnB quantization


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--TinyLlama--TinyLlama-1.1B-Chat-v1.0/snapshots/fe8a4ea1ffedaf415f4da2f062534de366a451e6/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "max_length": 2048,
  "pad_token_id": 0
}

Could not locate the custom_generate/generate.py inside TinyLlama/TinyLlama-1.1B-Chat-v1.0.
PyTorch: setting up devices
Using auto half precision backend


trainable params: 12,615,680 || all params: 1,112,664,064 || trainable%: 1.1338


In [19]:
train_result = trainer.train()
trainer.save_state()
train_result

***** Running training *****
  Num examples = 12,248
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 8
  Total optimization steps = 4,593
  Number of trainable parameters = 12,615,680


Step,Training Loss
10,1.866
20,1.6961
30,1.5763
40,1.4255
50,1.2359
60,1.1594
70,1.1565
80,1.0726
90,0.8723
100,0.833


Saving model checkpoint to /kaggle/working/outputs/tinyllama-custom/checkpoint-200
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--TinyLlama--TinyLlama-1.1B-Chat-v1.0/snapshots/fe8a4ea1ffedaf415f4da2f062534de366a451e6/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "bfloat16",
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 22,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.57.1",
  "use_cache": true,
  "vocab_size": 32000
}

Saving Trainer.data_collator.toke

TrainOutput(global_step=4593, training_loss=0.05841092236344912, metrics={'train_runtime': 11568.6483, 'train_samples_per_second': 3.176, 'train_steps_per_second': 0.397, 'total_flos': 1.0784368789751808e+17, 'train_loss': 0.05841092236344912, 'epoch': 3.0})

In [20]:
trainer.save_model(config["output_dir"])
tokenizer.save_pretrained(config["output_dir"])

#if config["push_to_hub"] and config["hub_model_id"]:
#    trainer.push_to_hub()
#    tokenizer.push_to_hub(config["hub_model_id"])
#    print(f"Pushed to https://huggingface.co/{config['hub_model_id']}")

# Optional: merge the adapters into the base model for standalone deployment
# from peft import PeftModel
# base_model = AutoModelForCausalLM.from_pretrained(
#     config["base_model_name"],
#     device_map="auto",
#     trust_remote_code=True,
# )
# peft_model = PeftModel.from_pretrained(base_model, config["output_dir"])
# merged_model = peft_model.merge_and_unload()
# merged_dir = Path(config["output_dir"]) / "merged"
# merged_model.save_pretrained(merged_dir)
# tokenizer.save_pretrained(merged_dir)
# print(f"Merged model saved to {merged_dir}")

Saving model checkpoint to /kaggle/working/outputs/tinyllama-custom
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--TinyLlama--TinyLlama-1.1B-Chat-v1.0/snapshots/fe8a4ea1ffedaf415f4da2f062534de366a451e6/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "bfloat16",
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 22,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.57.1",
  "use_cache": true,
  "vocab_size": 32000
}

Saving Trainer.data_collator.tokenizer by defaul

('/kaggle/working/outputs/tinyllama-custom/tokenizer_config.json',
 '/kaggle/working/outputs/tinyllama-custom/special_tokens_map.json',
 '/kaggle/working/outputs/tinyllama-custom/chat_template.jinja',
 '/kaggle/working/outputs/tinyllama-custom/tokenizer.model',
 '/kaggle/working/outputs/tinyllama-custom/added_tokens.json',
 '/kaggle/working/outputs/tinyllama-custom/tokenizer.json')

In [41]:
try:
    from huggingface_hub import HfHubHTTPError  # newer versions expose it at the top level
except ImportError:  # fall back for older installs
    from huggingface_hub.utils import HfHubHTTPError

push_to_hub = config.get("push_to_hub", False)
hub_model_id = config.get("hub_model_id") or ""
hub_token = config.get("hub_token") or os.getenv("HUGGINGFACE_HUB_TOKEN")

if push_to_hub and hub_model_id:
    if not hub_token:
        raise ValueError(
            "Hugging Face token missing—set config['hub_token'] (not recommended for sharing) "
            "or export HUGGINGFACE_HUB_TOKEN before running this cell."
        )

    try:
        trainer.push_to_hub(token=hub_token)
        tokenizer.push_to_hub(hub_model_id, token=hub_token)
        print(f"✅ Pushed to https://huggingface.co/{hub_model_id}")
    except HfHubHTTPError as err:
        print("❌ Push to Hub failed. Double-check:")
        print("  • The token has 'read' + 'write' permissions and hasn't expired.")
        print(
            "  • You are authenticated as the owner/collaborator of"
            f" '{hub_model_id}' (repo naming: username-or-org/model_name)."
        )
        print("  • The repo already exists or you have permission to create it in that namespace.")
        print("  • There are no stray spaces/newlines in the token.")
        print("Original error ->", err)
        raise
else:
    print("Push to Hub skipped (set config['push_to_hub']=True and supply hub_model_id/token).")


Saving model checkpoint to /kaggle/working/outputs/tinyllama-custom
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--TinyLlama--TinyLlama-1.1B-Chat-v1.0/snapshots/fe8a4ea1ffedaf415f4da2f062534de366a451e6/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "bfloat16",
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 22,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.57.1",
  "use_cache": true,
  "vocab_size": 32000
}

Saving Trainer.data_collator.tokenizer by defaul

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

chat template saved in /tmp/tmps0t25_x9/chat_template.jinja
tokenizer config file saved in /tmp/tmps0t25_x9/tokenizer_config.json
Special tokens file saved in /tmp/tmps0t25_x9/special_tokens_map.json
Uploading the following files to aliabohendy22/tinyllama_NL2SQL: special_tokens_map.json,tokenizer.json,tokenizer_config.json,chat_template.jinja,tokenizer.model,README.md


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

✅ Pushed to https://huggingface.co/aliabohendy22/tinyllama_NL2SQL
