In [None]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune.
    """

    model_name: str = field(
        metadata={
            "help": (
                "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
            )
        },
    )
    model_revision: str = field(
        default="main",
        metadata={
            "help": "The specific model version to use (can be a branch name, tag name or commit id)."
        },
    )
    model_code_revision: str = field(
        default=None, metadata={"help": "The branch of the IFT model"}
    )
    torch_dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
                "dtype will be automatically derived from the model's weights."
            ),
            "choices": ["auto", "bfloat16", "float16", "float32"],
        },
    )
    tokenizer_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The path to the tokenizer. Useful if you want to use a different tokenizer to the one stored in `model_name_or_path`."
            )
        },
    )
    trust_remote_code: bool = field(
        default=False, metadata={"help": "Trust remote code when loading a model."}
    )
    use_flash_attention_2: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to use flash attention 2. You must install this manually by running `pip install flash-attn --no-build-isolation`"
            )
        },
    )
    use_peft: bool = field(
        default=False,
        metadata={"help": ("Whether to use PEFT or not for training.")},
    )
    lora_r: Optional[int] = field(
        default=16,
        metadata={"help": ("LoRA R value.")},
    )
    lora_alpha: Optional[int] = field(
        default=32,
        metadata={"help": ("LoRA alpha.")},
    )
    lora_dropout: Optional[float] = field(
        default=0.0,
        metadata={"help": ("LoRA dropout.")},
    )
    lora_target_modules: Optional[List[str]] = field(
        default=None,
        metadata={"help": ("LoRA target modules.")},
    )
    lora_modules_to_save: Optional[List[str]] = field(
        default=None,
        metadata={"help": ("Model layers to unfreeze & train")},
    )
    load_in_8bit: bool = field(default=False, metadata={"help": "use 8 bit precision"})
    load_in_4bit: bool = field(default=False, metadata={"help": "use 4 bit precision"})

    bnb_4bit_quant_type: Optional[str] = field(
        default="nf4", metadata={"help": "precise the quantization type (fp4 or nf4)"}
    )
    use_bnb_nested_quant: bool = field(
        default=False, metadata={"help": "use nested quantization"}
    )
    bnb_4bit_quant_storage: Optional[str] = field(
        default="uint8",
        metadata={"help": "storage type to pack the quanitzed 4-bit prarams."},
    )

    def __post_init__(self):
        if self.load_in_8bit and self.load_in_4bit:
            raise ValueError("You can't use 8 bit and 4 bit precision at the same time")


@dataclass
class DataArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    chat_template: Optional[str] = field(
        default=None, metadata={"help": "The chat template to use."}
    )

    text_column: Optional[str] = field(
        default="text",
        metadata={
            "help": "The column name to use for the text in the dataset (only used for continued pretraining)."
        },
    )

    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    truncation_side: Optional[str] = field(
        default=None, metadata={"help": "Truncation side to use for the tokenizer."}
    )
    auto_insert_empty_system_msg: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether to automatically insert an empty system message as the first message if `system` is mentioned in the chat template."
            )
        },
    )

    train_dataset_path: str = field(
        default=None,
        metadata={"help": ("The path to the training dataset.")},
    )
    test_dataset_path: str = field(
        default=None,
        metadata={"help": ("The path to the training dataset.")},
    )


@dataclass
class SFTConfig:
    """
    Arguments related to the training process itself. For all parameters, see: https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/trainer#transformers.TrainingArguments
    Also used for the continued pretraining task.
    """

    dataset_kwargs: Optional[Dict[str, Any]] = field(
        default=None, metadata={"help": "Dataset kwargs for the SFTTrainer"}
    )
    max_seq_length: Optional[int] = field(
        default=8196,
        metadata={
            "help": (
                "Used by TRL for reward model training, which tries to read this parameter in init."
            )
        },
    )
    logging_first_step: bool = field(
        default=True,
        metadata={
            "help": ("Whether to log and evaluate the first global_step or not.")
        },
    )
    optim: Optional[str] = field(default="adamw_torch")
    train_batch_size: Optional[int] = field(
        default=4,
        metadata={"help": ("The batch size for training.")},
    )
    epochs: Optional[int] = field(
        default=3, metadata={"help": ("The number of epochs to train for.")}
    )
    checkpoint_save_steps: Optional[int] = field(
        default=50, metadata={"help": ("The number of steps to save the model.")}
    )

    logging_steps: Optional[int] = field(
        default=10, metadata={"help": ("The number of steps to log the model.")}
    )

    weight_decay: Optional[float] = field(
        default=0.01, metadata={"help": ("The weight decay to use.")}
    )

    lr: Optional[float] = field(
        default=2e-5, metadata={"help": ("The learning rate to use.")}
    )

    output_data_dir: str = field(
        default=None,
        metadata={"help": ("The output data directory.")},
    )

    model_dir: str = field(
        default=None,
        metadata={"help": ("The model directory.")},
    )

    model_checkpoint_dir: Optional[str] = field(
        default="/opt/ml/checkpoints",
        metadata={"help": ("The model checkpoint directory.")},
    )

    gradient_accumulation_steps: Optional[int] = field(
        default=4, metadata={"help": ("The number of gradient accumulation steps.")}
    )

    resume_from_checkpoint: Optional[bool] = field(
        default=False, metadata={"help": ("Whether to resume from a checkpoint.")}
    )

    warmup_ratio: Optional[float] = field(
        default=0.1, metadata={"help": ("The warmup ratio.")}
    )
    lr_scheduler_type: Optional[str] = field(
        default="linear", metadata={"help": ("The learning rate scheduler type.")}
    )
    packing: Optional[bool] = field(default=False)

In [None]:
import gzip
import json
import os
import sys
from typing import Dict, Tuple

import logging

import pandas as pd
import torch
import transformers
import datasets
from datasets import Dataset
from trl import SFTTrainer
from accelerate import Accelerator
from transformers import (
    TrainingArguments,
    set_seed,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoModelForCausalLM,
    HfArgumentParser,
)

from transformers.trainer_utils import get_last_checkpoint
from peft import get_peft_model, LoraConfig, TaskType


logger = logging.getLogger(__name__)


def get_checkpoint(output_dir: str):
    last_checkpoint = None
    if os.path.isdir(output_dir):
        last_checkpoint = get_last_checkpoint(output_dir)
    return last_checkpoint


def get_quantization_config(model_args: ModelArguments) -> BitsAndBytesConfig | None:
    if model_args.load_in_4bit:
        compute_dtype = torch.float16
        if model_args.torch_dtype not in {"auto", None}:
            compute_dtype = getattr(torch, model_args.torch_dtype)

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_quant_type=model_args.bnb_4bit_quant_type,
            bnb_4bit_use_double_quant=model_args.use_bnb_nested_quant,
            bnb_4bit_quant_storage=model_args.bnb_4bit_quant_storage,
        )
    elif model_args.load_in_8bit:
        quantization_config = BitsAndBytesConfig(
            load_in_8bit=True,
        )
    else:
        quantization_config = None

    return quantization_config


def get_current_device() -> int:
    """Get the current device. For GPU we return the local process index to enable multiple GPU training."""
    return Accelerator().local_process_index if torch.cuda.is_available() else "cpu"


def get_kbit_device_map() -> Dict[str, int] | None:
    """Useful for running inference with quantized models by setting `device_map=get_peft_device_map()`"""
    return {"": get_current_device()} if torch.cuda.is_available() else None


def parse_args() -> Tuple[ModelArguments, DataArguments, SFTConfig]:
    parser = HfArgumentParser((ModelArguments, DataArguments, SFTConfig))
    return parser.parse_args_into_dataclasses()


default_prompt = """{}: {}\n {}"""
# instruction = "Instruct: Generate adequate search engine query to obtain requested information."
instruction = "Rewrite this search query"

def formatting_prompts_func(example, eos_token):

    input = example.get("query", '')
    output = example.get('alternative', '')
    text = default_prompt.format(instruction, input, output) + eos_token

    return {"text": text}

def create_ds_from_parquet(data_path, tokenizer, max_seq_length):
    dataset = (
        Dataset.from_parquet(data_path)
        .map(
            lambda d: formatting_prompts_func(
                example=d, eos_token=tokenizer.eos_token
            ),
            batched=False,
        )
        .shuffle(seed=411)
    )
    return dataset


def main(model_args: ModelArguments, data_args: DataArguments, training_args: SFTConfig, seed: int = 3407):
    # Set seed for reproducibility
    set_seed(seed)

    ###############
    # Setup logging
    ###############
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = logging.INFO
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()


    # Check for last checkpoint
    last_checkpoint = get_checkpoint(training_args.model_checkpoint_dir)
    if last_checkpoint is not None and training_args.resume_from_checkpoint:
        logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.")

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name,
    )
    tokenizer.pad_token = (
        tokenizer.unk_token
    )  # use unk rather than eos token to prevent endless generation
    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

    logger.info("*** Load pretrained model ***")
    torch_dtype = (
        torch.bfloat16 if not torch.cuda.is_bf16_supported() else torch.float16
    )
    quantization_config = get_quantization_config(model_args)

    model_kwargs = dict(
        trust_remote_code=True,
        # attn_implementation="flash_attention_2",
        # torch_dtype=torch_dtype,
        device_map=get_kbit_device_map() if quantization_config is not None else None,
        quantization_config=quantization_config,
        low_cpu_mem_usage=True,
    )

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=model_args.lora_r,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules=model_args.lora_target_modules,
        lora_alpha=model_args.lora_alpha,
        lora_dropout=model_args.lora_dropout,  # Supports any, but = 0 is optimized
        bias="none",  # Supports any, but = "none" is optimized)
        use_rslora=False,  # We support rank stabilized LoRA
        # loftq_config=None,  # And LoftQ
    )
    model = AutoModelForCausalLM.from_pretrained(model_args.model_name, **model_kwargs)
    # model = get_peft_model(model, peft_config)
    # model.print_trainable_parameters()

    # Train the model
    # @title Show current memory stats
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")

    train_ds = create_ds_from_parquet(
        tokenizer=tokenizer,
        data_path=data_args.train_dataset_path,
        max_seq_length=training_args.max_seq_length,
    )
    test_ds = create_ds_from_parquet(
        tokenizer=tokenizer,
        data_path=data_args.train_dataset_path,
        max_seq_length=training_args.max_seq_length,
    )
    print(train_ds[0])

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        dataset_text_field="text",
        max_seq_length=training_args.max_seq_length,
        dataset_num_proc=1,
        packing=training_args.packing,  # Can make training 5x faster for short sequences.
        args=TrainingArguments(
            per_device_train_batch_size=training_args.train_batch_size,
            gradient_accumulation_steps=training_args.gradient_accumulation_steps,
            warmup_steps=5,
            logging_dir=training_args.output_data_dir,
            num_train_epochs=training_args.epochs,
            learning_rate=training_args.lr,
            # 'fp16' is set to True if bfloat16 is not supported, which means the model will use 16-bit floating point precision for training if possible.
            fp16=not torch.cuda.is_bf16_supported(),
            # 'bf16' is set to True if bfloat16 is supported, which means the model will use bfloat16 precision for training if possible.
            bf16=torch.cuda.is_bf16_supported(),
            logging_steps=training_args.logging_steps,
            optim=training_args.optim,
            weight_decay=training_args.weight_decay,
            lr_scheduler_type="linear",
            seed=seed,
            output_dir=training_args.model_dir,
            save_strategy="steps",
            save_steps=training_args.checkpoint_save_steps,
            restore_callback_states_from_checkpoint=False,
            # gradient_checkpointing=True,
        ),
    )
    trainer_stats = trainer.train(resume_from_checkpoint=False)

    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
    print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
    print(
        f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
    )
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

    trainer.save_model(training_args.output_data_dir)  # Local saving
    tokenizer.save_pretrained(training_args.output_data_dir)

    return trainer.model, tokenizer


In [41]:
model_name = "openai-community/gpt2"
# model_name = "jtatman/gpt2-open-instruct-v1-Anthropic-hh-rlhf"
# model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
# last_name = model_name.split("/")[-1]
model_args = ModelArguments(
    model_name=model_name,
    lora_alpha=64,
    lora_r=32,
)
data_args = DataArguments(
    train_dataset_path="./sample_query_rewrite_nodup_full.parquet"
)
train_args = SFTConfig(
    model_dir=f"models/{last_name}",
    model_checkpoint_dir=f"models/{last_name}-checkpoint",
    output_data_dir=f"models/{last_name}-data",
    train_batch_size=64,
    packing=True,
    epochs=5,
    lr=1e-4,
    max_seq_length=32,
    resume_from_checkpoint=False,
    logging_steps=10,
    gradient_accumulation_steps=1,d
)

In [42]:
model, tokenizer = main(
    model_args=model_args,
    training_args=train_args,
    data_args=data_args
)

[INFO|configuration_utils.py:679] 2024-11-08 09:18:16,784 >> loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--openai-community--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
[INFO|configuration_utils.py:746] 2024-11-08 09:18:16,785 >> Model config GPT2Config {
  "_name_or_path": "openai-community/gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_inde

2024-11-08 09:18:16 - INFO - __main__ - *** Load pretrained model ***


[INFO|configuration_utils.py:679] 2024-11-08 09:18:17,003 >> loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--openai-community--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
[INFO|configuration_utils.py:746] 2024-11-08 09:18:17,005 >> Model config GPT2Config {
  "_name_or_path": "openai-community/gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_inde

trainable params: 589,824 || all params: 125,029,632 || trainable%: 0.4717
GPU = Tesla T4. Max memory = 14.741 GB.
3.295 GB of memory reserved.
2024-11-08 09:18:17 - INFO - datasets.builder - Using custom data configuration default-b0d48c55225bc05a


Loading Dataset Infos from /home/ubuntu/.local/lib/python3.10/site-packages/datasets/packaged_modules/parquet


2024-11-08 09:18:17 - INFO - datasets.info - Loading Dataset Infos from /home/ubuntu/.local/lib/python3.10/site-packages/datasets/packaged_modules/parquet


Overwrite dataset info from restored data version if exists.


2024-11-08 09:18:17 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from /home/ubuntu/.cache/huggingface/datasets/parquet/default-b0d48c55225bc05a/0.0.0/9d41700293b5cf3c3cee6167e8c49e37598331b6466506aecb40a8c11b6aa9f6


2024-11-08 09:18:17 - INFO - datasets.info - Loading Dataset info from /home/ubuntu/.cache/huggingface/datasets/parquet/default-b0d48c55225bc05a/0.0.0/9d41700293b5cf3c3cee6167e8c49e37598331b6466506aecb40a8c11b6aa9f6


Found cached dataset parquet (/home/ubuntu/.cache/huggingface/datasets/parquet/default-b0d48c55225bc05a/0.0.0/9d41700293b5cf3c3cee6167e8c49e37598331b6466506aecb40a8c11b6aa9f6)


2024-11-08 09:18:17 - INFO - datasets.builder - Found cached dataset parquet (/home/ubuntu/.cache/huggingface/datasets/parquet/default-b0d48c55225bc05a/0.0.0/9d41700293b5cf3c3cee6167e8c49e37598331b6466506aecb40a8c11b6aa9f6)


Loading Dataset info from /home/ubuntu/.cache/huggingface/datasets/parquet/default-b0d48c55225bc05a/0.0.0/9d41700293b5cf3c3cee6167e8c49e37598331b6466506aecb40a8c11b6aa9f6


2024-11-08 09:18:17 - INFO - datasets.info - Loading Dataset info from /home/ubuntu/.cache/huggingface/datasets/parquet/default-b0d48c55225bc05a/0.0.0/9d41700293b5cf3c3cee6167e8c49e37598331b6466506aecb40a8c11b6aa9f6


Map:   0%|          | 0/1929 [00:00<?, ? examples/s]

Caching processed dataset at /home/ubuntu/.cache/huggingface/datasets/parquet/default-b0d48c55225bc05a/0.0.0/9d41700293b5cf3c3cee6167e8c49e37598331b6466506aecb40a8c11b6aa9f6/cache-f7d9210cb25bca3e.arrow


2024-11-08 09:18:17 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/ubuntu/.cache/huggingface/datasets/parquet/default-b0d48c55225bc05a/0.0.0/9d41700293b5cf3c3cee6167e8c49e37598331b6466506aecb40a8c11b6aa9f6/cache-f7d9210cb25bca3e.arrow


Caching indices mapping at /home/ubuntu/.cache/huggingface/datasets/parquet/default-b0d48c55225bc05a/0.0.0/9d41700293b5cf3c3cee6167e8c49e37598331b6466506aecb40a8c11b6aa9f6/cache-360b941713eadb4e.arrow


2024-11-08 09:18:17 - INFO - datasets.arrow_dataset - Caching indices mapping at /home/ubuntu/.cache/huggingface/datasets/parquet/default-b0d48c55225bc05a/0.0.0/9d41700293b5cf3c3cee6167e8c49e37598331b6466506aecb40a8c11b6aa9f6/cache-360b941713eadb4e.arrow


[INFO|training_args.py:2147] 2024-11-08 09:18:17,767 >> PyTorch: setting up devices
[INFO|training_args.py:1822] 2024-11-08 09:18:17,787 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
[INFO|training_args.py:2147] 2024-11-08 09:18:17,791 >> PyTorch: setting up devices


{'query': 'what is a normal cholesterol score', 'alternative': 'healthy cholesterol levels', 'semantic_score': 0.7151662707328796, '__index_level_0__': 919, 'text': 'Rewrite this search query: what is a normal cholesterol score\n healthy cholesterol levels<|endoftext|>'}


Using custom data configuration default-791ab15b4db867b0


2024-11-08 09:18:17 - INFO - datasets.builder - Using custom data configuration default-791ab15b4db867b0


Loading Dataset Infos from /home/ubuntu/.local/lib/python3.10/site-packages/datasets/packaged_modules/generator


2024-11-08 09:18:17 - INFO - datasets.info - Loading Dataset Infos from /home/ubuntu/.local/lib/python3.10/site-packages/datasets/packaged_modules/generator


Generating dataset generator (/home/ubuntu/.cache/huggingface/datasets/generator/default-791ab15b4db867b0/0.0.0)


2024-11-08 09:18:17 - INFO - datasets.builder - Generating dataset generator (/home/ubuntu/.cache/huggingface/datasets/generator/default-791ab15b4db867b0/0.0.0)


Downloading and preparing dataset generator/default to /home/ubuntu/.cache/huggingface/datasets/generator/default-791ab15b4db867b0/0.0.0...


2024-11-08 09:18:17 - INFO - datasets.builder - Downloading and preparing dataset generator/default to /home/ubuntu/.cache/huggingface/datasets/generator/default-791ab15b4db867b0/0.0.0...


Generating train split


2024-11-08 09:18:17 - INFO - datasets.builder - Generating train split


Generating train split: 0 examples [00:00, ? examples/s]

Unable to verify splits sizes.


2024-11-08 09:18:18 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.


Dataset generator downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/generator/default-791ab15b4db867b0/0.0.0. Subsequent calls will reuse this data.


2024-11-08 09:18:18 - INFO - datasets.builder - Dataset generator downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/generator/default-791ab15b4db867b0/0.0.0. Subsequent calls will reuse this data.


  super().__init__(
[INFO|trainer.py:698] 2024-11-08 09:18:18,238 >> Using auto half precision backend
[INFO|trainer.py:2313] 2024-11-08 09:18:18,546 >> ***** Running training *****
[INFO|trainer.py:2314] 2024-11-08 09:18:18,547 >>   Num examples = 1,301
[INFO|trainer.py:2315] 2024-11-08 09:18:18,548 >>   Num Epochs = 5
[INFO|trainer.py:2316] 2024-11-08 09:18:18,549 >>   Instantaneous batch size per device = 64
[INFO|trainer.py:2319] 2024-11-08 09:18:18,549 >>   Total train batch size (w. parallel, distributed & accumulation) = 64
[INFO|trainer.py:2320] 2024-11-08 09:18:18,550 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2321] 2024-11-08 09:18:18,551 >>   Total optimization steps = 105
[INFO|trainer.py:2322] 2024-11-08 09:18:18,553 >>   Number of trainable parameters = 589,824


Step,Training Loss
10,6.3813
20,6.1214
30,5.8204
40,5.5963
50,5.3212
60,5.0969
70,4.8875
80,4.7638
90,4.6459
100,4.5591


[INFO|trainer.py:3801] 2024-11-08 09:18:42,169 >> Saving model checkpoint to models/gpt2/checkpoint-50
[INFO|configuration_utils.py:679] 2024-11-08 09:18:42,345 >> loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--openai-community--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
[INFO|configuration_utils.py:746] 2024-11-08 09:18:42,347 >> Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  

50.3181 seconds used for training.
0.84 minutes used for training.
Peak reserved memory = 3.67 GB.
Peak reserved memory for training = 0.375 GB.
Peak reserved memory % of max memory = 24.897 %.
Peak reserved memory for training % of max memory = 2.544 %.


[INFO|tokenization_utils_base.py:2646] 2024-11-08 09:19:09,132 >> tokenizer config file saved in models/gpt2-data/tokenizer_config.json
[INFO|tokenization_utils_base.py:2655] 2024-11-08 09:19:09,133 >> Special tokens file saved in models/gpt2-data/special_tokens_map.json


In [65]:
inputs = tokenizer(
    [default_prompt.format(instruction, "Who lived longer, Nikola Tesla or Milutin Milankovic?", "")],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(
    **inputs, max_new_tokens=12, use_cache=False, temperature=0.7, do_sample=True, top_p=0.95
)
print("Answer: ", tokenizer.batch_decode(outputs))



Answer:  ['Rewrite this search query: Who lived longer, Nikola Tesla or Milutin Milankovic?\n \xa0 \xa0Who lived longer Nikola Tesla or Milutin']
